From richiehindle at users.sourceforge.net Sun Jan 5 14:17:16 2003 From: richiehindle at users.sourceforge.net (Richie Hindle) Date: Sun Jan 5 17:17:21 2003 Subject: [Spambayes-checkins] spambayes storage.py,1.6,1.7 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv25359 Modified Files: storage.py Log Message: Use binary pickles by default, to save space (and probably time). Index: storage.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/storage.py,v retrieving revision 1.6 retrieving revision 1.7 diff -C2 -d -r1.6 -r1.7 *** storage.py 3 Dec 2002 20:11:18 -0000 1.6 --- storage.py 5 Jan 2003 22:17:13 -0000 1.7 *************** *** 54,57 **** --- 54,63 ---- import dbmstorage + # Make shelve use binary pickles by default. + oldShelvePickler = shelve.Pickler + def binaryDefaultPickler(f, binary=1): + return oldShelvePickler(f, binary) + shelve.Pickler = binaryDefaultPickler + PICKLE_TYPE = 1 NO_UPDATEPROBS = False # Probabilities will not be autoupdated with training From richiehindle at users.sourceforge.net Sun Jan 5 14:24:16 2003 From: richiehindle at users.sourceforge.net (Richie Hindle) Date: Sun Jan 5 17:24:19 2003 Subject: [Spambayes-checkins] spambayes mboxtest.py,1.10,1.11 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv27492 Modified Files: mboxtest.py Log Message: 'Msg' lives in msgs.py, not timtest.py. Thanks to whoever pointed this out (I can't find their message in the archive right now). Index: mboxtest.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/mboxtest.py,v retrieving revision 1.10 retrieving revision 1.11 diff -C2 -d -r1.10 -r1.11 *** mboxtest.py 7 Nov 2002 22:30:07 -0000 1.10 --- mboxtest.py 5 Jan 2003 22:24:14 -0000 1.11 *************** *** 30,34 **** from tokenizer import tokenize from TestDriver import Driver ! 
from timtest import Msg from Options import options --- 30,34 ---- from tokenizer import tokenize from TestDriver import Driver ! from msgs import Msg from Options import options From richiehindle at users.sourceforge.net Sun Jan 5 14:26:23 2003 From: richiehindle at users.sourceforge.net (Richie Hindle) Date: Sun Jan 5 17:26:26 2003 Subject: [Spambayes-checkins] spambayes Corpus.py,1.7,1.8 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv28160 Modified Files: Corpus.py Log Message: Hack around a nasty problem in Corpus whereby the web training interface would crash when there were more than 250 messages in the list. A real fix is pending, just as soon as work out what it should be... Index: Corpus.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Corpus.py,v retrieving revision 1.7 retrieving revision 1.8 diff -C2 -d -r1.7 -r1.8 *** Corpus.py 6 Dec 2002 12:56:51 -0000 1.7 --- Corpus.py 5 Jan 2003 22:26:20 -0000 1.8 *************** *** 184,188 **** --- 184,193 ---- '''Move a Message from another corpus to this corpus''' + # XXX Hack: Calling msg.getSubstance() here ensures that the + # message substance is in memory. If it isn't, when addMessage() + # calls message.store(), which calls message.getSubstance(), that + # will try to load the substance from the as-yet-unwritten new file. msg = fromcorpus[key] + msg.getSubstance() fromcorpus.removeMessage(msg) self.addMessage(msg) From richiehindle at users.sourceforge.net Sun Jan 5 14:32:48 2003 From: richiehindle at users.sourceforge.net (Richie Hindle) Date: Sun Jan 5 17:32:54 2003 Subject: [Spambayes-checkins] spambayes storage.py,1.7,1.8 hammiebulk.py,1.5,1.6 Corpus.py,1.8,1.9 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv29798 Modified Files: storage.py hammiebulk.py Corpus.py Log Message: Added True, False and bool for 2.2-compatibility (thanks to François Granger). 
Index: storage.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/storage.py,v retrieving revision 1.7 retrieving revision 1.8 diff -C2 -d -r1.7 -r1.8 *** storage.py 5 Jan 2003 22:17:13 -0000 1.7 --- storage.py 5 Jan 2003 22:32:45 -0000 1.8 *************** *** 47,50 **** --- 47,58 ---- __credits__ = "All the spambayes contributors." + try: + True, False + except NameError: + # Maintain compatibility with Python 2.2 + True, False = 1, 0 + def bool(val): + return not not val + import classifier from Options import options *************** *** 70,74 **** classifier.Classifier.__init__(self) self.db_name = db_name ! self.load() def load(self): --- 78,82 ---- classifier.Classifier.__init__(self) self.db_name = db_name ! self.load() def load(self): Index: hammiebulk.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/hammiebulk.py,v retrieving revision 1.5 retrieving revision 1.6 diff -C2 -d -r1.5 -r1.6 *** hammiebulk.py 2 Dec 2002 04:43:37 -0000 1.5 --- hammiebulk.py 5 Jan 2003 22:32:46 -0000 1.6 *************** *** 38,41 **** --- 38,49 ---- """ + try: + True, False + except NameError: + # Maintain compatibility with Python 2.2 + True, False = 1, 0 + def bool(val): + return not not val + import sys import os Index: Corpus.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Corpus.py,v retrieving revision 1.8 retrieving revision 1.9 diff -C2 -d -r1.8 -r1.9 *** Corpus.py 5 Jan 2003 22:26:20 -0000 1.8 --- Corpus.py 5 Jan 2003 22:32:46 -0000 1.9 *************** *** 83,86 **** --- 83,94 ---- from __future__ import generators + try: + True, False + except NameError: + # Maintain compatibility with Python 2.2 + True, False = 1, 0 + def bool(val): + return not not val + import sys # for output of docstring import time From anthonybaxter at users.sourceforge.net Wed Jan 8 18:13:23 2003 From: anthonybaxter at 
users.sourceforge.net (Anthony Baxter) Date: Wed Jan 8 21:13:27 2003 Subject: [Spambayes-checkins] website related.ht,1.5,1.6 Message-ID: Update of /cvsroot/spambayes/website In directory sc8-pr-cvs1:/tmp/cvs-serv9832 Modified Files: related.ht Log Message: added spambully, updated mozilla Index: related.ht =================================================================== RCS file: /cvsroot/spambayes/website/related.ht,v retrieving revision 1.5 retrieving revision 1.6 diff -C2 -d -r1.5 -r1.6 *** related.ht 15 Nov 2002 00:58:12 -0000 1.5 --- related.ht 9 Jan 2003 02:13:21 -0000 1.6 *************** *** 8,12 ****
  • Gary Arnold's bayespam, a perl qmail filter. !
  • The mozilla project is working on this, see bug 163188, or this section on the mozilla website. It looks like they're only using the Graham-style filtering, which is a pity.
  • Eric Raymond's bogofilter, a C code bayesian filter.
  • ifile, a Naive Bayes classification system. --- 8,12 ----
    • Gary Arnold's bayespam, a perl qmail filter. !
    • As of version 1.3, Mozilla Mail now supports Graham-style Bayesian filtering; see the documentation on the Mozilla website.
    • Eric Raymond's bogofilter, a C code bayesian filter.
    • ifile, a Naive Bayes classification system. *************** *** 14,18 **** --- 14,20 ----
    • spamoracle, a Paul Graham based spam filter written in OCaml, designed for use with procmail.
    • popfile, a pop3 proxy written in Perl with a Naive Bayes classifier. +
    • Spam Bully is a commercial spam filter that claims to use Bayesian techniques.
    +

    (got more? email anthony at interlink.com.au and I'll add links, or correct descriptions.) From npickett at users.sourceforge.net Thu Jan 9 08:39:32 2003 From: npickett at users.sourceforge.net (Neale Pickett) Date: Thu Jan 9 17:05:04 2003 Subject: [Spambayes-checkins] spambayes HAMMIE.txt,1.4,1.5 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv17894 Modified Files: HAMMIE.txt Log Message: * Corrected paths Index: HAMMIE.txt =================================================================== RCS file: /cvsroot/spambayes/spambayes/HAMMIE.txt,v retrieving revision 1.4 retrieving revision 1.5 diff -C2 -d -r1.4 -r1.5 *** HAMMIE.txt 6 Dec 2002 16:14:28 -0000 1.4 --- HAMMIE.txt 9 Jan 2003 16:39:29 -0000 1.5 *************** *** 37,45 **** 2. Create a new database: ! $HOME/src/hammiefilter.py -n 3. (optional) Train it on your existing mail: ! $HOME/src/mboxtrain.py -d $HOME/.hammiedb -g $HOME/Mail/inbox -s $HOME/Mail/spam You can add additional folder names if you like, using -g for "good" --- 37,45 ---- 2. Create a new database: ! $HOME/src/spambayes/hammiefilter.py -n 3. (optional) Train it on your existing mail: ! $HOME/src/spambayes/mboxtrain.py -d $HOME/.hammiedb -g $HOME/Mail/inbox -s $HOME/Mail/spam You can add additional folder names if you like, using -g for "good" *************** *** 49,53 **** :0fw ! | $HOME/src/hammiefilter.py :0 --- 49,53 ---- :0fw ! | $HOME/src/spambayes/hammiefilter.py :0 *************** *** 73,77 **** Linux systems): ! 21 2 * * * $HOME/src/mboxtrain.py -d $HOME/.hammiedb -g $HOME/Mail/inbox -s $HOME/Mail/spam As in step 3, you can add additional folder names here too. It's --- 73,77 ---- Linux systems): ! 21 2 * * * $HOME/src/spambayes/mboxtrain.py -d $HOME/.hammiedb -g $HOME/Mail/inbox -s $HOME/Mail/spam As in step 3, you can add additional folder names here too. 
It's From mhammond at users.sourceforge.net Thu Jan 9 16:33:12 2003 From: mhammond at users.sourceforge.net (Mark Hammond) Date: Thu Jan 9 19:33:17 2003 Subject: [Spambayes-checkins] spambayes/Outlook2000 manager.py,1.39,1.40 Message-ID: Update of /cvsroot/spambayes/spambayes/Outlook2000 In directory sc8-pr-cvs1:/tmp/cvs-serv5171 Modified Files: manager.py Log Message: Give the manager an "application_directory" attribute to save everyone duplicating this logic to locate the images, html, etc. Index: manager.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/manager.py,v retrieving revision 1.39 retrieving revision 1.40 diff -C2 -d -r1.39 -r1.40 *** manager.py 15 Dec 2002 14:03:58 -0000 1.39 --- manager.py 10 Jan 2003 00:33:10 -0000 1.40 *************** *** 18,24 **** True, False = 1, 0 try: if hasattr(sys, "frozen"): ! this_filename = os.path.abspath(sys.argv[0]) else: this_filename = os.path.abspath(__file__) --- 18,32 ---- True, False = 1, 0 + # Work out our "application directory", which is + # the directory of our main .py/.dll/.exe file we + # are running from. try: if hasattr(sys, "frozen"): ! if sys.frozen == "dll": ! import win32api ! this_filename = win32api.GetModuleFileName(sys.frozendllhandle) ! else: ! # Don't think we will ever run as a .EXE, but... ! this_filename = os.path.abspath(sys.argv[0]) else: this_filename = os.path.abspath(__file__) *************** *** 54,59 **** self.addin = None self.verbose = verbose if not os.path.isabs(config_base): ! config_base = os.path.join(os.path.dirname(this_filename), config_base) config_base = os.path.abspath(config_base) --- 62,68 ---- self.addin = None self.verbose = verbose + self.application_directory = os.path.dirname(this_filename) if not os.path.isabs(config_base): ! 
config_base = os.path.join(self.application_directory, config_base) config_base = os.path.abspath(config_base) From mhammond at users.sourceforge.net Thu Jan 9 16:35:00 2003 From: mhammond at users.sourceforge.net (Mark Hammond) Date: Thu Jan 9 19:35:03 2003 Subject: [Spambayes-checkins] spambayes/Outlook2000 addin.py,1.43,1.44 Message-ID: Update of /cvsroot/spambayes/spambayes/Outlook2000 In directory sc8-pr-cvs1:/tmp/cvs-serv5656 Modified Files: addin.py Log Message: Fix a crash setting up the toolbar buttons in a second window. Also some changes to use the "application_directory" to locate the images - these are for a standalone DLL version of the addin, which is not yet ready but on its way. Index: addin.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/addin.py,v retrieving revision 1.43 retrieving revision 1.44 diff -C2 -d -r1.43 -r1.44 *** addin.py 15 Dec 2002 14:05:35 -0000 1.43 --- addin.py 10 Jan 2003 00:34:57 -0000 1.44 *************** *** 294,298 **** "Move the selected message to the Spam folder,\n" \ "and train the system that this is Spam." ! SetButtonImage(self, image) def OnClick(self, button, cancel): --- 294,298 ---- "Move the selected message to the Spam folder,\n" \ "and train the system that this is Spam." ! SetButtonImage(self, image, manager) def OnClick(self, button, cancel): *************** *** 332,336 **** "folder is not known), and trains the system that\n" \ "this is a good message\n" ! SetButtonImage(self, image) def OnClick(self, button, cancel): --- 332,336 ---- "folder is not known), and trains the system that\n" \ "this is a good message\n" ! SetButtonImage(self, image, manager) def OnClick(self, button, cancel): *************** *** 361,365 **** # Helpers to work with images on buttons/toolbars. ! def SetButtonImage(button, fname): # whew - http://support.microsoft.com/default.aspx?scid=KB;EN-US;q288771 # shows how to make a transparent bmp. 
--- 361,365 ---- # Helpers to work with images on buttons/toolbars. ! def SetButtonImage(button, fname, manager): # whew - http://support.microsoft.com/default.aspx?scid=KB;EN-US;q288771 # shows how to make a transparent bmp. *************** *** 367,377 **** # this, we can not simply perform this load once and reuse the image. if not os.path.isabs(fname): ! if hasattr(sys, "frozen"): ! # images relative to the executable. ! fname = os.path.join(os.path.dirname(sys.argv[0]), "images", fname) - else: - # Ensure references are relative to this .py file - fname = os.path.join( os.path.dirname(__file__), "images", fname) if not os.path.isfile(fname): print "WARNING - Trying to use image '%s', but it doesn't exist" % (fname,) --- 367,373 ---- # this, we can not simply perform this load once and reuse the image. if not os.path.isabs(fname): ! # images relative to the application path ! fname = os.path.join(manager.application_directory, "images", fname) if not os.path.isfile(fname): print "WARNING - Trying to use image '%s', but it doesn't exist" % (fname,) *************** *** 506,509 **** --- 502,510 ---- # The Outlook event handlers def OnActivate(self): + # See comments for OnNewExplorer below. + # *sigh* - OnActivate seems too early too :( + pass + + def OnSelectionChange(self): # See comments for OnNewExplorer below. if not self.have_setup_ui: From anthonybaxter at users.sourceforge.net Fri Jan 10 01:06:09 2003 From: anthonybaxter at users.sourceforge.net (Anthony Baxter) Date: Fri Jan 10 04:07:06 2003 Subject: [Spambayes-checkins] spambayes INTEGRATION.txt,1.3,1.4 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv7577 Modified Files: INTEGRATION.txt Log Message: 2.3a1 is out. 
Index: INTEGRATION.txt =================================================================== RCS file: /cvsroot/spambayes/spambayes/INTEGRATION.txt,v retrieving revision 1.3 retrieving revision 1.4 diff -C2 -d -r1.3 -r1.4 *** INTEGRATION.txt 26 Nov 2002 16:20:38 -0000 1.3 --- INTEGRATION.txt 10 Jan 2003 09:06:06 -0000 1.4 *************** *** 50,55 **** You also need version 2.4.3 or above of the Python "email" package. If ! you're running Python 2.3 (which at the time of writing is only available ! from SourceForge CVS) then you already have this. If not, you can download it from http://mimelib.sf.net and install it - unpack the archive, cd to the email-2.4.3 directory and type "python setup.py install" (YMMV on --- 50,56 ---- You also need version 2.4.3 or above of the Python "email" package. If ! you're running Python 2.3 (which at the time of writing is available ! from SourceForge CVS, or as the alpha version 2.3a1, available from ! python.org) then you already have this. If not, you can download it from http://mimelib.sf.net and install it - unpack the archive, cd to the email-2.4.3 directory and type "python setup.py install" (YMMV on From anthonybaxter at users.sourceforge.net Fri Jan 10 01:12:29 2003 From: anthonybaxter at users.sourceforge.net (Anthony Baxter) Date: Fri Jan 10 04:12:32 2003 Subject: [Spambayes-checkins] spambayes/spambayes - New directory Message-ID: Update of /cvsroot/spambayes/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv9472/spambayes Log Message: Directory /cvsroot/spambayes/spambayes/spambayes added to the repository --> Using per-directory sticky tag `reorg-branch' From anthonybaxter at users.sourceforge.net Fri Jan 10 01:18:28 2003 From: anthonybaxter at users.sourceforge.net (Anthony Baxter) Date: Fri Jan 10 04:18:32 2003 Subject: [Spambayes-checkins] spambayes/utilities - New directory Message-ID: Update of /cvsroot/spambayes/spambayes/utilities In directory sc8-pr-cvs1:/tmp/cvs-serv11414/utilities Log Message: 
Directory /cvsroot/spambayes/spambayes/utilities added to the repository --> Using per-directory sticky tag `reorg-branch' From anthonybaxter at users.sourceforge.net Fri Jan 10 01:21:05 2003 From: anthonybaxter at users.sourceforge.net (Anthony Baxter) Date: Fri Jan 10 04:21:09 2003 Subject: [Spambayes-checkins] spambayes/testtools - New directory Message-ID: Update of /cvsroot/spambayes/spambayes/testtools In directory sc8-pr-cvs1:/tmp/cvs-serv12333/testtools Log Message: Directory /cvsroot/spambayes/spambayes/testtools added to the repository --> Using per-directory sticky tag `reorg-branch' From anthonybaxter at users.sourceforge.net Fri Jan 10 01:24:21 2003 From: anthonybaxter at users.sourceforge.net (Anthony Baxter) Date: Fri Jan 10 04:24:24 2003 Subject: [Spambayes-checkins] spambayes/22compat - New directory Message-ID: Update of /cvsroot/spambayes/spambayes/22compat In directory sc8-pr-cvs1:/tmp/cvs-serv13557/22compat Log Message: Directory /cvsroot/spambayes/spambayes/22compat added to the repository --> Using per-directory sticky tag `reorg-branch' From anthonybaxter at users.sourceforge.net Fri Jan 10 01:30:44 2003 From: anthonybaxter at users.sourceforge.net (Anthony Baxter) Date: Fri Jan 10 04:30:47 2003 Subject: [Spambayes-checkins] spambayes unheader.py,1.8,1.8.4.1 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv15908 Modified Files: Tag: reorg-branch unheader.py Log Message: docstring. Index: unheader.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/unheader.py,v retrieving revision 1.8 retrieving revision 1.8.4.1 diff -C2 -d -r1.8 -r1.8.4.1 *** unheader.py 27 Oct 2002 22:10:28 -0000 1.8 --- unheader.py 10 Jan 2003 09:30:42 -0000 1.8.4.1 *************** *** 1,3 **** --- 1,11 ---- #!/usr/bin/env python + """ + unheader.py: cleans headers from email messages. 
By default, this + removes SpamAssassin headers, specify a pattern with -p to supply + new headers to remove. + + This is often needed because existing spamassassin headers can + provide killer spam clues, for all the wrong reasons. + """ import re From anthonybaxter at users.sourceforge.net Fri Jan 10 02:38:20 2003 From: anthonybaxter at users.sourceforge.net (Anthony Baxter) Date: Fri Jan 10 05:38:22 2003 Subject: [Spambayes-checkins] spambayes hammie.py,1.45,NONE Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv8840 Removed Files: Tag: reorg-branch hammie.py Log Message: moving hammie module into package. --- hammie.py DELETED --- From anthonybaxter at users.sourceforge.net Fri Jan 10 02:38:50 2003 From: anthonybaxter at users.sourceforge.net (Anthony Baxter) Date: Fri Jan 10 05:38:53 2003 Subject: [Spambayes-checkins] spambayes hammie.py,1.45.2.1,1.45.2.2 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv9003 Added Files: Tag: reorg-branch hammie.py Log Message: replacing hammie module with a tiny wrapper. 
From anthonybaxter at users.sourceforge.net Fri Jan 10 02:41:09 2003 From: anthonybaxter at users.sourceforge.net (Anthony Baxter) Date: Fri Jan 10 05:41:13 2003 Subject: [Spambayes-checkins] spambayes OptionConfig.py,1.1,1.1.2.1 hammiefilter.py,1.5,1.5.2.1 hammiesrv.py,1.10,1.10.4.1 mailsort.py,1.1,1.1.2.1 mboxtrain.py,1.2,1.2.2.1 pop3graph.py,1.1,1.1.2.1 pop3proxy.py,1.32,1.32.2.1 setup.py,1.10,1.10.2.1 Corpus.py,1.9,NONE CostCounter.py,1.5,NONE FileCorpus.py,1.9,NONE HistToGNU.py,1.7,NONE Histogram.py,1.7,NONE Options.py,1.80,NONE TestDriver.py,1.31,NONE Tester.py,1.9,NONE cdb.py,1.4,NONE chi2.py,1.8,NONE classifier.py,1.62,NONE cmp.py,1.17,NONE dbmstorage.py,1.1,NONE fpfn.py,1.1,NONE hammiebulk.py,1.6,NONE heapq.py,1.1,NONE loosecksum.py,1.4,NONE mboxcount.py,1.3,NONE mboxtest.py,1.11,NONE mboxutils.py,1.7,NONE msgs.py,1.6,NONE optimize.py,1.2,NONE rates.py,1.8,NONE rebal.py,1.9,NONE sets.py,1.2,NONE simplexloop.py,1.2,NONE split.py,1.2,NONE splitn.py,1.4,NONE splitndirs.py,1.7,NONE storage.py,1.8,NONE table.py,1.5,NONE timcv.py,1.12,NONE timtest.py,1.30,NONE tokenizer.py,1.72,NONE weaktest.py,1.6,NONE Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv9389 Modified Files: Tag: reorg-branch OptionConfig.py hammiefilter.py hammiesrv.py mailsort.py mboxtrain.py pop3graph.py pop3proxy.py setup.py Removed Files: Tag: reorg-branch Corpus.py CostCounter.py FileCorpus.py HistToGNU.py Histogram.py Options.py TestDriver.py Tester.py cdb.py chi2.py classifier.py cmp.py dbmstorage.py fpfn.py hammiebulk.py heapq.py loosecksum.py mboxcount.py mboxtest.py mboxutils.py msgs.py optimize.py rates.py rebal.py sets.py simplexloop.py split.py splitn.py splitndirs.py storage.py table.py timcv.py timtest.py tokenizer.py weaktest.py Log Message: Checkpointing before I head home. Still to do: - distutils magic to make sure that the 22compat modules are installed when needed. - Walking through testtools and utilities and fixing imports. 
- Documentation. hammie works, everything else that people use in day-to-day operation should work - please give it a go. Index: OptionConfig.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/OptionConfig.py,v retrieving revision 1.1 retrieving revision 1.1.2.1 diff -C2 -d -r1.1 -r1.1.2.1 *** OptionConfig.py 1 Dec 2002 04:11:39 -0000 1.1 --- OptionConfig.py 10 Jan 2003 10:41:05 -0000 1.1.2.1 *************** *** 30,34 **** import SmarterHTTPServer import BaseHTTPServer ! from Options import options import re from cStringIO import StringIO --- 30,34 ---- import SmarterHTTPServer import BaseHTTPServer ! from spambayes.Options import options import re from cStringIO import StringIO Index: hammiefilter.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/hammiefilter.py,v retrieving revision 1.5 retrieving revision 1.5.2.1 diff -C2 -d -r1.5 -r1.5.2.1 *** hammiefilter.py 2 Dec 2002 06:02:34 -0000 1.5 --- hammiefilter.py 10 Jan 2003 10:41:06 -0000 1.5.2.1 *************** *** 40,46 **** import sys import getopt ! import hammie ! import Options ! import StringIO # See Options.py for explanations of these properties --- 40,44 ---- import sys import getopt ! from spambayes import hammie, Options, StringIO # See Options.py for explanations of these properties Index: hammiesrv.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/hammiesrv.py,v retrieving revision 1.10 retrieving revision 1.10.4.1 diff -C2 -d -r1.10 -r1.10.4.1 *** hammiesrv.py 7 Nov 2002 22:30:06 -0000 1.10 --- hammiesrv.py 10 Jan 2003 10:41:06 -0000 1.10.4.1 *************** *** 28,32 **** import traceback import xmlrpclib ! import hammie try: --- 28,32 ---- import traceback import xmlrpclib ! 
from spambayes import hammie try: Index: mailsort.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/mailsort.py,v retrieving revision 1.1 retrieving revision 1.1.2.1 diff -C2 -d -r1.1 -r1.1.2.1 *** mailsort.py 29 Nov 2002 00:57:23 -0000 1.1 --- mailsort.py 10 Jan 2003 10:41:06 -0000 1.1.2.1 *************** *** 23,31 **** import socket import email ! import mboxutils ! import cdb ! from tokenizer import tokenize ! import classifier --- 23,31 ---- import socket import email ! from spambayes import mboxutils ! from spambayes import cdb ! from spambayes.tokenizer import tokenize ! from spambayes import classifier Index: mboxtrain.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/mboxtrain.py,v retrieving revision 1.2 retrieving revision 1.2.2.1 diff -C2 -d -r1.2 -r1.2.2.1 *** mboxtrain.py 11 Dec 2002 16:21:15 -0000 1.2 --- mboxtrain.py 10 Jan 2003 10:41:06 -0000 1.2.2.1 *************** *** 35,43 **** """ ! import mboxutils ! import getopt ! import hammie ! import sys ! import os program = sys.argv[0] --- 35,40 ---- """ ! import sys, os, getopt ! from spambayes import hammie, mboxutils program = sys.argv[0] Index: pop3graph.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/pop3graph.py,v retrieving revision 1.1 retrieving revision 1.1.2.1 diff -C2 -d -r1.1 -r1.1.2.1 *** pop3graph.py 20 Nov 2002 12:30:16 -0000 1.1 --- pop3graph.py 10 Jan 2003 10:41:06 -0000 1.1.2.1 *************** *** 5,11 **** from __future__ import division ! import sys, mboxutils ! from FileCorpus import FileCorpus, FileMessageFactory, GzipFileMessageFactory ! from Options import options def main(): --- 5,12 ---- from __future__ import division ! import sys ! from spambayes import mboxutils ! from spambayes.FileCorpus import FileCorpus, FileMessageFactory, GzipFileMessageFactory ! 
from spambayes.Options import options def main(): Index: pop3proxy.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/pop3proxy.py,v retrieving revision 1.32 retrieving revision 1.32.2.1 diff -C2 -d -r1.32 -r1.32.2.1 *** pop3proxy.py 3 Dec 2002 21:22:22 -0000 1.32 --- pop3proxy.py 10 Jan 2003 10:41:06 -0000 1.32.2.1 *************** *** 141,149 **** import os, sys, re, operator, errno, getopt, string, cStringIO, time, bisect import socket, asyncore, asynchat, cgi, urlparse, webbrowser ! import mailbox, storage, tokenizer, mboxutils, email.Header ! from FileCorpus import FileCorpus, ExpiryFileCorpus ! from FileCorpus import FileMessageFactory, GzipFileMessageFactory from email.Iterators import typed_subpart_iterator ! from Options import options # HEADER_EXAMPLE is the longest possible header - the length of this one --- 141,150 ---- import os, sys, re, operator, errno, getopt, string, cStringIO, time, bisect import socket, asyncore, asynchat, cgi, urlparse, webbrowser ! import mailbox, email.Header ! from spambayes import storage, tokenizer, mboxutils ! from spambayes.FileCorpus import FileCorpus, ExpiryFileCorpus ! from spambayes.FileCorpus import FileMessageFactory, GzipFileMessageFactory from email.Iterators import typed_subpart_iterator ! 
from spambayes.Options import options # HEADER_EXAMPLE is the longest possible header - the length of this one Index: setup.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/setup.py,v retrieving revision 1.10 retrieving revision 1.10.2.1 diff -C2 -d -r1.10 -r1.10.2.1 *** setup.py 11 Dec 2002 04:56:30 -0000 1.10 --- setup.py 10 Jan 2003 10:41:06 -0000 1.10.2.1 *************** *** 1,4 **** --- 1,17 ---- from distutils.core import setup + import sys, email + if email.__version__ < '2.4.3': + print "Error: email package version < 2.4.3 found - need newer version" + print "See INTEGRATION.txt for download information" + sys.exit(0) + + compat_mods = [] + try: + import sets, heapq + except ImportError: + compat_mods = [ '22compat/sets.py', '22compat/heapq.py', ] + # TODO. Figure distutils magic out here. + setup( name='spambayes', *************** *** 7,37 **** 'hammiecli.py', 'hammiesrv.py', ! 'loosecksum.py', ! 'timtest.py', ! 'timcv.py', ! 'splitndirs.py', ! 'runtest.sh', ! 'rebal.py', ! 'HistToGNU.py', ! 'mboxcount.py', ! 'mboxtest.py', ! 'cmp.py', ! 'table.py', ! 'rates.py', ], ! py_modules=['classifier', ! 'tokenizer', ! 'hammie', ! 'msgs', ! 'storage', ! 'dbmstorage', ! 'Corpus', ! 'hammiebulk', ! 'chi2', ! 'Histogram', ! 'Options', ! 'Tester', ! 'TestDriver', ! 'mboxutils', ! ] ) --- 20,28 ---- 'hammiecli.py', 'hammiesrv.py', ! 'hammiefilter.py', ! 'pop3graph.py', ! 'pop3proxy.py', ], ! ! 
packages = [ 'spambayes', ] ) --- Corpus.py DELETED --- --- CostCounter.py DELETED --- --- FileCorpus.py DELETED --- --- HistToGNU.py DELETED --- --- Histogram.py DELETED --- --- Options.py DELETED --- --- TestDriver.py DELETED --- --- Tester.py DELETED --- --- cdb.py DELETED --- --- chi2.py DELETED --- --- classifier.py DELETED --- --- cmp.py DELETED --- --- dbmstorage.py DELETED --- --- fpfn.py DELETED --- --- hammiebulk.py DELETED --- --- heapq.py DELETED --- --- loosecksum.py DELETED --- --- mboxcount.py DELETED --- --- mboxtest.py DELETED --- --- mboxutils.py DELETED --- --- msgs.py DELETED --- --- optimize.py DELETED --- --- rates.py DELETED --- --- rebal.py DELETED --- --- sets.py DELETED --- --- simplexloop.py DELETED --- --- split.py DELETED --- --- splitn.py DELETED --- --- splitndirs.py DELETED --- --- storage.py DELETED --- --- table.py DELETED --- --- timcv.py DELETED --- --- timtest.py DELETED --- --- tokenizer.py DELETED --- --- weaktest.py DELETED --- From anthonybaxter at users.sourceforge.net Fri Jan 10 02:41:09 2003 From: anthonybaxter at users.sourceforge.net (Anthony Baxter) Date: Fri Jan 10 05:41:14 2003 Subject: [Spambayes-checkins] spambayes/22compat heapq.py,NONE,1.1.2.1 sets.py,NONE,1.1.2.1 Message-ID: Update of /cvsroot/spambayes/spambayes/22compat In directory sc8-pr-cvs1:/tmp/cvs-serv9389/22compat Added Files: Tag: reorg-branch heapq.py sets.py Log Message: Checkpointing before I head home. Still to do: - distutils magic to make sure that the 22compat modules are installed when needed. - Walking through testtools and utilities and fixing imports. - Documentation. hammie works, everything else that people use in day-to-day operation should work - please give it a go. --- NEW FILE: heapq.py --- # -*- coding: Latin-1 -*- """Heap queue algorithm (a.k.a. priority queue). Heaps are arrays for which a[k] <= a[2*k+1] and a[k] <= a[2*k+2] for all k, counting elements from 0. 
For the sake of comparison, non-existing elements are considered to be infinite. The interesting property of a heap is that a[0] is always its smallest element. Usage: heap = [] # creates an empty heap heappush(heap, item) # pushes a new item on the heap item = heappop(heap) # pops the smallest item from the heap item = heap[0] # smallest item on the heap without popping it heapify(x) # transforms list into a heap, in-place, in linear time item = heapreplace(heap, item) # pops and returns smallest item, and adds # new item; the heap size is unchanged Our API differs from textbook heap algorithms as follows: - We use 0-based indexing. This makes the relationship between the index for a node and the indexes for its children slightly less obvious, but is more suitable since Python uses 0-based indexing. - Our heappop() method returns the smallest item, not the largest. These two make it possible to view the heap as a regular Python list without surprises: heap[0] is the smallest item, and heap.sort() maintains the heap invariant! """ # Original code by Kevin O'Connor, augmented by Tim Peters __about__ = """Heap queues [explanation by François Pinard] Heaps are arrays for which a[k] <= a[2*k+1] and a[k] <= a[2*k+2] for all k, counting elements from 0. For the sake of comparison, non-existing elements are considered to be infinite. The interesting property of a heap is that a[0] is always its smallest element. The strange invariant above is meant to be an efficient memory representation for a tournament. The numbers below are `k', not a[k]: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 In the tree above, each cell `k' is topping `2*k+1' and `2*k+2'. In an usual binary tournament we see in sports, each cell is the winner over the two cells it tops, and we can trace the winner down the tree to see all opponents s/he had. However, in many computer applications of such tournaments, we do not need to trace the history of a winner. 
To be more memory efficient, when a winner is promoted, we try to replace it by something else at a lower level, and the rule becomes that a cell and the two cells it tops contain three different items, but the top cell "wins" over the two topped cells. If this heap invariant is protected at all times, index 0 is clearly the overall winner. The simplest algorithmic way to remove it and find the "next" winner is to move some loser (let's say cell 30 in the diagram above) into the 0 position, and then percolate this new 0 down the tree, exchanging values, until the invariant is re-established. This is clearly logarithmic on the total number of items in the tree. By iterating over all items, you get an O(n ln n) sort. A nice feature of this sort is that you can efficiently insert new items while the sort is going on, provided that the inserted items are not "better" than the last 0'th element you extracted. This is especially useful in simulation contexts, where the tree holds all incoming events, and the "win" condition means the smallest scheduled time. When an event schedules other events for execution, they are scheduled into the future, so they can easily go into the heap. So, a heap is a good structure for implementing schedulers (this is what I used for my MIDI sequencer :-). Various structures for implementing schedulers have been extensively studied, and heaps are good for this, as they are reasonably speedy, the speed is almost constant, and the worst case is not much different than the average case. However, there are other representations which are more efficient overall, yet the worst cases might be terrible. Heaps are also very useful in big disk sorts. You most probably all know that a big sort implies producing "runs" (which are pre-sorted sequences, whose size is usually related to the amount of CPU memory), followed by merging passes for these runs, which merging is often very cleverly organised[1].
It is very important that the initial sort produces the longest runs possible. Tournaments are a good way to achieve that. If, using all the memory available to hold a tournament, you replace and percolate items that happen to fit the current run, you'll produce runs which are twice the size of the memory for random input, and much better for input fuzzily ordered. Moreover, if you output the 0'th item on disk and get an input which may not fit in the current tournament (because the value "wins" over the last output value), it cannot fit in the heap, so the size of the heap decreases. The freed memory could be cleverly reused immediately for progressively building a second heap, which grows at exactly the same rate the first heap is melting. When the first heap completely vanishes, you switch heaps and start a new run. Clever and quite effective! In a word, heaps are useful memory structures to know. I use them in a few applications, and I think it is good to keep a `heap' module around. :-) -------------------- [1] The disk balancing algorithms which are current, nowadays, are more annoying than clever, and this is a consequence of the seeking capabilities of the disks. On devices which cannot seek, like big tape drives, the story was quite different, and one had to be very clever to ensure (far in advance) that each tape movement will be the most effective possible (that is, will best participate at "progressing" the merge). Some tapes were even able to read backwards, and this was also used to avoid the rewinding time. Believe me, real good tape sorts were quite spectacular to watch! From all times, sorting has always been a Great Art!
:-) """ def heappush(heap, item): """Push item onto heap, maintaining the heap invariant.""" heap.append(item) _siftdown(heap, 0, len(heap)-1) def heappop(heap): """Pop the smallest item off the heap, maintaining the heap invariant.""" lastelt = heap.pop() # raises appropriate IndexError if heap is empty if heap: returnitem = heap[0] heap[0] = lastelt _siftup(heap, 0) else: returnitem = lastelt return returnitem def heapreplace(heap, item): """Pop and return the current smallest value, and add the new item. This is more efficient than heappop() followed by heappush(), and can be more appropriate when using a fixed-size heap. Note that the value returned may be larger than item! That constrains reasonable uses of this routine. """ returnitem = heap[0] # raises appropriate IndexError if heap is empty heap[0] = item _siftup(heap, 0) return returnitem def heapify(x): """Transform list into a heap, in-place, in O(len(heap)) time.""" n = len(x) # Transform bottom-up. The largest index there's any point to looking at # is the largest with a child index in-range, so must have 2*i + 1 < n, # or i < (n-1)/2. If n is even = 2*j, this is (2*j-1)/2 = j-1/2 so # j-1 is the largest, which is n//2 - 1. If n is odd = 2*j+1, this is # (2*j+1-1)/2 = j so j-1 is the largest, and that's again n//2-1. for i in xrange(n//2 - 1, -1, -1): _siftup(x, i) # 'heap' is a heap at all indices >= startpos, except possibly for pos. pos # is the index of a leaf with a possibly out-of-order value. Restore the # heap invariant. def _siftdown(heap, startpos, pos): newitem = heap[pos] # Follow the path to the root, moving parents down until finding a place # newitem fits. while pos > startpos: parentpos = (pos - 1) >> 1 parent = heap[parentpos] if parent <= newitem: break heap[pos] = parent pos = parentpos heap[pos] = newitem # The child indices of heap index pos are already heaps, and we want to make # a heap at index pos too. 
We do this by bubbling the smaller child of # pos up (and so on with that child's children, etc) until hitting a leaf, # then using _siftdown to move the oddball originally at index pos into place. # # We *could* break out of the loop as soon as we find a pos where newitem <= # both its children, but turns out that's not a good idea, and despite that # many books write the algorithm that way. During a heap pop, the last array # element is sifted in, and that tends to be large, so that comparing it # against values starting from the root usually doesn't pay (= usually doesn't # get us out of the loop early). See Knuth, Volume 3, where this is # explained and quantified in an exercise. # # Cutting the # of comparisons is important, since these routines have no # way to extract "the priority" from an array element, so that intelligence # is likely to be hiding in custom __cmp__ methods, or in array elements # storing (priority, record) tuples. Comparisons are thus potentially # expensive. # # On random arrays of length 1000, making this change cut the number of # comparisons made by heapify() a little, and those made by exhaustive # heappop() a lot, in accord with theory. Here are typical results from 3 # runs (3 just to demonstrate how small the variance is): # # Compares needed by heapify Compares needed by 1000 heapppops # -------------------------- --------------------------------- # 1837 cut to 1663 14996 cut to 8680 # 1855 cut to 1659 14966 cut to 8678 # 1847 cut to 1660 15024 cut to 8703 # # Building the heap by using heappush() 1000 times instead required # 2198, 2148, and 2219 compares: heapify() is more efficient, when # you can use it. # # The total compares needed by list.sort() on the same lists were 8627, # 8627, and 8632 (this should be compared to the sum of heapify() and # heappop() compares): list.sort() is (unsurprisingly!) more efficient # for sorting. 
def _siftup(heap, pos): endpos = len(heap) startpos = pos newitem = heap[pos] # Bubble up the smaller child until hitting a leaf. childpos = 2*pos + 1 # leftmost child position while childpos < endpos: # Set childpos to index of smaller child. rightpos = childpos + 1 if rightpos < endpos and heap[rightpos] <= heap[childpos]: childpos = rightpos # Move the smaller child up. heap[pos] = heap[childpos] pos = childpos childpos = 2*pos + 1 # The leaf at pos is empty now. Put newitem there, and bubble it up # to its final resting place (by sifting its parents down). heap[pos] = newitem _siftdown(heap, startpos, pos) if __name__ == "__main__": # Simple sanity test heap = [] data = [1, 3, 5, 7, 9, 2, 4, 6, 8, 0] for item in data: heappush(heap, item) sort = [] while heap: sort.append(heappop(heap)) print sort --- NEW FILE: sets.py --- """Classes to represent arbitrary sets (including sets of sets). This module implements sets using dictionaries whose values are ignored. The usual operations (union, intersection, deletion, etc.) are provided as both methods and operators. Important: sets are not sequences! While they support 'x in s', 'len(s)', and 'for x in s', none of those operations are unique for sequences; for example, mappings support all three as well. The characteristic operation for sequences is subscripting with small integers: s[i], for i in range(len(s)). Sets don't support subscripting at all. Also, sequences allow multiple occurrences and their elements have a definite order; sets on the other hand don't record multiple occurrences and don't remember the order of element insertion (which is why they don't support s[i]). The following classes are provided: BaseSet -- All the operations common to both mutable and immutable sets. This is an abstract class, not meant to be directly instantiated. Set -- Mutable sets, subclass of BaseSet; not hashable. ImmutableSet -- Immutable sets, subclass of BaseSet; hashable.
An iterable argument is mandatory to create an ImmutableSet. _TemporarilyImmutableSet -- Not a subclass of BaseSet: just a wrapper around a Set, hashable, giving the same hash value as the immutable set equivalent would have. Do not use this class directly. Only hashable objects can be added to a Set. In particular, you cannot really add a Set as an element to another Set; if you try, what is actually added is an ImmutableSet built from it (it compares equal to the one you tried adding). When you ask if `x in y' where x is a Set and y is a Set or ImmutableSet, x is wrapped into a _TemporarilyImmutableSet z, and what's tested is actually `z in y'. """ # Code history: # # - Greg V. Wilson wrote the first version, using a different approach # to the mutable/immutable problem, and inheriting from dict. # # - Alex Martelli modified Greg's version to implement the current # Set/ImmutableSet approach, and make the data an attribute. # # - Guido van Rossum rewrote much of the code, made some API changes, # and cleaned up the docstrings. # # - Raymond Hettinger added a number of speedups and other # improvements. __all__ = ['BaseSet', 'Set', 'ImmutableSet'] try: True, False except NameError: # Maintain compatibility with Python 2.2 True, False = 1, 0 class BaseSet(object): """Common base class for mutable and immutable sets.""" __slots__ = ['_data'] # Constructor def __init__(self): """This is an abstract class.""" # Don't call this from a concrete subclass! if self.__class__ is BaseSet: raise TypeError, ("BaseSet is an abstract class. " "Use Set or ImmutableSet.") # Standard protocols: __len__, __repr__, __str__, __iter__ def __len__(self): """Return the number of elements of a set.""" return len(self._data) def __repr__(self): """Return string representation of a set. This looks like 'Set([])'. 
""" return self._repr() # __str__ is the same as __repr__ __str__ = __repr__ def _repr(self, sorted=False): elements = self._data.keys() if sorted: elements.sort() return '%s(%r)' % (self.__class__.__name__, elements) def __iter__(self): """Return an iterator over the elements or a set. This is the keys iterator for the underlying dict. """ return self._data.iterkeys() # Equality comparisons using the underlying dicts def __eq__(self, other): self._binary_sanity_check(other) return self._data == other._data def __ne__(self, other): self._binary_sanity_check(other) return self._data != other._data # Copying operations def copy(self): """Return a shallow copy of a set.""" result = self.__class__() result._data.update(self._data) return result __copy__ = copy # For the copy module def __deepcopy__(self, memo): """Return a deep copy of a set; used by copy module.""" # This pre-creates the result and inserts it in the memo # early, in case the deep copy recurses into another reference # to this same set. A set can't be an element of itself, but # it can certainly contain an object that has a reference to # itself. from copy import deepcopy result = self.__class__() memo[id(self)] = result data = result._data value = True for elt in self: data[deepcopy(elt, memo)] = value return result # Standard set operations: union, intersection, both differences. # Each has an operator version (e.g. __or__, invoked with |) and a # method version (e.g. union). # Subtle: Each pair requires distinct code so that the outcome is # correct when the type of other isn't suitable. For example, if # we did "union = __or__" instead, then Set().union(3) would return # NotImplemented instead of raising TypeError (albeit that *why* it # raises TypeError as-is is also a bit subtle). def __or__(self, other): """Return the union of two sets as a new set. (I.e. all elements that are in either set.) 
""" if not isinstance(other, BaseSet): return NotImplemented result = self.__class__() result._data = self._data.copy() result._data.update(other._data) return result def union(self, other): """Return the union of two sets as a new set. (I.e. all elements that are in either set.) """ return self | other def __and__(self, other): """Return the intersection of two sets as a new set. (I.e. all elements that are in both sets.) """ if not isinstance(other, BaseSet): return NotImplemented if len(self) <= len(other): little, big = self, other else: little, big = other, self common = filter(big._data.has_key, little._data.iterkeys()) return self.__class__(common) def intersection(self, other): """Return the intersection of two sets as a new set. (I.e. all elements that are in both sets.) """ return self & other def __xor__(self, other): """Return the symmetric difference of two sets as a new set. (I.e. all elements that are in exactly one of the sets.) """ if not isinstance(other, BaseSet): return NotImplemented result = self.__class__() data = result._data value = True selfdata = self._data otherdata = other._data for elt in selfdata: if elt not in otherdata: data[elt] = value for elt in otherdata: if elt not in selfdata: data[elt] = value return result def symmetric_difference(self, other): """Return the symmetric difference of two sets as a new set. (I.e. all elements that are in exactly one of the sets.) """ return self ^ other def __sub__(self, other): """Return the difference of two sets as a new Set. (I.e. all elements that are in this set and not in the other.) """ if not isinstance(other, BaseSet): return NotImplemented result = self.__class__() data = result._data otherdata = other._data value = True for elt in self: if elt not in otherdata: data[elt] = value return result def difference(self, other): """Return the difference of two sets as a new Set. (I.e. all elements that are in this set and not in the other.) 
""" return self - other # Membership test def __contains__(self, element): """Report whether an element is a member of a set. (Called in response to the expression `element in self'.) """ try: return element in self._data except TypeError: transform = getattr(element, "_as_temporarily_immutable", None) if transform is None: raise # re-raise the TypeError exception we caught return transform() in self._data # Subset and superset test def issubset(self, other): """Report whether another set contains this set.""" self._binary_sanity_check(other) if len(self) > len(other): # Fast check for obvious cases return False otherdata = other._data for elt in self: if elt not in otherdata: return False return True def issuperset(self, other): """Report whether this set contains another set.""" self._binary_sanity_check(other) if len(self) < len(other): # Fast check for obvious cases return False selfdata = self._data for elt in other: if elt not in selfdata: return False return True # Inequality comparisons using the is-subset relation. __le__ = issubset __ge__ = issuperset def __lt__(self, other): self._binary_sanity_check(other) return len(self) < len(other) and self.issubset(other) def __gt__(self, other): self._binary_sanity_check(other) return len(self) > len(other) and self.issuperset(other) # Assorted helpers def _binary_sanity_check(self, other): # Check that the other argument to a binary operation is also # a set, raising a TypeError otherwise. if not isinstance(other, BaseSet): raise TypeError, "Binary operation only permitted between sets" def _compute_hash(self): # Calculate hash code for a set by xor'ing the hash codes of # the elements. This ensures that the hash code does not depend # on the order in which elements are added to the set. This is # not called __hash__ because a BaseSet should not be hashable; # only an ImmutableSet is hashable. 
result = 0 for elt in self: result ^= hash(elt) return result def _update(self, iterable): # The main loop for update() and the subclass __init__() methods. data = self._data # Use the fast update() method when a dictionary is available. if isinstance(iterable, BaseSet): data.update(iterable._data) return if isinstance(iterable, dict): data.update(iterable) return value = True it = iter(iterable) while True: try: for element in it: data[element] = value return except TypeError: transform = getattr(element, "_as_immutable", None) if transform is None: raise # re-raise the TypeError exception we caught data[transform()] = value class ImmutableSet(BaseSet): """Immutable set class.""" __slots__ = ['_hashcode'] # BaseSet + hashing def __init__(self, iterable=None): """Construct an immutable set from an optional iterable.""" self._hashcode = None self._data = {} if iterable is not None: self._update(iterable) def __hash__(self): if self._hashcode is None: self._hashcode = self._compute_hash() return self._hashcode class Set(BaseSet): """ Mutable set class.""" __slots__ = [] # BaseSet + operations requiring mutability; no hashing def __init__(self, iterable=None): """Construct a set from an optional iterable.""" self._data = {} if iterable is not None: self._update(iterable) def __hash__(self): """A Set cannot be hashed.""" # We inherit object.__hash__, so we must deny this explicitly raise TypeError, "Can't hash a Set, only an ImmutableSet." # In-place union, intersection, differences. # Subtle: The xyz_update() functions deliberately return None, # as do all mutating operations on built-in container types. # The __xyz__ spellings have to return self, though. 
def __ior__(self, other): """Update a set with the union of itself and another.""" self._binary_sanity_check(other) self._data.update(other._data) return self def union_update(self, other): """Update a set with the union of itself and another.""" self |= other def __iand__(self, other): """Update a set with the intersection of itself and another.""" self._binary_sanity_check(other) self._data = (self & other)._data return self def intersection_update(self, other): """Update a set with the intersection of itself and another.""" self &= other def __ixor__(self, other): """Update a set with the symmetric difference of itself and another.""" self._binary_sanity_check(other) data = self._data value = True for elt in other: if elt in data: del data[elt] else: data[elt] = value return self def symmetric_difference_update(self, other): """Update a set with the symmetric difference of itself and another.""" self ^= other def __isub__(self, other): """Remove all elements of another set from this set.""" self._binary_sanity_check(other) data = self._data for elt in other: if elt in data: del data[elt] return self def difference_update(self, other): """Remove all elements of another set from this set.""" self -= other # Python dict-like mass mutations: update, clear def update(self, iterable): """Add all values from an iterable (such as a list or file).""" self._update(iterable) def clear(self): """Remove all elements from this set.""" self._data.clear() # Single-element mutations: add, remove, discard def add(self, element): """Add an element to a set. This has no effect if the element is already present. """ try: self._data[element] = True except TypeError: transform = getattr(element, "_as_immutable", None) if transform is None: raise # re-raise the TypeError exception we caught self._data[transform()] = True def remove(self, element): """Remove an element from a set; it must be a member. If the element is not a member, raise a KeyError. 
""" try: del self._data[element] except TypeError: transform = getattr(element, "_as_temporarily_immutable", None) if transform is None: raise # re-raise the TypeError exception we caught del self._data[transform()] def discard(self, element): """Remove an element from a set if it is a member. If the element is not a member, do nothing. """ try: self.remove(element) except KeyError: pass def pop(self): """Remove and return an arbitrary set element.""" return self._data.popitem()[0] def _as_immutable(self): # Return a copy of self as an immutable set return ImmutableSet(self) def _as_temporarily_immutable(self): # Return self wrapped in a temporarily immutable set return _TemporarilyImmutableSet(self) class _TemporarilyImmutableSet(BaseSet): # Wrap a mutable set as if it was temporarily immutable. # This only supplies hashing and equality comparisons. def __init__(self, set): self._set = set self._data = set._data # Needed by ImmutableSet.__eq__() def __hash__(self): return self._set._compute_hash() From anthonybaxter at users.sourceforge.net Fri Jan 10 02:41:10 2003 From: anthonybaxter at users.sourceforge.net (Anthony Baxter) Date: Fri Jan 10 05:41:15 2003 Subject: [Spambayes-checkins] spambayes/Outlook2000 manager.py,1.40,1.40.2.1 msgstore.py,1.36,1.36.2.1 train.py,1.22,1.22.2.1 Message-ID: Update of /cvsroot/spambayes/spambayes/Outlook2000 In directory sc8-pr-cvs1:/tmp/cvs-serv9389/Outlook2000 Modified Files: Tag: reorg-branch manager.py msgstore.py train.py Log Message: Checkpointing before I head home. Still to do: - distutils magic to make sure that the 22compat modules are installed when needed. - Walking through testtools and utilities and fixing imports. - Documentation. hammie works, everything else that people use in day-to-day operation should work - please give it a go. 
Index: manager.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/manager.py,v retrieving revision 1.40 retrieving revision 1.40.2.1 diff -C2 -d -r1.40 -r1.40.2.1 *** manager.py 10 Jan 2003 00:33:10 -0000 1.40 --- manager.py 10 Jan 2003 10:41:07 -0000 1.40.2.1 *************** *** 44,48 **** os.environ["BAYESCUSTOMIZE"] = ini_filename try: ! import classifier except ImportError: parent = os.path.abspath(os.path.join(os.path.dirname(this_filename), --- 44,48 ---- os.environ["BAYESCUSTOMIZE"] = ini_filename try: ! from spambayes import classifier except ImportError: parent = os.path.abspath(os.path.join(os.path.dirname(this_filename), *************** *** 50,55 **** sys.path.insert(0, parent) ! import classifier ! from tokenizer import tokenize bayes_classifier = classifier bayes_tokenize = tokenize --- 50,55 ---- sys.path.insert(0, parent) ! from spambayes import classifier ! from spambayes.tokenizer import tokenize bayes_classifier = classifier bayes_tokenize = tokenize Index: msgstore.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/msgstore.py,v retrieving revision 1.36 retrieving revision 1.36.2.1 diff -C2 -d -r1.36 -r1.36.2.1 *** msgstore.py 25 Nov 2002 05:57:41 -0000 1.36 --- msgstore.py 10 Jan 2003 10:41:07 -0000 1.36.2.1 *************** *** 431,435 **** # Note we *dont* look in plain text attachments, which we arguably # should. ! import mboxutils self._EnsureObject() --- 431,435 ---- # Note we *dont* look in plain text attachments, which we arguably # should. ! 
from spambayes import mboxutils self._EnsureObject() Index: train.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/train.py,v retrieving revision 1.22 retrieving revision 1.22.2.1 diff -C2 -d -r1.22 -r1.22.2.1 *** train.py 16 Dec 2002 04:12:00 -0000 1.22 --- train.py 10 Jan 2003 10:41:07 -0000 1.22.2.1 *************** *** 34,38 **** # If re-classified AND rescore = True, then a new score will # be written to the message (so the user can see some effects) ! from tokenizer import tokenize was_spam = mgr.message_db.get(msg.searchkey) --- 34,38 ---- # If re-classified AND rescore = True, then a new score will # be written to the message (so the user can see some effects) ! from spambayes.tokenizer import tokenize was_spam = mgr.message_db.get(msg.searchkey) *************** *** 63,67 **** # False == was_ham def untrain_message(msg, mgr): ! from tokenizer import tokenize stream = msg.GetEmailPackageObject() if been_trained_as_spam(msg, mgr): --- 63,67 ---- # False == was_ham def untrain_message(msg, mgr): ! from spambayes.tokenizer import tokenize stream = msg.GetEmailPackageObject() if been_trained_as_spam(msg, mgr): From anthonybaxter at users.sourceforge.net Fri Jan 10 02:41:10 2003 From: anthonybaxter at users.sourceforge.net (Anthony Baxter) Date: Fri Jan 10 05:41:16 2003 Subject: [Spambayes-checkins] spambayes/pspam/pspam profile.py,1.6,1.6.2.1 Message-ID: Update of /cvsroot/spambayes/spambayes/pspam/pspam In directory sc8-pr-cvs1:/tmp/cvs-serv9389/pspam/pspam Modified Files: Tag: reorg-branch profile.py Log Message: Checkpointing before I head home. Still to do: - distutils magic to make sure that the 22compat modules are installed when needed. - Walking through testtools and utilities and fixing imports. - Documentation. hammie works, everything else that people use in day-to-day operation should work - please give it a go. 
Index: profile.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/pspam/pspam/profile.py,v retrieving revision 1.6 retrieving revision 1.6.2.1 diff -C2 -d -r1.6 -r1.6.2.1 *** profile.py 26 Nov 2002 17:16:35 -0000 1.6 --- profile.py 10 Jan 2003 10:41:07 -0000 1.6.2.1 *************** *** 6,11 **** from BTrees.OOBTree import OOBTree ! import classifier ! from tokenizer import tokenize from pspam.folder import Folder --- 6,11 ---- from BTrees.OOBTree import OOBTree ! from spambayes import classifier ! from spambayes.tokenizer import tokenize from pspam.folder import Folder From anthonybaxter at users.sourceforge.net Fri Jan 10 02:41:10 2003 From: anthonybaxter at users.sourceforge.net (Anthony Baxter) Date: Fri Jan 10 05:41:20 2003 Subject: [Spambayes-checkins] spambayes/utilities HistToGNU.py,NONE,1.1.2.1 loosecksum.py,NONE,1.1.2.1 mboxcount.py,NONE,1.1.2.1 rebal.py,NONE,1.1.2.1 split.py,NONE,1.1.2.1 splitn.py,NONE,1.1.2.1 splitndirs.py,NONE,1.1.2.1 Message-ID: Update of /cvsroot/spambayes/spambayes/utilities In directory sc8-pr-cvs1:/tmp/cvs-serv9389/utilities Added Files: Tag: reorg-branch HistToGNU.py loosecksum.py mboxcount.py rebal.py split.py splitn.py splitndirs.py Log Message: Checkpointing before I head home. Still to do: - distutils magic to make sure that the 22compat modules are installed when needed. - Walking through testtools and utilities and fixing imports. - Documentation. hammie works, everything else that people use in day-to-day operation should work - please give it a go. --- NEW FILE: HistToGNU.py --- #! /usr/bin/env python """HistToGNU.py Convert saved binary pickle of histograms to gnu plot output Usage: %(program)s [options] [histogrampicklefile ...] 
reads pickle filename from options if not specified writes to stdout """ globalOptions = """ set grid set xtics 5 set xrange [0.0:100.0] """ dataSetOptions="smooth unique" from Options import options from TestDriver import Hist import sys import cPickle as pickle program = sys.argv[0] def usage(code, msg=''): """Print usage message and sys.exit(code).""" if msg: print >> sys.stderr, msg print >> sys.stderr print >> sys.stderr, __doc__ % globals() sys.exit(code) def loadHist(path): """Load the histogram pickle object""" return pickle.load(file(path)) def outputHist(hist, f=sys.stdout): """Output the Hist object to file f""" hist.fill_buckets() for i in range(len(hist.buckets)): n = hist.buckets[i] f.write("%.3f %d\n" % ( (100.0 * i) / hist.nbuckets, n)) def plot(files): """given a list of files, create gnu-plot file""" import cStringIO, os cmd = cStringIO.StringIO() cmd.write(globalOptions) args = [] for file in files: args.append("""'-' %s title "%s" """ % (dataSetOptions, file)) cmd.write('plot %s\n' % ",".join(args)) for file in files: outputHist(loadHist(file), cmd) cmd.write('e\n') cmd.write('pause 100\n') print cmd.getvalue() def main(): import getopt try: opts, args = getopt.getopt(sys.argv[1:], '', []) except getopt.error, msg: usage(1, msg) if not args and options.save_histogram_pickles: args = [] for f in ('ham', 'spam'): fname = "%s_%shist.pik" % (options.pickle_basename, f) args.append(fname) if args: plot(args) else: print "could not locate any files to plot" if __name__ == "__main__": main() --- NEW FILE: loosecksum.py --- #!/usr/local/bin/python """ Compute a 'loose' checksum on the msg (file on cmdline or via stdin). Attempts are made to eliminate content which tends to obscure the 'sameness' of messages. This is aimed particularly at spam, which tends to contain lots of small differences across messages to try and thwart spam filters, in hopes that at least one copy reaches its destination.
Before calculating the checksum, this script does the following: * delete the message header * delete HTML tags which generally contain URLs * delete anything which looks like an email address or URL * finally, discard everything other than ascii letters and digits (note that this will almost certainly be ineffectual for spam written in eastern languages such as Korean) An MD5 checksum is then computed for the resulting text and written to stdout. """ import getopt import sys import email.Parser import md5 import re import time import binascii def zaptags(data, *tags): """delete all tags (and /tags) from input data given as arguments""" for pat in tags: pat = pat.split(":") sub = "" if len(pat) >= 2: sub = pat[-1] pat = ":".join(pat[:-1]) else: pat = pat[0] sub = "" if '\\' in sub: sub = _zap_esc_map(sub) try: data = re.sub(r'(?i)]*)?>'%pat, sub, data) except TypeError: print (pat, sub, data) raise return data def clean(data): """Clean the obviously variable stuff from a chunk of data. The first (and perhaps only) use of this is to try and eliminate bits of data that keep multiple spam email messages from looking the same. """ # Get rid of any HTML tags that hold URLs - tend to have varying content # I suppose i could just get rid of all HTML tags data = zaptags(data, 'a', 'img', 'base', 'frame') # delete anything that looks like an email address data = re.sub(r"(?i)[-a-z0-9_.+]+@[-a-z0-9_.]+\.([a-z]+)", "", data) # delete anything that looks like a url (catch bare urls) data = re.sub(r"(?i)(ftp|http|gopher)://[-a-z0-9_/?&%@=+:;#!~|.,$*]+", "", data) # delete pmguid: stuff (turns up frequently) data = re.sub(r"pmguid:[^.\s]+(\.[^.\s]+)*", "", data) # throw away everything other than alpha & digits return re.sub(r"[^A-Za-z0-9]+", "", data) def flatten(obj): # I do not know how to use the email package very well - all I want here # is the body of obj expressed as a string - there is probably a better # way to accomplish this which I haven't discovered. 
# three types are possible: string, Message (hasattr(get_payload)), list if isinstance(obj, str): return obj if hasattr(obj, "get_payload"): return flatten(obj.get_payload()) if isinstance(obj, list): return "\n".join([flatten(b) for b in obj]) raise TypeError, ("unrecognized body type: %s" % type(obj)) def generate_checksum(f): body = flatten(email.Parser.Parser().parse(f)) return binascii.b2a_hex(md5.new(clean(body)).digest()) def main(args): opts, args = getopt.getopt(args, "") for opt, arg in opts: pass if not args: inf = sys.stdin else: inf = file(args[0]) print generate_checksum(inf) if __name__ == "__main__": main(sys.argv[1:]) --- NEW FILE: mboxcount.py --- #! /usr/bin/env python """Count the number of messages in Unix mboxes. Usage: %(programs)s [-g] [-h] path1 ... Options: -h Print this help message and exit -g Do globbing on each path. This is helpful on Windows, where the native shells don't glob. """ """ Stats for Barry's corpora, as of 26-Aug-2002, using then-current 2.3a0: edu-sig-clean.mbox 252 (+ unparseable: 0) python-dev-clean.mbox 8326 (+ unparseable: 0) mailman-developers-clean.mbox 2427 (+ unparseable: 0) python-list-clean.mbox 159072 (+ unparseable: 2) zope3-clean.mbox 2177 (+ unparseable: 0) Unparseable messages are likely spam. zope3-clean.mbox is really from the zope3-dev mailing list. The Python version matters because the email package varies across releases in whether it uses strict or lax parsing. 
""" import sys import mailbox import email import getopt import glob from mboxutils import get_message try: True, False except NameError: # Maintain compatibility with Python 2.2 True, False = 1, 0 program = sys.argv[0] def usage(code, msg=''): print >> sys.stderr, __doc__ if msg: print >> sys.stderr, msg sys.exit(code) def count(fname): fp = open(fname, 'rb') mbox = mailbox.PortableUnixMailbox(fp, get_message) goodcount = 0 badcount = 0 for msg in mbox: if msg["to"] is None and msg["cc"] is None: badcount += 1 else: goodcount += 1 fp.close() return goodcount, badcount def main(): try: opts, args = getopt.getopt(sys.argv[1:], 'hg', ['help']) except getopt.error, msg: usage(1, msg) doglob = False for opt, arg in opts: if opt in ('-h', '--help'): usage(0) elif opt == '-g': doglob = True for path in args: if doglob: fnames = glob.glob(path) else: fnames = [path] for fname in fnames: goodn, badn = count(fname) print "%-35s %7d (+ unparseable: %d)" % (fname, goodn, badn) if __name__ == '__main__': main() --- NEW FILE: rebal.py --- #!/usr/bin/env python """ rebal.py - rebalance a ham or spam directory, moving files to or from a reservoir directory as necessary. usage: rebal.py [ options ] options: -d - dry run; display what would be moved, but don't do it [%(DRYRUN)s] -r res - specify an alternate reservoir [%(RESDIR)s] -s set - specify an alternate Set pfx [%(SETPFX)s] -n num - specify number of files per Set dir desired [%(NPERDIR)s] -v - tell user what's happening [%(VERBOSE)s] -q - be quiet about what's happening [not %(VERBOSE)s] -c - confirm file moves into Set directory [%(CONFIRM)s] -Q - don't confirm moves; this is independent of -v/-q The script will work with a variable number of Set directories, but they must already exist. 
Example: rebal.py -r reservoir -s Set -n 300 This will move random files between the directory 'reservoir' and the various subdirectories prefixed with 'Set', making sure no more than 300 files are left in the 'Set' directories when finished. Example: Suppose you want to shuffle your Set files around, winding up with 300 files in each one, you can execute: rebal.py -n 0 rebal.py -n 300 The first run will move all files from the various Data/Ham/Set directories to the Data/Ham/reservoir directory. The second run will randomly parcel out 300 files to each of the Data/Ham/Set directories. """ import os import sys import random import glob import getopt try: True, False except NameError: # Maintain compatibility with Python 2.2 True, False = 1, 0 # defaults NPERDIR = 4000 RESDIR = 'Data/Ham/reservoir' SETPFX = 'Data/Ham/Set' VERBOSE = True CONFIRM = True DRYRUN = False def usage(msg): msg = str(msg) if msg: print >> sys.stderr, msg print >> sys.stderr, """\ usage: rebal.py [ options ] options: -d - dry run; display what would be moved, but don't do it [%(DRYRUN)s] -r res - specify an alternate reservoir [%(RESDIR)s] -s set - specify an alternate Set pfx [%(SETPFX)s] -n num - specify number of files per dir [%(NPERDIR)s] -v - tell user what's happening [%(VERBOSE)s] -q - be quiet about what's happening [not %(VERBOSE)s] -c - confirm file moves into Set directory [%(CONFIRM)s] -Q - be quiet and don't confirm moves """ % globals() def migrate(f, dir, verbose): """rename f into dir, making sure to avoid name clashes.""" base = os.path.split(f)[-1] out = os.path.join(dir, base) while os.path.exists(out): basename, ext = os.path.splitext(base) digits = random.randrange(100000000) out = os.path.join(dir, str(digits) + ext) if verbose: print "moving", f, "to", out os.rename(f, out) def main(args): nperdir = NPERDIR resdir = RESDIR setpfx = SETPFX verbose = VERBOSE confirm = CONFIRM dryrun = DRYRUN try: opts, args = getopt.getopt(args, "dr:s:n:vqcQh") except getopt.GetoptError, 
msg: usage(msg) return 1 for opt, arg in opts: if opt == "-n": nperdir = int(arg) elif opt == "-r": resdir = arg elif opt == "-s": setpfx = arg elif opt == "-v": verbose = True elif opt == "-c": confirm = True elif opt == "-q": verbose = False elif opt == "-Q": confirm = False elif opt == "-d": dryrun = True elif opt == "-h": usage('') return 0 res = os.listdir(resdir) dirs = glob.glob(setpfx+"*") if dirs == []: print >> sys.stderr, "no directories beginning with", setpfx, "exist." return 1 stuff = [] n = len(res) for dir in dirs: fs = os.listdir(dir) n += len(fs) stuff.append((dir, fs)) if nperdir * len(dirs) > n: print >> sys.stderr, "not enough files to go around - use lower -n." return 1 # weak check against mixing ham and spam if (setpfx.find("Ham") >= 0 and resdir.find("Spam") >= 0 or setpfx.find("Spam") >= 0 and resdir.find("Ham") >= 0): yn = raw_input("Reservoir and Set dirs appear not to match. " "Continue? (y/n) ") if yn.lower()[0:1] != 'y': return 1 # if necessary, migrate random files to the reservoir for (dir, fs) in stuff: if nperdir >= len(fs): continue random.shuffle(fs) movethese = fs[nperdir:] del fs[nperdir:] if dryrun: print "would move", len(movethese), "files from", dir, \ "to reservoir", resdir else: for f in movethese: migrate(os.path.join(dir, f), resdir, verbose) res.extend(movethese) # randomize reservoir once so we can just bite chunks from the front random.shuffle(res) # grow Set* directories from the reservoir for (dir, fs) in stuff: if nperdir == len(fs): continue movethese = res[:nperdir-len(fs)] res = res[nperdir-len(fs):] if dryrun: print "would move", len(movethese), "files from reservoir", \ resdir, "to", dir else: for f in movethese: if confirm: print file(os.path.join(resdir, f)).read() ok = raw_input('good enough? ').lower() if not ok.startswith('y'): continue migrate(os.path.join(resdir, f), dir, verbose) fs.extend(movethese) return 0 if __name__ == "__main__": sys.exit(main(sys.argv[1:])) --- NEW FILE: split.py --- #! 
/usr/bin/env python """Split an mbox into two files based on a given percentage. This script will troll through a Unix mbox file randomly assigning each message to one of two bins. The split is based on a given float percentage. E.g. % split.py sourcembox 20 mbox20 mbox80 yields two mbox files, where mbox20 contains approximately 20% of the messages and mbox80 contains 80% of the messages. Messages are assigned to each bin randomly. Usage: %(programs)s [options] sourcembox percent file1 file2 Options: -h / --help Print this help message and exit file1 and file2 are where the output goes. Approximately percent % of messages will go to file1 and (100 - percent) % of messages will go to file2. percent is a floating point number between 1 and 99. sourcembox is a Unix mailbox file. All arguments except -h/--help are required. """ import sys import random import mailbox import email import getopt import mboxutils program = sys.argv[0] def usage(code, msg=''): print >> sys.stderr, __doc__ if msg: print >> sys.stderr, msg sys.exit(code) def main(): try: opts, args = getopt.getopt(sys.argv[1:], 'h', ['help']) except getopt.error, msg: usage(1, msg) bin1 = bin2 = percentage = mboxfile = None for opt, arg in opts: if opt in ('-h', '--help'): usage(0) try: mboxfile = args[0] percent = float(args[1]) if not (0 < percent < 100): raise ValueError percent /= 100.0 bin1 = args[2] bin2 = args[3] except IndexError: usage(1, 'Not enough arguments') except ValueError: usage(1, 'Percent argument must be a float between 1.0 and 99.0') # Cruise bin1out = open(bin1, 'wb') bin2out = open(bin2, 'wb') infp = open(mboxfile, 'rb') mbox = mailbox.PortableUnixMailbox(infp, mboxutils.get_message) for msg in mbox: if random.random() < percent: outfp = bin1out else: outfp = bin2out astext = str(msg) assert astext.endswith('\n') outfp.write(astext) outfp.close() bin1out.close() bin2out.close() if __name__ == '__main__': main() --- NEW FILE: splitn.py --- #! 
/usr/bin/env python """Split an mbox into N random mboxes. Usage: %(program)s [-h] [-s seed] [-v] -n N sourcembox outfilebase Options: -h / --help Print this help message and exit -s seed Seed the random number generator with seed (an integer). By default, use system time at startup to seed. -v Verbose. Displays a period for each 100 messages parsed. May display other stuff. -n N The number of output mboxes desired. This is required. Arguments: sourcembox The mbox to split. outfilebase The base path + name prefix for each of the N output files. Output mboxes have names of the form outfilebase + ("%%d.mbox" %% i) Example: %(program)s -s 123 -n5 spam.mbox rspam produces 5 mboxes, named rspam1.mbox through rspam5.mbox. Each contains a random selection of the messages in spam.mbox, and together they contain every message in spam.mbox exactly once. Each has approximately the same number of messages. spam.mbox is not altered. In addition, the seed for the random number generator is forced to 123, so that while the split is random, it's reproducible. 
""" import sys import random import mailbox import email import getopt import mboxutils try: True, False except NameError: # Maintain compatibility with Python 2.2 True, False = 1, 0 program = sys.argv[0] def usage(code, msg=''): print >> sys.stderr, __doc__ % globals() if msg: print >> sys.stderr, msg sys.exit(code) def main(): try: opts, args = getopt.getopt(sys.argv[1:], 'hn:s:v', ['help']) except getopt.error, msg: usage(1, msg) n = None verbose = False for opt, arg in opts: if opt in ('-h', '--help'): usage(0) elif opt == '-s': random.seed(int(arg)) elif opt == '-n': n = int(arg) elif opt == '-v': verbose = True if n is None or n <= 1: usage(1, "an -n value > 1 is required") if len(args) != 2: usage(1, "input mbox name and output base path are required") inputpath, outputbasepath = args infile = file(inputpath, 'rb') outfiles = [file(outputbasepath + ("%d.mbox" % i), 'wb') for i in range(1, n+1)] mbox = mailbox.PortableUnixMailbox(infile, mboxutils.get_message) counter = 0 for msg in mbox: i = random.randrange(n) astext = str(msg) outfiles[i].write(astext) counter += 1 if verbose: if counter % 100 == 0: print '.', if verbose: print print counter, "messages split into", n, "files" infile.close() for f in outfiles: f.close() if __name__ == '__main__': main() --- NEW FILE: splitndirs.py --- #! /usr/bin/env python """Split an mbox into N random directories of files. Usage: %(program)s [-h] [-g] [-s seed] [-v] -n N sourcembox ... outdirbase Options: -h / --help Print this help message and exit -g Do globbing on each sourcepath. This is helpful on Windows, where the native shells don't glob, or when you have more mboxes than your shell allows you to specify on the commandline. -s seed Seed the random number generator with seed (an integer). By default, use system time at startup to seed. -v Verbose. Displays a period for each 100 messages parsed. May display other stuff. -n N The number of output mboxes desired. This is required. 
Arguments: sourcembox The mbox or path to an mbox to split. outdirbase The base path + name prefix for each of the N output dirs. Output files have names of the form outdirbase + ("Set%%d/%%d" %% (i, n)) Example: %(program)s -s 123 -n5 Data/spam.mbox Data/Spam/Set produces 5 directories, named Data/Spam/Set1 through Data/Spam/Set5. Each contains a random selection of the messages in spam.mbox, and together they contain every message in spam.mbox exactly once. Each has approximately the same number of messages. spam.mbox is not altered. In addition, the seed for the random number generator is forced to 123, so that while the split is random, it's reproducible. """ import sys import os import random import mailbox import email import getopt import glob import mboxutils try: True, False except NameError: # Maintain compatibility with Python 2.2 True, False = 1, 0 program = sys.argv[0] def usage(code, msg=''): print >> sys.stderr, __doc__ % globals() if msg: print >> sys.stderr, msg sys.exit(code) def main(): try: opts, args = getopt.getopt(sys.argv[1:], 'hgn:s:v', ['help']) except getopt.error, msg: usage(1, msg) doglob = False n = None verbose = False for opt, arg in opts: if opt in ('-h', '--help'): usage(0) elif opt == '-g': doglob = True elif opt == '-s': random.seed(int(arg)) elif opt == '-n': n = int(arg) elif opt == '-v': verbose = True if n is None or n <= 1: usage(1, "an -n value > 1 is required") if len(args) < 2: usage(1, "input mbox name and output base path are required") inputpaths, outputbasepath = args[:-1], args[-1] outdirs = [outputbasepath + ("%d" % i) for i in range(1, n+1)] for dir in outdirs: if not os.path.isdir(dir): os.makedirs(dir) counter = 0 for inputpath in inputpaths: if doglob: inpaths = glob.glob(inputpath) else: inpaths = [inputpath] for inpath in inpaths: mbox = mboxutils.getmbox(inpath) for msg in mbox: i = random.randrange(n) astext = str(msg) #assert astext.endswith('\n') counter += 1 msgfile = open('%s/%d' % (outdirs[i], counter), 
'wb') msgfile.write(astext) msgfile.close() if verbose: if counter % 100 == 0: sys.stdout.write('.') sys.stdout.flush() if verbose: print print counter, "messages split into", n, "directories" if __name__ == '__main__': main() From anthonybaxter at users.sourceforge.net Fri Jan 10 02:41:10 2003 From: anthonybaxter at users.sourceforge.net (Anthony Baxter) Date: Fri Jan 10 05:41:22 2003 Subject: [Spambayes-checkins] spambayes/testtools cmp.py,NONE,1.1.2.1 fpfn.py,NONE,1.1.2.1 mboxtest.py,NONE,1.1.2.1 rates.py,NONE,1.1.2.1 simplexloop.py,NONE,1.1.2.1 table.py,NONE,1.1.2.1 timcv.py,NONE,1.1.2.1 timtest.py,NONE,1.1.2.1weaktest.py,NONE,1.1.2.1 Message-ID: Update of /cvsroot/spambayes/spambayes/testtools In directory sc8-pr-cvs1:/tmp/cvs-serv9389/testtools Added Files: Tag: reorg-branch cmp.py fpfn.py mboxtest.py rates.py simplexloop.py table.py timcv.py timtest.py weaktest.py Log Message: Checkpointing before I head home. Still to do: - distutils magic to make sure that the 22compat modules are installed when needed. - Walking through testtools and utilities and fixing imports. - Documentation. hammie works, everything else that people use in day-to-day operation should work - please give it a go. --- NEW FILE: cmp.py --- #!/usr/bin/env python """ cmp.py sbase1 sbase2 Combines output from sbase1.txt and sbase2.txt, which are created by rates.py from timtest.py output, and displays comparison statistics to stdout. """ import sys f1n, f2n = sys.argv[1:3] # Return # (list of all f-p rates, # list of all f-n rates, # total f-p, # total f-n, # average f-p rate, # average f-n rate, # list of all ham score deviations, # list of all spam score deviations, # ham score deviation for all runs, # spam score deviations for all runs, # ) # from summary file f. 
def suck(f): fns = [] fps = [] hamdev = [] spamdev = [] hamdevall = spamdevall = (0.0, 0.0) get = f.readline while 1: line = get() if line.startswith('-> tested'): print line, if line.find(' items; mean ') != -1: # -> Ham distribution for this pair: 1000 items; mean 0.05; sample sdev 0.68 # and later "sample " went away vals = line.split(';') mean = float(vals[1].split()[-1]) sdev = float(vals[2].split()[-1]) val = (mean, sdev) typ = vals[0].split()[2] if line.find('for all runs') != -1: if typ == 'Ham': hamdevall = val else: spamdevall = val elif line.find('all in this') != -1: if typ == 'Ham': hamdev.append(val) else: spamdev.append(val) continue if line.startswith('-> '): continue if line.startswith('total'): break # A line with an f-p rate and an f-n rate. p, n = map(float, line.split()) fps.append(p) fns.append(n) # "total unique false pos 0" # "total unique false neg 0" # "average fp % 0.0" # "average fn % 0.0" fptot = int(line.split()[-1]) fntot = int(get().split()[-1]) fpmean = float(get().split()[-1]) fnmean = float(get().split()[-1]) return (fps, fns, fptot, fntot, fpmean, fnmean, hamdev, spamdev, hamdevall, spamdevall) def tag(p1, p2): if p1 == p2: t = "tied " else: t = p1 < p2 and "lost " or "won " if p1: p = (p2 - p1) * 100.0 / p1 t += " %+7.2f%%" % p else: t += " +(was 0)" return t def mtag(m1, m2): mean1, dev1 = m1 mean2, dev2 = m2 t = "%7.2f %7.2f " % (mean1, mean2) if mean1: mp = (mean2 - mean1) * 100.0 / mean1 t += "%+7.2f%%" % mp else: t += "+(was 0)" t += " %7.2f %7.2f " % (dev1, dev2) if dev1: dp = (dev2 - dev1) * 100.0 / dev1 t += "%+7.2f%%" % dp else: t += "+(was 0)" return t def dump(p1s, p2s): alltags = "" for p1, p2 in zip(p1s, p2s): t = tag(p1, p2) print " %5.3f %5.3f %s" % (p1, p2, t) alltags += t + " " print for t in "won", "tied", "lost": print "%-4s %2d times" % (t, alltags.count(t)) print def dumpdev(meandev1, meandev2): for m1, m2 in zip(meandev1, meandev2): print mtag(m1, m2) def windowsfy(fn): import os if os.path.exists(fn + 
'.txt'): return fn + '.txt' else: return fn print f1n, '->', f2n f1n = windowsfy(f1n) f2n = windowsfy(f2n) (fp1, fn1, fptot1, fntot1, fpmean1, fnmean1, hamdev1, spamdev1, hamdevall1, spamdevall1) = suck(file(f1n)) (fp2, fn2, fptot2, fntot2, fpmean2, fnmean2, hamdev2, spamdev2, hamdevall2, spamdevall2) = suck(file(f2n)) print print "false positive percentages" dump(fp1, fp2) print "total unique fp went from", fptot1, "to", fptot2, tag(fptot1, fptot2) print "mean fp % went from", fpmean1, "to", fpmean2, tag(fpmean1, fpmean2) print print "false negative percentages" dump(fn1, fn2) print "total unique fn went from", fntot1, "to", fntot2, tag(fntot1, fntot2) print "mean fn % went from", fnmean1, "to", fnmean2, tag(fnmean1, fnmean2) print if len(hamdev1) == len(hamdev2) and len(spamdev1) == len(spamdev2): print "ham mean ham sdev" dumpdev(hamdev1, hamdev2) print print "ham mean and sdev for all runs" dumpdev([hamdevall1], [hamdevall2]) print print "spam mean spam sdev" dumpdev(spamdev1, spamdev2) print print "spam mean and sdev for all runs" dumpdev([spamdevall1], [spamdevall2]) print diff1 = spamdevall1[0] - hamdevall1[0] diff2 = spamdevall2[0] - hamdevall2[0] print "ham/spam mean difference: %2.2f %2.2f %+2.2f" % (diff1, diff2, diff2 - diff1) else: print "[info about ham & spam means & sdevs not available in both files]" --- NEW FILE: fpfn.py --- #! 
/usr/bin/env python """Extract false positive and false negative filenames from timcv.py output.""" import sys import re def cmpf(a, b): # Sort function that sorts by numerical value ma = re.search(r'(\d+)/(\d+)$', a) mb = re.search(r'(\d+)/(\d+)$', b) if ma and mb: xa, ya = map(int, ma.groups()) xb, yb = map(int, mb.groups()) return cmp((xa, ya), (xb, yb)) else: return cmp(a, b) def main(): for name in sys.argv[1:]: try: f = open(name + ".txt") except IOError: f = open(name) print "===", name, "===" fp = [] fn = [] for line in f: if line.startswith(' new fp: '): fp.extend(eval(line[12:])) elif line.startswith(' new fn: '): fn.extend(eval(line[12:])) fp.sort(cmpf) fn.sort(cmpf) print "--- fp ---" for x in fp: print x print "--- fn ---" for x in fn: print x if __name__ == '__main__': main() --- NEW FILE: mboxtest.py --- #! /usr/bin/env python """mboxtest.py: A test driver for classifier. Usage: mboxtest.py [options] Options: -f FMT One of unix, mmdf, mh, or qmail. Specifies mailbox format for ham and spam files. Default is unix. -n NSETS Number of test sets to create for a single mailbox. Default is 5. -s SEED Seed for random number generator. Default is 101. -m MSGS Read no more than MSGS messages from mailbox. 
""" from __future__ import generators import getopt import mailbox import random import re from sets import Set import sys from spambayes.tokenizer import tokenize from spambayes.TestDriver import Driver from spambayes.msgs import Msg from spambayes.Options import options try: True, False except NameError: # Maintain compatibility with Python 2.2 True, False = 1, 0 mbox_fmts = {"unix": mailbox.PortableUnixMailbox, "mmdf": mailbox.MmdfMailbox, "mh": mailbox.MHMailbox, "qmail": mailbox.Maildir, } class MboxMsg(Msg): def __init__(self, fp, path, index): self.guts = fp.read() self.tag = "%s:%s %s" % (path, index, subject(self.guts)) def __str__(self): lines = [] i = 0 for line in self.guts.split("\n"): skip = False for skip_prefix in 'X-', 'Received:', '\t',: if line.startswith(skip_prefix): skip = True if skip: continue i += 1 if i > 100: lines.append("... truncated") break lines.append(line) return "\n".join(lines) def __iter__(self): return tokenize(self.guts) class mbox(object): def __init__(self, path, indices=None): self.path = path self.indices = {} self.key = '' if indices is not None: self.key = " %s" % indices[0] for i in indices: self.indices[i] = 1 def __repr__(self): return "" % (self.path, self.key) def __iter__(self): # Use a simple factory that just produces a string. 
mbox = mbox_fmts[FMT](open(self.path, "rb"), lambda f: MboxMsg(f, self.path, i)) i = 0 while 1: msg = mbox.next() if msg is None: return i += 1 if self.indices.get(i-1) or not self.indices: yield msg def subject(buf): buf = buf.lower() i = buf.find('subject:') j = buf.find("\n", i) return buf[i:j] def randindices(nelts, nresults): L = range(nelts) random.shuffle(L) chunk = nelts / nresults for i in range(nresults): yield Set(L[:chunk]) del L[:chunk] def sort(seq): L = list(seq) L.sort() return L def main(args): global FMT print options.display() FMT = "unix" NSETS = 10 SEED = 101 MAXMSGS = None opts, args = getopt.getopt(args, "f:n:s:m:") for k, v in opts: if k == '-f': FMT = v if k == '-n': NSETS = int(v) if k == '-s': SEED = int(v) if k == '-m': MAXMSGS = int(v) ham, spam = args random.seed(SEED) nham = len(list(mbox(ham))) nspam = len(list(mbox(spam))) if MAXMSGS: nham = min(nham, MAXMSGS) nspam = min(nspam, MAXMSGS) print "ham", ham, nham print "spam", spam, nspam ihams = map(tuple, randindices(nham, NSETS)) ispams = map(tuple, randindices(nspam, NSETS)) driver = Driver() for i in range(1, NSETS): driver.train(mbox(ham, ihams[i]), mbox(spam, ispams[i])) i = 0 for iham, ispam in zip(ihams, ispams): hams = mbox(ham, iham) spams = mbox(spam, ispam) if i > 0: driver.untrain(hams, spams) driver.test(hams, spams) driver.finishtest() if i < NSETS - 1: driver.train(hams, spams) i += 1 driver.alldone() if __name__ == "__main__": sys.exit(main(sys.argv[1:])) --- NEW FILE: rates.py --- #!/usr/bin/env python """ rates.py basename ... Assuming that file basename + '.txt' or basename contains output from one of the test drivers (timcv, mboxtest, timtest), scans that file for summary statistics, displays them to stdout, and also writes them to file basename + 's.txt' (where the 's' means 'summary'). This doesn't need a full output file from a test run, and will display stuff for as far as the output file has gotten so far. 
Two of these summary files can later be fed to cmp.py. """ import sys """ -> Training on Data/Ham/Set2-3 & Data/Spam/Set2-3 ... 8000 hams & 5500 spams -> Predicting Data/Ham/Set1 & Data/Spam/Set1 ... -> tested 4000 hams & 2750 spams against 8000 hams & 5500 spams -> false positive %: 0.025 -> false negative %: 0.327272727273 -> 1 new false positives """ def doit(basename): if basename.endswith('.txt'): basename = basename[:-4] try: ifile = file(basename + '.txt') except IOError: ifile = file(basename) interesting = filter(lambda line: line.startswith('-> '), ifile) ifile.close() oname = basename + 's.txt' ofile = file(oname, 'w') print basename, '->', oname def dump(*stuff): msg = ' '.join(map(str, stuff)) print msg print >> ofile, msg ntests = nfn = nfp = 0 sumfnrate = sumfprate = 0.0 for line in interesting: dump(line[:-1]) fields = line.split() # 0 1 2 3 4 5 6 -5 -4 -3 -2 -1 #-> tested 4000 hams & 2750 spams against 8000 hams & 5500 spams if line.startswith('-> tested '): ntests += 1 continue # 0 1 2 3 # -> false positive %: 0.025 # -> false negative %: 0.327272727273 if line.startswith('-> false '): kind = fields[3] percent = float(fields[-1]) if kind == 'positive': sumfprate += percent lastval = percent else: sumfnrate += percent dump(' %7.3f %7.3f' % (lastval, percent)) continue # 0 1 2 3 4 5 # -> 1 new false positives if len(fields) >= 5 and fields[3] == 'new' and fields[4] == 'false': kind = fields[-1] count = int(fields[2]) if kind == 'positives': nfp += count else: nfn += count dump('total unique false pos', nfp) dump('total unique false neg', nfn) dump('average fp %', sumfprate / ntests) dump('average fn %', sumfnrate / ntests) for name in sys.argv[1:]: doit(name) --- NEW FILE: simplexloop.py --- # # Optimize parameters # """Usage: %(program)s [options] -c command Where: -h Show usage and exit. -c command The command to be run, with all its options. The last line of output from this program should match 'YYYYYYY cost: $xxxx.xx' (i.e. 
the third word of the last line should be the value to be minimized, preceded by a dollar sign) I have used "python2.3 timcv.py -n 10 --spam-keep=600 --ham-keep=600 -s 12345" This program will overwrite bayescustomize.ini! """ import sys def usage(code, msg=''): """Print usage message and sys.exit(code).""" if msg: print >> sys.stderr, msg print >> sys.stderr print >> sys.stderr, __doc__ % globals() sys.exit(code) program = sys.argv[0] from spambayes import Options start = (Options.options.unknown_word_prob, Options.options.minimum_prob_strength, Options.options.unknown_word_strength) err = (0.01, 0.01, 0.01) def mkini(vars): f=open('bayescustomize.ini', 'w') f.write(""" [Classifier] unknown_word_prob = %.6f minimum_prob_strength = %.6f unknown_word_strength = %.6f """%tuple(vars)) f.close() def score(vars): import os mkini(vars) status = os.system('%s > loop.out'%command) if status != 0: print >> sys.stderr, "Error status from subcommand" sys.exit(status) f = open('loop.out', 'r') txt = f.readlines() # Extract the flex cost field. cost = float(txt[-1].split()[2][1:]) f.close() os.rename('loop.out','loop.out.old') print ''.join(txt[-20:])[:-1] print "x=%.4f p=%.4f s=%.4f %.2f"%(tuple(vars)+(cost,)) sys.stdout.flush() return -cost def main(): import spambayes.optimize finish=spambayes.optimize.SimplexMaximize(start,err,score) mkini(finish) print "Best result left in bayescustomize.ini" if __name__ == "__main__": import getopt try: opts, args = getopt.getopt(sys.argv[1:], 'hc:') except getopt.error, msg: usage(1, msg) command = None for opt, arg in opts: if opt == '-h': usage(0) elif opt == '-c': command = arg if args: usage(1, "Positional arguments not supported") if command is None: usage(1, "-c is required") main() --- NEW FILE: table.py --- #!/usr/bin/env python """ table.py [-m] base1 base2 ... 
baseN Combines output from base1.txt, base2.txt, etc., which are created by the TestDriver (such as timcv.py) output, and displays tabulated comparison statistics to stdout. Each input file is represented by one column in the table. Optional argument -m shows a final column with the mean value of each statistic. """ # Return # ( # ham tested, # spam tested, # total f-p, # total f-n, # total unsure, # average f-p rate, # average f-n rate, # average unsure rate, # real cost, # best cost, # ham score deviation for all runs, # spam score deviations for all runs, # ) # from summary file f. def suck(f): hamdevall = spamdevall = (0.0, 0.0) cost = 0.0 bestcost = 0.0 fp = 0 fn = 0 un = 0 fpp = 0.0 fnp = 0.0 unp = 0.0 htest = 0 stest = 0 get = f.readline while 1: line = get() if line.startswith('-> tested'): # tested 1910 hams & 948 spams against 2741 hams & 948 spams # 1 2 3 4 5 6 print line, elif line.find(' items; mean ') > 0 and line.find('for all runs') > 0: # Ham scores for all runs: 2741 items; mean 0.86; sdev 6.28 # 0 1 2 vals = line.split(';') mean = float(vals[1].split()[-1]) sdev = float(vals[2].split()[-1]) val = (mean, sdev) ntested = int(vals[0].split()[-2]) typ = vals[0].split()[2] if line.find('for all runs') != -1: if typ == 'Ham': hamdevall = val htest = ntested else: spamdevall = val stest = ntested elif line.startswith('-> best cost for all runs: $'): # -> best cost for all runs: $28.20 bestcost = float(line.split('$')[-1]) elif line.startswith('-> all runs false positives: '): fp = int(line.split()[-1]) elif line.startswith('-> all runs false negatives: '): fn = int(line.split()[-1]) elif line.startswith('-> all runs unsure: '): un = int(line.split()[-1]) elif line.startswith('-> all runs false positive %: '): fpp = float(line.split()[-1]) elif line.startswith('-> all runs false negative %: '): fnp = float(line.split()[-1]) elif line.startswith('-> all runs unsure %: '): unp = float(line.split()[-1]) elif line.startswith('-> all runs cost: '): cost = 
float(line.split('$')[-1]) break return (htest, stest, fp, fn, un, fpp, fnp, unp, cost, bestcost, hamdevall, spamdevall) def windowsfy(fn): import os if os.path.exists(fn + '.txt'): return fn + '.txt' else: return fn def table(): import getopt, sys showMean = 0 fname = "filename: " fnam2 = " " ratio = "ham:spam: " rat2 = " " fptot = "fp total: " fpper = "fp %: " fntot = "fn total: " fnper = "fn %: " untot = "unsure t: " unper = "unsure %: " rcost = "real cost:" bcost = "best cost:" hmean = "h mean: " hsdev = "h sdev: " smean = "s mean: " ssdev = "s sdev: " meand = "mean diff:" kval = "k: " tfptot = tfpper = tfntot = tfnper = tuntot = tunper = trcost = tbcost = \ thmean = thsdev = tsmean = tssdev = tmeand = tkval = 0 args, fileargs = getopt.getopt(sys.argv[1:], 'm') for arg, val in args: if arg == "-m": showMean = 1 for filename in fileargs: filename = windowsfy(filename) (htest, stest, fp, fn, un, fpp, fnp, unp, cost, bestcost, hamdevall, spamdevall) = suck(file(filename)) if filename.endswith('.txt'): filename = filename[:-4] filename = filename[filename.rfind('/')+1:] filename = filename[filename.rfind("\\")+1:] if len(fname) > len(fnam2): fname += " " fname = fname[0:(len(fnam2) + 8)] fnam2 += " %7s" % filename else: fnam2 += " " fnam2 = fnam2[0:(len(fname) + 8)] fname += " %7s" % filename if len(ratio) > len(rat2): ratio += " " ratio = ratio[0:(len(rat2) + 8)] rat2 += " %7s" % ("%d:%d" % (htest, stest)) else: rat2 += " " rat2 = rat2[0:(len(ratio) + 8)] ratio += " %7s" % ("%d:%d" % (htest, stest)) fptot += "%8d" % fp tfptot += fp fpper += "%8.2f" % fpp tfpper += fpp fntot += "%8d" % fn tfntot += fn fnper += "%8.2f" % fnp tfnper += fnp untot += "%8d" % un tuntot += un unper += "%8.2f" % unp tunper += unp rcost += "%8s" % ("$%.2f" % cost) trcost += cost bcost += "%8s" % ("$%.2f" % bestcost) tbcost += bestcost hmean += "%8.2f" % hamdevall[0] thmean += hamdevall[0] hsdev += "%8.2f" % hamdevall[1] thsdev += hamdevall[1] smean += "%8.2f" % spamdevall[0] tsmean += 
spamdevall[0] ssdev += "%8.2f" % spamdevall[1] tssdev += spamdevall[1] meand += "%8.2f" % (spamdevall[0] - hamdevall[0]) tmeand += (spamdevall[0] - hamdevall[0]) k = (spamdevall[0] - hamdevall[0]) / (spamdevall[1] + hamdevall[1]) kval += "%8.2f" % k tkval += k nfiles = len(fileargs) if nfiles and showMean: fptot += "%12d" % (tfptot/nfiles) fpper += "%12.2f" % (tfpper/nfiles) fntot += "%12d" % (tfntot/nfiles) fnper += "%12.2f" % (tfnper/nfiles) untot += "%12d" % (tuntot/nfiles) unper += "%12.2f" % (tunper/nfiles) rcost += "%12s" % ("$%.2f" % (trcost/nfiles)) bcost += "%12s" % ("$%.2f" % (tbcost/nfiles)) hmean += "%12.2f" % (thmean/nfiles) hsdev += "%12.2f" % (thsdev/nfiles) smean += "%12.2f" % (tsmean/nfiles) ssdev += "%12.2f" % (tssdev/nfiles) meand += "%12.2f" % (tmeand/nfiles) kval += "%12.2f" % (tkval/nfiles) print fname if len(fnam2.strip()) > 0: print fnam2 print ratio if len(rat2.strip()) > 0: print rat2 print fptot print fpper print fntot print fnper print untot print unper print rcost print bcost print hmean print hsdev print smean print ssdev print meand print kval if __name__ == "__main__": table() --- NEW FILE: timcv.py --- #! /usr/bin/env python # A driver for N-fold cross validation. """Usage: %(program)s [options] -n nsets Where: -h Show usage and exit. -n int Number of Set directories (Data/Spam/Set1, ... and Data/Ham/Set1, ...). This is required. If you only want to use some of the messages in each set, --HamTrain int The maximum number of msgs to use from each Ham set for training. The msgs are chosen randomly. See also the -s option. --SpamTrain int The maximum number of msgs to use from each Spam set for training. The msgs are chosen randomly. See also the -s option. --HamTest int The maximum number of msgs to use from each Ham set for testing. The msgs are chosen randomly. See also the -s option. --SpamTest int The maximum number of msgs to use from each Spam set for testing. The msgs are chosen randomly. See also the -s option. 
--ham-keep int The maximum number of msgs to use from each Ham set for testing and training. The msgs are chosen randomly. See also the -s option. --spam-keep int The maximum number of msgs to use from each Spam set for testing and training. The msgs are chosen randomly. See also the -s option. -s int A seed for the random number generator. Has no effect unless at least on of {--ham-keep, --spam-keep} is specified. If -s isn't specifed, the seed is taken from current time. In addition, an attempt is made to merge bayescustomize.ini into the options. If that exists, it can be used to change the settings in Options.options. """ from __future__ import generators import sys from spambayes.Options import options from spambayes import TestDriver from spambayes import msgs program = sys.argv[0] def usage(code, msg=''): """Print usage message and sys.exit(code).""" if msg: print >> sys.stderr, msg print >> sys.stderr print >> sys.stderr, __doc__ % globals() sys.exit(code) def drive(nsets): print options.display() hamdirs = [options.ham_directories % i for i in range(1, nsets+1)] spamdirs = [options.spam_directories % i for i in range(1, nsets+1)] d = TestDriver.Driver() # Train it on all sets except the first. d.train(msgs.HamStream("%s-%d" % (hamdirs[1], nsets), hamdirs[1:], train=1), msgs.SpamStream("%s-%d" % (spamdirs[1], nsets), spamdirs[1:], train=1)) # Now run nsets times, predicting pair i against all except pair i. for i in range(nsets): h = hamdirs[i] s = spamdirs[i] hamstream = msgs.HamStream(h, [h], train=0) spamstream = msgs.SpamStream(s, [s], train=0) if i > 0: if options.build_each_classifier_from_scratch: # Build a new classifier from the other sets. d.new_classifier() hname = "%s-%d, except %d" % (hamdirs[0], nsets, i+1) h2 = hamdirs[:] del h2[i] sname = "%s-%d, except %d" % (spamdirs[0], nsets, i+1) s2 = spamdirs[:] del s2[i] d.train(msgs.HamStream(hname, h2, train=1), msgs.SpamStream(sname, s2, train=1)) else: # Forget this set. 
d.untrain(hamstream, spamstream) # Predict this set. d.test(hamstream, spamstream) d.finishtest() if i < nsets - 1 and not options.build_each_classifier_from_scratch: # Add this set back in. d.train(hamstream, spamstream) d.alldone() def main(): import getopt try: opts, args = getopt.getopt(sys.argv[1:], 'hn:s:', ['HamTrain=', 'SpamTrain=', 'HamTest=', 'SpamTest=', 'ham-keep=', 'spam-keep=']) except getopt.error, msg: usage(1, msg) nsets = seed = hamtrain = spamtrain = None hamtest = spamtest = hamkeep = spamkeep = None for opt, arg in opts: if opt == '-h': usage(0) elif opt == '-n': nsets = int(arg) elif opt == '-s': seed = int(arg) elif opt == '--HamTest': hamtest = int(arg) elif opt == '--SpamTest': spamtest = int(arg) elif opt == '--HamTrain': hamtrain = int(arg) elif opt == '--SpamTrain': spamtrain = int(arg) elif opt == '--ham-keep': hamkeep = int(arg) elif opt == '--spam-keep': spamkeep = int(arg) if args: usage(1, "Positional arguments not supported") if nsets is None: usage(1, "-n is required") if hamkeep is not None: msgs.setparms(hamkeep, spamkeep, seed=seed) else: msgs.setparms(hamtrain, spamtrain, hamtest, spamtest, seed) drive(nsets) if __name__ == "__main__": main() --- NEW FILE: timtest.py --- #! /usr/bin/env python # A test driver using "the standard" test directory structure. See also # rates.py and cmp.py for summarizing results. This runs an NxN test grid, # skipping the diagonal. """Usage: %(program)s [options] -n nsets Where: -h Show usage and exit. -n int Number of Set directories (Data/Spam/Set1, ... and Data/Ham/Set1, ...). This is required. If you only want to use some of the messages in each set, --ham-keep int The maximum number of msgs to use from each Ham set. The msgs are chosen randomly. See also the -s option. --spam-keep int The maximum number of msgs to use from each Spam set. The msgs are chosen randomly. See also the -s option. -s int A seed for the random number generator. 
Has no effect unless at least on of {--ham-keep, --spam-keep} is specified. If -s isn't specifed, the seed is taken from current time. In addition, an attempt is made to merge bayescustomize.ini into the options. If that exists, it can be used to change the settings in Options.options. """ from __future__ import generators import sys from spambayes.Options import options from spambayes import TestDriver from spambayes import msgs program = sys.argv[0] def usage(code, msg=''): """Print usage message and sys.exit(code).""" if msg: print >> sys.stderr, msg print >> sys.stderr print >> sys.stderr, __doc__ % globals() sys.exit(code) def drive(nsets): print options.display() spamdirs = [options.spam_directories % i for i in range(1, nsets+1)] hamdirs = [options.ham_directories % i for i in range(1, nsets+1)] spamhamdirs = zip(spamdirs, hamdirs) d = TestDriver.Driver() for spamdir, hamdir in spamhamdirs: d.new_classifier() d.train(msgs.HamStream(hamdir, [hamdir]), msgs.SpamStream(spamdir, [spamdir])) for sd2, hd2 in spamhamdirs: if (sd2, hd2) == (spamdir, hamdir): continue d.test(msgs.HamStream(hd2, [hd2]), msgs.SpamStream(sd2, [sd2])) d.finishtest() d.alldone() def main(): import getopt try: opts, args = getopt.getopt(sys.argv[1:], 'hn:s:', ['ham-keep=', 'spam-keep=']) except getopt.error, msg: usage(1, msg) nsets = seed = hamkeep = spamkeep = None for opt, arg in opts: if opt == '-h': usage(0) elif opt == '-n': nsets = int(arg) elif opt == '-s': seed = int(arg) elif opt == '--ham-keep': hamkeep = int(arg) elif opt == '--spam-keep': spamkeep = int(arg) if args: usage(1, "Positional arguments not supported") if nsets is None: usage(1, "-n is required") msgs.setparms(hamkeep, spamkeep, seed=seed) drive(nsets) if __name__ == "__main__": main() --- NEW FILE: weaktest.py --- #! /usr/bin/env python # A test driver using "the standard" test directory structure. # This simulates a user that gets E-mail, and only trains on fp, # fn and unsure messages. 
It starts by training on the first 30 # messages, and from that point on well classified messages will # not be used for training. This can be used to see what the performance # of the scoring algorithm is under such conditions. Questions are: # * How does the size of the database behave over time? # * Does the classification get better over time? # * Are there other combinations of parameters for the classifier # that make this better behaved than the default values? """Usage: %(program)s [options] -n nsets Where: -h Show usage and exit. -n int Number of Set directories (Data/Spam/Set1, ... and Data/Ham/Set1, ...). This is required. -d decider Name of the decider. One of %(decisionkeys)s -m min Minimal number of messages to train on before involving the decider. In addition, an attempt is made to merge bayescustomize.ini into the options. If that exists, it can be used to change the settings in Options.options. """ from __future__ import generators import sys,os from spambayes.Options import options from spambayes import hammie, msgs, CostCounter program = sys.argv[0] debug = 0 def usage(code, msg=''): """Print usage message and sys.exit(code).""" if msg: print >> sys.stderr, msg print >> sys.stderr print >> sys.stderr, __doc__ % globals() sys.exit(code) DONT_TRAIN = None TRAIN_AS_HAM = 1 TRAIN_AS_SPAM = 2 class TrainDecision: def __call__(self,scr,is_spam): if is_spam: return self.spamtrain(scr) else: return self.hamtrain(scr) class UnsureAndFalses(TrainDecision): def spamtrain(self,scr): if scr < options.spam_cutoff: return TRAIN_AS_SPAM def hamtrain(self,scr): if scr > options.ham_cutoff: return TRAIN_AS_HAM class UnsureOnly(TrainDecision): def spamtrain(self,scr): if options.ham_cutoff < scr < options.spam_cutoff: return TRAIN_AS_SPAM def hamtrain(self,scr): if options.ham_cutoff < scr < options.spam_cutoff: return TRAIN_AS_HAM class All(TrainDecision): def spamtrain(self,scr): return TRAIN_AS_SPAM def hamtrain(self,scr): return TRAIN_AS_HAM class 
AllBut0and100(TrainDecision): def spamtrain(self,scr): if scr < 0.995: return TRAIN_AS_SPAM def hamtrain(self,scr): if scr > 0.005: return TRAIN_AS_HAM class OwnDecision(TrainDecision): def hamtrain(self,scr): if scr < options.ham_cutoff: return TRAIN_AS_HAM elif scr > options.spam_cutoff: return TRAIN_AS_SPAM spamtrain = hamtrain class OwnDecisionFNCorrection(OwnDecision): def spamtrain(self,scr): return TRAIN_AS_SPAM decisions={'all': All, 'allbut0and100': AllBut0and100, 'unsureonly': UnsureOnly, 'unsureandfalses': UnsureAndFalses, 'owndecision': OwnDecision, 'owndecision+fn': OwnDecisionFNCorrection, } decisionkeys=decisions.keys() decisionkeys.sort() class FirstN: def __init__(self,n,client): self.client = client self.x = 0 self.n = n def __call__(self,scr,is_spam): self.x += 1 if self.tooearly(): if is_spam: return TRAIN_AS_SPAM else: return TRAIN_AS_HAM else: return self.client(scr,is_spam) def tooearly(self): return self.x < self.n class Updater: def __init__(self,d=None): self.setd(d) def setd(self,d): self.d=d def drive(nsets,decision): print options.display() spamdirs = [options.spam_directories % i for i in range(1, nsets+1)] hamdirs = [options.ham_directories % i for i in range(1, nsets+1)] spamfns = [(x,y,1) for x in spamdirs for y in os.listdir(x)] hamfns = [(x,y,0) for x in hamdirs for y in os.listdir(x)] nham = len(hamfns) nspam = len(spamfns) cc = CostCounter.nodelay() allfns = {} for fn in spamfns+hamfns: allfns[fn] = None d = hammie.open('weaktest.db', False) hamtrain = 0 spamtrain = 0 n = 0 for dir,name, is_spam in allfns.iterkeys(): n += 1 m=msgs.Msg(dir, name).guts if debug > 1: print "trained:%dH+%dS"%(hamtrain,spamtrain) scr=d.score(m) if debug > 1: print "score:%.3f"%scr if not decision.tooearly(): if is_spam: if debug > 0: print "Spam with score %.2f"%scr cc.spam(scr) else: if debug > 0: print "Ham with score %.2f"%scr cc.ham(scr) de = decision(scr,is_spam) if de == TRAIN_AS_SPAM: d.train_spam(m) spamtrain += 1 elif de == TRAIN_AS_HAM: 
d.train_ham(m) hamtrain += 1 if n % 100 == 0: print "%5d trained:%dH+%dS wrds:%d"%( n, hamtrain, spamtrain, len(d.bayes.wordinfo)) print cc print "="*70 print "%5d trained:%dH+%dS wrds:%d"%( n, hamtrain, spamtrain, len(d.bayes.wordinfo)) print cc def main(): global debug import getopt try: opts, args = getopt.getopt(sys.argv[1:], 'vd:hn:m:') except getopt.error, msg: usage(1, msg) nsets = None decision = decisions['unsureonly'] m = 10 for opt, arg in opts: if opt == '-h': usage(0) elif opt == '-n': nsets = int(arg) elif opt == '-v': debug += 1 elif opt == '-m': m = int(arg) elif opt == '-d': if not decisions.has_key(arg): usage(1,'Unknown decisionmaker') decision = decisions[arg] if args: usage(1, "Positional arguments not supported") if nsets is None: usage(1, "-n is required") drive(nsets,decision=FirstN(m,decision())) if __name__ == "__main__": main() From anthonybaxter at users.sourceforge.net Fri Jan 10 02:41:10 2003 From: anthonybaxter at users.sourceforge.net (Anthony Baxter) Date: Sat Jan 11 12:30:41 2003 Subject: [Spambayes-checkins] spambayes/spambayes Corpus.py,NONE,1.1.2.1 CostCounter.py,NONE,1.1.2.1 FileCorpus.py,NONE,1.1.2.1 Histogram.py,NONE,1.1.2.1 Options.py,NONE,1.1.2.1 TestDriver.py,NONE,1.1.2.1 Tester.py,NONE,1.1.2.1 __init__.py,NONE,1.1.2.1 cdb.py,NONE,1.1.2.1 chi2.py,NONE,1.1.2.1 classifier.py,NONE,1.1.2.1 dbmstorage.py,NONE,1.1.2.1 hammie.py,NONE,1.1.2.1 hammiebulk.py,NONE,1.1.2.1 mboxutils.py,NONE,1.1.2.1 msgs.py,NONE,1.1.2.1 optimize.py,NONE,1.1.2.1 storage.py,NONE,1.1.2.1tokenizer.py,NONE,1.1.2.1 Message-ID: Update of /cvsroot/spambayes/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv9389/spambayes Added Files: Tag: reorg-branch Corpus.py CostCounter.py FileCorpus.py Histogram.py Options.py TestDriver.py Tester.py __init__.py cdb.py chi2.py classifier.py dbmstorage.py hammie.py hammiebulk.py mboxutils.py msgs.py optimize.py storage.py tokenizer.py Log Message: Checkpointing before I head home. 
Still to do: - distutils magic to make sure that the 22compat modules are installed when needed. - Walking through testtools and utilities and fixing imports. - Documentation. hammie works, everything else that people use in day-to-day operation should work - please give it a go. --- NEW FILE: Corpus.py --- #! /usr/bin/env python '''Corpus.py - Spambayes corpus management framework. Classes: Corpus - a collection of Messages ExpiryCorpus - a "young" Corpus Message - a subject of Spambayes training MessageFactory - creates a Message Abstract: A corpus is defined as a set of messages that share some common characteristic relative to spamness. Examples might be spam, ham, unsure, or untrained, or "bayes rating between .4 and .6". A corpus is a collection of messages. Corpus is a dictionary that is keyed by the keys of the messages within it. It is iterable, and observable. Observers are notified when a message is added to or removed from the corpus. Corpus is designed to cache message objects. By default, it will only engage in lazy creation of message objects, keeping those objects in memory until the corpus instance itself is destroyed. In large corpora, this could consume a large amount of memory. A cacheSize operand is implemented on the constructor, which is used to limit the *number* of messages currently loaded into memory. The instance variable that implements this cache is Corpus.Corpus.msgs, a dictionary. Access to this variable should be through keys(), [key], or using an iterator. Direct access should not be used, as subclasses that manage their cache may use this variable very differently. Iterating Corpus objects is potentially very expensive, as each message in the corpus will be brought into memory. For large corpora, this could consume a lot of system resources. ExpiryCorpus is designed to keep a corpus of file messages that are guaranteed to be younger than a given age. The age is specified on the constructor, as a number of seconds in the past. 
If a message file was created before that point in time, the message is deemed to be "old" and thus ignored. Access to a message that is deemed to be old will raise KeyError, which should be handled by the corpus user as appropriate. While iterating, KeyError is handled by the iterator, and messages that raise KeyError are ignored. As messages pass their "expiration date," they are eligible for removal from the corpus. To remove them properly, removeExpiredMessages() should be called. As messages are removed, observers are notified. ExpiryCorpus function is included into a concrete Corpus through multiple inheritance. It must be inherited before any inheritance that derives from Corpus. For example: class RealCorpus(Corpus) ... class ExpiryRealCorpus(Corpus.ExpiryCorpus, RealCorpus) ... Messages have substance, which is the textual content of the message. They also have a key, which uniquely defines them within the corpus. This framework makes no assumptions about how or if messages persist. MessageFactory is a required factory class, because Corpus is designed to do lazy initialization of messages and as an abstract class, must know how to create concrete instances of the correct class. To Do: o Suggestions? ''' # This module is part of the spambayes project, which is Copyright 2002 # The Python Software Foundation and is covered by the Python Software # Foundation license. __author__ = "Tim Stone " __credits__ = "Richie Hindle, Tim Peters, all the spambayes contributors." 
from __future__ import generators try: True, False except NameError: # Maintain compatibility with Python 2.2 True, False = 1, 0 def bool(val): return not not val import sys # for output of docstring import time import re from spambayes import tokenizer from spambayes.Options import options SPAM = True HAM = False class Corpus: '''An observable dictionary of Messages''' def __init__(self, factory, cacheSize=-1): '''Constructor(MessageFactory)''' self.msgs = {} # dict of all messages in corpus # value is None if msg not currently loaded self.keysInMemory = [] # keys of messages currently loaded # this *could* be derived by iterating msgs self.cacheSize = cacheSize # max number of messages in memory self.observers = [] # observers of this corpus self.factory = factory # factory for the correct Message subclass def addObserver(self, observer): '''Register an observer, which must implement onAddMessage, onRemoveMessage''' self.observers.append(observer) def addMessage(self, message): '''Add a Message to this corpus''' if options.verbose: print 'adding message %s to corpus' % (message.key()) self.cacheMessage(message) for obs in self.observers: # there is no reason that a Corpus observer MUST be a Trainer # and so it may very well not be interested in AddMessage events # even though right now the only observable events are # training related try: obs.onAddMessage(message) except AttributeError: # ignore if not implemented pass def removeMessage(self, message): '''Remove a Message from this corpus''' key = message.key() if options.verbose: print 'removing message %s from corpus' % (key) self.unCacheMessage(key) del self.msgs[key] for obs in self.observers: # see comments in event loop in addMessage try: obs.onRemoveMessage(message) except AttributeError: pass def cacheMessage(self, message): '''Add a message to the in-memory cache''' # This method should probably not be overridden key = message.key() if options.verbose: print 'placing %s in corpus cache' % (key) 
self.msgs[key] = message # Here is where we manage the in-memory cache size... self.keysInMemory.append(key) if self.cacheSize > 0: # performance optimization if len(self.keysInMemory) > self.cacheSize: keyToFlush = self.keysInMemory[0] self.unCacheMessage(keyToFlush) def unCacheMessage(self, key): '''Remove a message from the in-memory cache''' # This method should probably not be overridden if options.verbose: print 'Flushing %s from corpus cache' % (key) try: ki = self.keysInMemory.index(key) except ValueError: pass else: del self.keysInMemory[ki] self.msgs[key] = None def takeMessage(self, key, fromcorpus): '''Move a Message from another corpus to this corpus''' # XXX Hack: Calling msg.getSubstance() here ensures that the # message substance is in memory. If it isn't, when addMessage() # calls message.store(), which calls message.getSubstance(), that # will try to load the substance from the as-yet-unwritten new file. msg = fromcorpus[key] msg.getSubstance() fromcorpus.removeMessage(msg) self.addMessage(msg) def __getitem__(self, key): '''Corpus is a dictionary''' amsg = self.msgs[key] if not amsg: amsg = self.makeMessage(key) # lazy init, saves memory self.cacheMessage(amsg) return amsg def keys(self): '''Message keys in the Corpus''' return self.msgs.keys() def __iter__(self): '''Corpus is iterable''' for key in self.keys(): try: yield self[key] except KeyError: pass def __str__(self): '''Instance as a printable string''' return self.__repr__() def __repr__(self): '''Instance as a representative string''' raise NotImplementedError def makeMessage(self, key): '''Call the factory to make a message''' # This method will likely be overridden msg = self.factory.create(key) return msg class ExpiryCorpus: '''Corpus of "young" file system artifacts''' def __init__(self, expireBefore): '''Constructor''' self.expireBefore = expireBefore def removeExpiredMessages(self): '''Kill expired messages''' for msg in self: if msg.createTimestamp() < time.time() - 
self.expireBefore: if options.verbose: print 'message %s has expired' % (key) self.removeMessage(msg) class Message: '''Abstract Message class''' def __init__(self): '''Constructor()''' # The text of the message headers and body are held in attributes # called 'hdrtxt' and 'payload', created on demand in __getattr__ # by calling load(), which should in turn call setSubstance(). # This means you don't need to remember to call load() before # using these attributes. def __getattr__(self, attributeName): '''On-demand loading of the message text.''' if attributeName in ('hdrtxt', 'payload'): self.load() return getattr(self, attributeName) def load(self): '''Method to load headers and body''' raise NotImplementedError def store(self): '''Method to persist a message''' raise NotImplementedError def remove(self): '''Method to obliterate a message''' raise NotImplementedError def __repr__(self): '''Instance as a representative string''' raise NotImplementedError def __str__(self): '''Instance as a printable string''' return self.getSubstance() def name(self): '''Message may have a unique human readable name''' return self.__repr__() def key(self): '''The key for this instance''' raise NotImplementedError def setSubstance(self, sub): '''set this message substance''' bodyRE = re.compile(r"\r?\n(\r?\n)(.*)", re.DOTALL+re.MULTILINE) bmatch = bodyRE.search(sub) if bmatch: self.payload = bmatch.group(2) self.hdrtxt = sub[:bmatch.start(2)] def getSubstance(self): '''Return this message substance''' return self.hdrtxt + self.payload def setSpamprob(self, prob): '''Score of the last spamprob calc, may not be persistent''' self.spamprob = prob def tokenize(self): '''Returns substance as tokens''' return tokenizer.tokenize(self.getSubstance()) def createTimeStamp(self): '''Returns the create time of this message''' # Should return a timestamp like time.time() raise NotImplementedError def getFrom(self): '''Return a message From header content''' if self.hdrtxt: match = 
re.search(r'^From:(.*)$', self.hdrtxt, re.MULTILINE) return match.group(1) else: return None def getSubject(self): '''Return a message Subject header contents''' if self.hdrtxt: match = re.search(r'^Subject:(.*)$', self.hdrtxt, re.MULTILINE) return match.group(1) else: return None def getDate(self): '''Return a message Date header contents''' if self.hdrtxt: match = re.search(r'^Date:(.*)$', self.hdrtxt, re.MULTILINE) return match.group(1) else: return None def getHeadersList(self): '''Return a list of message header tuples''' hdrregex = re.compile(r'^([A-Za-z0-9-_]*): ?(.*)$', re.MULTILINE) data = re.sub(r'\r?\n\r?\s',' ',self.hdrtxt,re.MULTILINE) match = hdrregex.findall(data) return match def getHeaders(self): '''Return message headers as text''' return self.hdrtxt def getPayload(self): '''Return the message body''' return self.payload def stripSBDHeader(self): '''Removes the X-Spambayes-Disposition: header from the message''' # This is useful for training, where a spammer may be spoofing # our header, to make sure that our header doesn't become an # overweight clue to hamminess raise NotImplementedError class MessageFactory: '''Abstract Message Factory''' def __init__(self): '''Constructor()''' pass def create(self, key): '''Create a message instance''' raise NotImplementedError if __name__ == '__main__': print >>sys.stderr, __doc__ --- NEW FILE: CostCounter.py --- from spambayes.Options import options class CostCounter: name = "Superclass Cost" def __init__(self): self.total = 0 def spam(self, scr): pass def ham(self, scr): pass def __str__(self): return "%s: $%.4f" % (self.name, self.total) class CompositeCostCounter: def __init__(self,cclist): self.clients = cclist def spam(self, scr): for c in self.clients: c.spam(scr) def ham(self, scr): for c in self.clients: c.ham(scr) def __str__(self): s = [] for c in self.clients: s.append(str(c)) return '\n'.join(s) class DelayedCostCounter(CompositeCostCounter): def __init__(self,cclist): 
CompositeCostCounter.__init__(self,cclist) self.spamscr=[] self.hamscr=[] def spam(self, scr): self.spamscr.append(scr) def ham(self, scr): self.hamscr.append(scr) def __str__(self): for scr in self.spamscr: CompositeCostCounter.spam(self,scr) for scr in self.hamscr: CompositeCostCounter.ham(self,scr) s=[] for line in CompositeCostCounter.__str__(self).split('\n'): s.append('Delayed-'+line) return '\n'.join(s) class CountCostCounter(CostCounter): def __init__(self): CostCounter.__init__(self) self._fp = 0 self._fn = 0 self._unsure = 0 self._unsureham = 0 self._unsurespam = 0 self._spam = 0 self._ham = 0 self._correctham = 0 self._correctspam = 0 self._total = 0 def spam(self, scr): self._total += 1 self._spam += 1 if scr < options.ham_cutoff: self._fn += 1 elif scr < options.spam_cutoff: self._unsure += 1 self._unsurespam += 1 else: self._correctspam += 1 def ham(self, scr): self._total += 1 self._ham += 1 if scr > options.spam_cutoff: self._fp += 1 elif scr > options.ham_cutoff: self._unsure += 1 self._unsureham += 1 else: self._correctham += 1 def __str__(self): return ("Total messages: %d; %d (%.1f%%) ham + %d (%.1f%%) spam\n"%( self._total, self._ham, zd(100.*self._ham,self._total), self._spam, zd(100.*self._spam,self._total))+ "Ham: %d (%.2f%%) ok, %d (%.2f%%) unsure, %d (%.2f%%) fp\n"%( self._correctham, zd(100.*self._correctham,self._ham), self._unsureham, zd(100.*self._unsureham,self._ham), self._fp, zd(100.*self._fp,self._ham))+ "Spam: %d (%.2f%%) ok, %d (%.2f%%) unsure, %d (%.2f%%) fn\n"%( self._correctspam, zd(100.*self._correctspam,self._spam), self._unsurespam, zd(100.*self._unsurespam,self._spam), self._fn, zd(100.*self._fn,self._spam))+ "Score False: %.2f%% Unsure %.2f%%"%( zd(100.*(self._fp+self._fn),self._total), zd(100.*self._unsure,self._total))) def zd(x,y): if y > 0: return x / y else: return 0 class StdCostCounter(CostCounter): name = "Standard Cost" def spam(self, scr): if scr < options.ham_cutoff: self.total += options.best_cutoff_fn_weight 
elif scr < options.spam_cutoff: self.total += options.best_cutoff_unsure_weight def ham(self, scr): if scr > options.spam_cutoff: self.total += options.best_cutoff_fp_weight elif scr > options.ham_cutoff: self.total += options.best_cutoff_unsure_weight class FlexCostCounter(CostCounter): name = "Flex Cost" def _lambda(self, scr): if scr < options.ham_cutoff: return 0 elif scr > options.spam_cutoff: return 1 else: return (scr - options.ham_cutoff) / ( options.spam_cutoff - options.ham_cutoff) def spam(self, scr): self.total += (1 - self._lambda(scr)) * options.best_cutoff_fn_weight def ham(self, scr): self.total += self._lambda(scr) * options.best_cutoff_fp_weight class Flex2CostCounter(FlexCostCounter): name = "Flex**2 Cost" def spam(self, scr): self.total += (1 - self._lambda(scr))**2 * options.best_cutoff_fn_weight def ham(self, scr): self.total += self._lambda(scr)**2 * options.best_cutoff_fp_weight def default(): return CompositeCostCounter([ CountCostCounter(), StdCostCounter(), FlexCostCounter(), Flex2CostCounter(), DelayedCostCounter([ CountCostCounter(), StdCostCounter(), FlexCostCounter(), Flex2CostCounter(), ]) ]) def nodelay(): return CompositeCostCounter([ CountCostCounter(), StdCostCounter(), FlexCostCounter(), Flex2CostCounter(), ]) if __name__=="__main__": cc=default() cc.ham(0) cc.spam(1) cc.ham(0.5) cc.spam(0.5) options.spam_cutoff=0.7 options.ham_cutoff=0.4 print cc --- NEW FILE: FileCorpus.py --- #! /usr/bin/env python """FileCorpus.py - Corpus composed of file system artifacts Classes: FileCorpus - an observable dictionary of FileMessages ExpiryFileCorpus - a FileCorpus of young files FileMessage - a subject of Spambayes training FileMessageFactory - a factory to create FileMessage objects GzipFileMessage - A FileMessage zipped for less storage GzipFileMessageFactory - factory to create GzipFileMessage objects Abstract: These classes are concrete implementations of the Corpus framework. 
FileCorpus is designed to manage corpora that are directories of message files. ExpiryFileCorpus is an ExpiryCorpus of file messages. FileMessage manages messages that are files in the file system. FileMessageFactory is responsible for the creation of FileMessages, in response to requests to a corpus for messages. GzipFileMessage and GzipFileMessageFactory are used to persist messages as zipped files. This can save a bit of persistent storage, though the ability of the compresser to do very much deflation is limited due to the relatively small size of the average textual message. Still, for a large corpus, this could amount to a significant space savings. See Corpus.__doc__ for more information. Test harness: FileCorpus [options] options: -h : show this message -v : execute in verbose mode, useful for general understanding and debugging purposes -g : use GzipFileMessage and GzipFileMessageFactory -s : setup self test, useful for seeing what is going into the test -t : setup and execute a self test. -c : clean up file system after self test Please note that running with -s or -t will create file system artifacts in the current directory. Be sure this doesn't stomp something of yours... The artifacts created are: fctestmisc.bayes fctestclass.bayes fctestspamcorpus/MSG00001 fctestspamcorpus/MSG00002 fctestunsurecorpus/MSG00003 fctestunsurecorpus/MSG00004 fctestunsurecorpus/MSG00005 fctestunsurecorpus/MSG00006 fctesthamcorpus/ After the test has executed, the following file system artifacts (should) will exist: fctestmisc.bayes fctestclass.bayes fctestspamcorpus/MSG00001 fctestspamcorpus/MSG00004 fctesthamcorpus/MSG00002 fctesthamcorpus/MSG00005 fctesthamcorpus/MSG00006 fctestunsurecorpus/ To Do: o Suggestions? """ # This module is part of the spambayes project, which is Copyright 2002 # The Python Software Foundation and is covered by the Python Software # Foundation license. 
__author__ = "Tim Stone " __credits__ = "Richie Hindle, Tim Peters, all the spambayes contributors." from __future__ import generators from spambayes import Corpus from spambayes import storage import sys, os, gzip, fnmatch, getopt, errno, time, stat from spambayes.Options import options class FileCorpus(Corpus.Corpus): def __init__(self, factory, directory, filter='*', cacheSize=250): '''Constructor(FileMessageFactory, corpus directory name, fnmatch filter''' Corpus.Corpus.__init__(self, factory, cacheSize) self.directory = directory self.filter = filter # This assumes that the directory exists. A horrible death occurs # otherwise. We *could* simply create it, but that will likely only # mask errors # This will not pick up any changes to the corpus that are made # through the file system. The key list is established in __init__, # and if anybody stores files in the directory, even if they match # the filter, they won't make it into the key list. The same # problem exists if anybody removes files. This *could* be a problem. # If so, we can maybe override the keys() method to account for this, # but there would be training side-effects... The short of it is that # corpora that are managed by FileCorpus should *only* be managed by # FileCorpus (at least for now). External changes that must be made # to the corpus should for the moment be handled by a complete # retraining. 
for filename in os.listdir(directory): if fnmatch.fnmatch(filename, filter): self.msgs[filename] = None def makeMessage(self, key): '''Ask our factory to make a Message''' msg = self.factory.create(key, self.directory) return msg def addMessage(self, message): '''Add a Message to this corpus''' if not fnmatch.fnmatch(message.key(), self.filter): raise ValueError if options.verbose: print 'adding',message.key(),'to corpus' message.directory = self.directory message.store() # superclass processing *MUST* be done # perform superclass processing *LAST!* Corpus.Corpus.addMessage(self, message) def removeMessage(self, message): '''Remove a Message from this corpus''' if options.verbose: print 'removing',message.key(),'from corpus' message.remove() # superclass processing *MUST* be done # perform superclass processing *LAST!* Corpus.Corpus.removeMessage(self, message) def __repr__(self): '''Instance as a representative string''' nummsgs = len(self.msgs) if nummsgs != 1: s = 's' else: s = '' if options.verbose and nummsgs > 0: lst = ', ' + '%s' % (self.keys()) else: lst = '' return "<%s object at %8.8x, directory: %s, %s message%s%s>" % \ (self.__class__.__name__, \ id(self), \ self.directory, \ nummsgs, s, lst) class ExpiryFileCorpus(Corpus.ExpiryCorpus, FileCorpus): '''FileCorpus of "young" file system artifacts''' def __init__(self, expireBefore, factory, directory, filter='*', cacheSize=250): '''Constructor(FileMessageFactory, corpus directory name, fnmatch filter''' Corpus.ExpiryCorpus.__init__(self, expireBefore) FileCorpus.__init__(self, factory, directory, filter, cacheSize) class FileMessage(Corpus.Message): '''Message that persists as a file system artifact.''' def __init__(self,file_name, directory): '''Constructor(message file name, corpus directory name)''' Corpus.Message.__init__(self) self.file_name = file_name self.directory = directory # No calling of self.load() here - that's done on demand by # Message.__getattr__. 
def pathname(self): '''Derive the pathname of the message file''' return os.path.join(self.directory, self.file_name) def load(self): '''Read the Message substance from the file''' if options.verbose: print 'loading', self.file_name pn = self.pathname() try: fp = open(pn, 'rb') except IOError, e: if e.errno != errno.ENOENT: raise else: self.setSubstance(fp.read()) fp.close() def store(self): '''Write the Message substance to the file''' if options.verbose: print 'storing', self.file_name pn = self.pathname() fp = open(pn, 'wb') fp.write(self.getSubstance()) fp.close() def remove(self): '''Message hara-kiri''' if options.verbose: print 'physically deleting file',self.pathname() os.unlink(self.pathname()) def name(self): '''A unique name for the message''' return self.file_name def key(self): '''The key of this message in the msgs dictionary''' return self.file_name def __repr__(self): '''Instance as a representative string''' elip = '' sub = self.getSubstance() if options.verbose: sub = self.getSubstance() else: if len(sub) > 20: sub = sub[:20] if len(sub) > 40: sub += '...' 
+ sub[-20:] pn = os.path.join(self.directory, self.file_name) return "<%s object at %8.8x, file: %s, %s>" % \ (self.__class__.__name__, \ id(self), pn, sub) def __str__(self): '''Instance as a printable string''' return self.__repr__() def createTimestamp(self): '''Return the create timestamp for the file''' stats = os.stat(self.pathname()) ctime = stats[stat.ST_CTIME] return ctime class FileMessageFactory(Corpus.MessageFactory): '''MessageFactory for FileMessage objects''' def create(self, key, directory): '''Create a message object from a filename in a directory''' return FileMessage(key, directory) class GzipFileMessage(FileMessage): '''Message that persists as a zipped file system artifact.''' def load(self): '''Read the Message substance from the file''' if options.verbose: print 'loading', self.file_name pn = self.pathname() try: fp = gzip.open(pn, 'rb') except IOError, e: if e.errno != errno.ENOENT: raise else: self.setSubstance(fp.read()) fp.close() def store(self): '''Write the Message substance to the file''' if options.verbose: print 'storing', self.file_name pn = self.pathname() gz = gzip.open(pn, 'wb') gz.write(self.getSubstance()) gz.flush() gz.close() class GzipFileMessageFactory(FileMessageFactory): '''MessageFactory for FileMessage objects''' def create(self, key, directory): '''Create a message object from a filename in a directory''' return GzipFileMessage(key, directory) def runTest(useGzip): print 'Executing Test' if useGzip: fmFact = GzipFileMessageFactory() print 'Executing with Gzipped files' else: fmFact = FileMessageFactory() print 'Executing with uncompressed files' print '\n\nCreating two Classifier databases' miscbayes = storage.PickledClassifier('fctestmisc.bayes') classbayes = storage.DBDictClassifier('fctestclass.bayes') print '\n\nSetting up spam corpus' spamcorpus = FileCorpus(fmFact, 'fctestspamcorpus') spamtrainer = storage.SpamTrainer(miscbayes) spamcorpus.addObserver(spamtrainer) anotherspamtrainer = 
storage.SpamTrainer(classbayes, storage.UPDATEPROBS) spamcorpus.addObserver(anotherspamtrainer) keys = spamcorpus.keys() keys.sort() for key in keys: # iterate the list of keys msg = spamcorpus[key] # corpus is a dictionary spamtrainer.train(msg) anotherspamtrainer.train(msg) print '\n\nSetting up ham corpus' hamcorpus = FileCorpus(fmFact, \ 'fctesthamcorpus', \ 'MSG*') hamtrainer = storage.HamTrainer(miscbayes) hamcorpus.addObserver(hamtrainer) hamtrainer.trainAll(hamcorpus) print '\n\nA couple of message related tests' if useGzip: fmClass = GzipFileMessage else: fmClass = FileMessage m1 = fmClass('XMG00001', 'fctestspamcorpus') m1.setSubstance(testmsg2()) print '\n\nAdd a message to hamcorpus that does not match the filter' try: hamcorpus.addMessage(m1) except ValueError: print 'Add failed, test passed' else: print 'Add passed, test failed' print '\n\nThis is the hamcorpus' print hamcorpus print '\n\nThis is the spamcorpus' print spamcorpus print '\n\nSetting up unsure corpus' # the unsure corpus is an expiry corpus with five second expiry # and a cache size of 2 (for testing purposes only...), and # no trainers, since there's no such thing as 'unsure training' unsurecorpus = ExpiryFileCorpus(5, fmFact, \ 'fctestunsurecorpus', 'MSG*', 2) unsurecorpus.removeExpiredMessages() print '\n\nIterate the unsure corpus twice, to make sure cache size \ is managed correctly, and to make sure iteration is repeatable. \ We should not see MSG00003 in this iteration.' for msg in unsurecorpus: print msg.key() # don't print msg, too much information print '...and again' for msg in unsurecorpus: print msg.key() # don't print msg, too much information print '\n\nRemoving expired messages from unsure corpus.' 
unsurecorpus.removeExpiredMessages() print '\n\nTrain with an individual message' anotherhamtrainer = storage.HamTrainer(classbayes) anotherhamtrainer.train(unsurecorpus['MSG00005']) print '\n\nMoving msg00002 from spamcorpus to hamcorpus' hamcorpus.takeMessage('MSG00002', spamcorpus) # Oops. made a mistake... print "\n\nLet's test printing a message" msg = spamcorpus['MSG00001'] print msg print '\n\nThis is some vital information in the message' print 'Date header is',msg.getDate() print 'Subject header is',msg.getSubject() print 'From header is',msg.getFrom() print 'Header text is:',msg.getHeaders() print 'Headers are:',msg.getHeadersList() print 'Body is:',msg.getPayload() print '\n\nClassifying messages in unsure corpus' for msg in unsurecorpus: prob = classbayes.spamprob(msg.tokenize()) print 'Message %s spam probability is %f' % (msg.key(), prob) if prob < options.ham_cutoff: print 'Moving %s from unsurecorpus to hamcorpus, \ based on prob of %f' % (msg.key(), prob) hamcorpus.takeMessage(msg.key(), unsurecorpus) elif prob > options.spam_cutoff: print 'Moving %s from unsurecorpus to spamcorpus, \ based on prob of %f' % (msg.key(), prob) spamcorpus.takeMessage(msg.key(), unsurecorpus) print '\n\nThis is the new hamcorpus' print hamcorpus print '\n\nThis is the new spamcorpus' print spamcorpus print '\n\nThis is the new unsurecorpus' print unsurecorpus print 'unsurecorpus cache contains', unsurecorpus.keysInMemory print 'unsurecorpus msgs dict contains', unsurecorpus.msgs print '\n\nStoring bayes databases' miscbayes.store() classbayes.store() def cleanupTest(): print 'Cleaning up' cleanupDirectory('fctestspamcorpus') cleanupDirectory('fctesthamcorpus') cleanupDirectory('fctestunsurecorpus') if not useExistingDB: try: os.unlink('fctestmisc.bayes') except OSError, e: if e.errno != 2: # errno. raise try: os.unlink('fctestclass.bayes') except OSError, e: if e.errno != 2: # errno. 
raise def cleanupDirectory(dirname): try: flist = os.listdir(dirname) except OSError, e: if e.errno != 3: # errno. raise else: for filename in os.listdir(dirname): fn = os.path.join(dirname, filename) os.unlink(fn) try: os.rmdir(dirname) except OSError, e: if e.errno != 2: # errno. raise def setupTest(useGzip): cleanupTest() print 'Setting up test' # no try blocks here, because if any of this dies, the test # cannot proceed os.mkdir('fctestspamcorpus') os.mkdir('fctesthamcorpus') os.mkdir('fctestunsurecorpus') tm1 = testmsg1() tm2 = testmsg2() if useGzip: fmClass = GzipFileMessage else: fmClass = FileMessage m1 = fmClass('MSG00001', 'fctestspamcorpus') m1.setSubstance(tm1) m1.store() m2 = fmClass('MSG00002', 'fctestspamcorpus') m2.setSubstance(tm2) m2.store() m3 = fmClass('MSG00003', 'fctestunsurecorpus') m3.setSubstance(tm1) m3.store() for x in range(11): time.sleep(1) # make sure MSG00003 has expired if 10-x == 1: s = '' else: s = 's' print 'wait',10-x,'more second%s' % (s) m4 = fmClass('MSG00004', 'fctestunsurecorpus') m4.setSubstance(tm1) m4.store() m5 = fmClass('MSG00005', 'fctestunsurecorpus') m5.setSubstance(tm2) m5.store() m6 = fmClass('MSG00006', 'fctestunsurecorpus') m6.setSubstance(tm2) m6.store() def testmsg1(): return """ X-Hd:skip@pobox.com Mon Nov 04 10:50:49 2002 Received:by mail.powweb.com (mbox timstone) (with Cubic Circle's cucipop (v1.31 1998/05/13) Mon Nov 4 08:50:58 2002) X-From_:skip@mojam.com Mon Nov 4 08:49:03 2002 Return-Path: Delivered-To:timstone@mail.powweb.com Received:from manatee.mojam.com (manatee.mojam.com [199.249.165.175]) by mail.powweb.com (Postfix) with ESMTP id DC95A1BB1D0 for ; Mon, 4 Nov 2002 08:49:02 -0800 (PST) Received:from montanaro.dyndns.org (12-248-11-90.client.attbi.com [12.248.11.90]) by manatee.mojam.com (8.12.1/8.12.1) with ESMTP id gA4Gn0oY029655 for ; Mon, 4 Nov 2002 10:49:00 -0600 Received:from montanaro.dyndns.org (localhost [127.0.0.1]) by montanaro.dyndns.org (8.12.2/8.12.2) with ESMTP id gA4Gn3cP015572 for 
; Mon, 4 Nov 2002 10:49:03 -0600 (CST) Received:(from skip@localhost) by montanaro.dyndns.org (8.12.2/8.12.2/Submit) id gA4Gn37l015569; Mon, 4 Nov 2002 10:49:03 -0600 (CST) From:Skip Montanaro MIME-Version:1.0 Content-Type:text/plain; charset=us-ascii Content- Transfer- Encoding:7bit Message-ID:<15814.42238.882013.702030@montanaro.dyndns.org> Date:Mon, 4 Nov 2002 10:49:02 -0600 To:Four Stones Expressions Subject:Reformat mail to 80 columns? In-Reply-To: References:<8285NLPL5YTTQJGXTAXU3WA8OB2.3dc5e3cc@riven> X-Mailer:VM 7.07 under 21.5 (beta9) "brussels sprouts" XEmacs Lucid Reply-To:skip@pobox.com X-Hammie- Disposition:Unsure 11/4/2002 10:49:02 AM, Skip Montanaro wrote: >(off-list) > >Tim, > >Any chance you can easily generate messages to the spambayes list which wrap >at something between 70 and 78 columns? I find I have to always edit your >messages to read them easily. > >Thanks, > >-- >Skip Montanaro - skip@pobox.com >http://www.mojam.com/ >http://www.musi-cal.com/ > > - Tim www.fourstonesExpressions.com """ def testmsg2(): return """ X-Hd:richie@entrian.com Wed Nov 06 12:05:41 2002 Received:by mail.powweb.com (mbox timstone) (with Cubic Circle's cucipop (v1.31 1998/05/13) Wed Nov 6 10:05:45 2002) X-From_:richie@entrian.com Wed Nov 6 10:05:33 2002 Return-Path: Delivered-To:timstone@mail.powweb.com Received:from anchor-post-31.mail.demon.net (anchor-post-31.mail.demon.net [194.217.242.89]) by mail.powweb.com (Postfix) with ESMTP id 3DC431BB06A for ; Wed, 6 Nov 2002 10:05:33 -0800 (PST) Received:from sundog.demon.co.uk ([158.152.226.183]) by anchor-post-31.mail.demon.net with smtp (Exim 3.35 #1) id 189UYP-000IAw-0V for tim@fourstonesExpressions.com; Wed, 06 Nov 2002 18:05:25 +0000 From:Richie Hindle To:tim@fourstonesExpressions.com Subject:Re: What to call this training stuff Date:Wed, 06 Nov 2002 18:05:56 +0000 Organization:entrian.com Reply-To:richie@entrian.com Message-ID: References: In-Reply-To: X-Mailer:Forte Agent 1.7/32.534 MIME-Version:1.0 
Content-Type:text/plain; charset=us-ascii Content- Transfer- Encoding:7bit X-Hammie- Disposition:Unsure Hi Tim, > Richie, I think we should package these classes I've been writing as > 'corpusManagement.py' What we're really doing here is creating a set of tools > that can be used to manage corpi (?) corpusses (?) corpae (?) whatever... of > messages. Good plan. Minor point of style: mixed-case module names (like class names) tend to have an initial capital: CorpusManagement.py On the name... sorry to disagree about names again, but what does the word 'management' add? This is a module for manipulating corpuses, so I reckon it should be called Corpus. Like Cookie, gzip, zipfile, locale, mailbox... see what I mean? -- Richie Hindle richie@entrian.com""" if __name__ == '__main__': try: opts, args = getopt.getopt(sys.argv[1:], 'estgvhcu') except getopt.error, msg: print >>sys.stderr, str(msg) + '\n\n' + __doc__ sys.exit() options.verbose = False runTestServer = False setupTestServer = False cleanupTestServer = False useGzip = False useExistingDB = False for opt, arg in opts: if opt == '-h': print >>sys.stderr, __doc__ sys.exit() elif opt == '-s': setupTestServer = True elif opt == '-e': runTestServer = True elif opt == '-t': setupTestServer = True runTestServer = True elif opt == '-c': cleanupTestServer = True elif opt == '-v': options.verbose = True elif opt == '-g': useGzip = True elif opt == '-u': useExistingDB = True elif opt == '-v': options.verbose = True if setupTestServer: setupTest(useGzip) if runTestServer: runTest(useGzip) elif cleanupTestServer: cleanupTest() else: print >>sys.stderr, __doc__ --- NEW FILE: Histogram.py --- #! /usr/bin/env python import math from spambayes.Options import options try: True, False except NameError: # Maintain compatibility with Python 2.2 True, False = 1, 0 class Hist: """Simple histograms of float values.""" # Pass None for lo and hi and it will automatically adjust to the min # and max values seen. 
# Note: nbuckets can be passed for backward compatibility. The # display() method can be passed a different nbuckets value. def __init__(self, nbuckets=options.nbuckets, lo=0.0, hi=100.0): self.lo, self.hi = lo, hi self.nbuckets = nbuckets self.buckets = [0] * nbuckets self.data = [] # the raw data points self.stats_uptodate = False # Add a value to the collection. def add(self, x): self.data.append(x) self.stats_uptodate = False # Compute, and set as instance attrs: # n # of data points # The rest are set iff n>0: # min smallest value in collection # max largest value in collection # median midpoint # mean # pct list of (percentile, score) pairs # var variance # sdev population standard deviation (sqrt(variance)) # self.data is also sorted. def compute_stats(self): if self.stats_uptodate: return self.stats_uptodate = True data = self.data n = self.n = len(data) if n == 0: return data.sort() self.min = data[0] self.max = data[-1] if n & 1: self.median = data[n // 2] else: self.median = (data[n // 2] + data[(n-1) // 2]) / 2.0 # Compute mean. # Add in increasing order of magnitude, to minimize roundoff error. if data[0] < 0.0: temp = [(abs(x), x) for x in data] temp.sort() data = [x[1] for x in temp] del temp sum = 0.0 for x in data: sum += x mean = self.mean = sum / n # Compute variance. var = 0.0 for x in data: d = x - mean var += d*d self.var = var / n self.sdev = math.sqrt(self.var) # Compute percentiles. self.pct = pct = [] for p in options.percentiles: assert 0.0 <= p <= 100.0 # In going from data index 0 to index n-1, we move n-1 times. # p% of that is (n-1)*p/100. i = (n-1)*p/1e2 if i < 0: # Just return the smallest. score = data[0] else: whole = int(i) frac = i - whole score = data[whole] if whole < n-1 and frac: # Move frac of the way from this score to the next. score += frac * (data[whole + 1] - score) pct.append((p, score)) # Merge other into self. 
def __iadd__(self, other): self.data.extend(other.data) self.stats_uptodate = False return self def get_lo_hi(self): self.compute_stats() lo, hi = self.lo, self.hi if lo is None: lo = self.min if hi is None: hi = self.max return lo, hi def get_bucketwidth(self): lo, hi = self.get_lo_hi() span = float(hi - lo) return span / self.nbuckets # Set instance var nbuckets to the # of buckets, and buckets to a list # of nbuckets counts. def fill_buckets(self, nbuckets=None): if nbuckets is None: nbuckets = self.nbuckets if nbuckets <= 0: raise ValueError("nbuckets %g > 0 required" % nbuckets) self.nbuckets = nbuckets self.buckets = buckets = [0] * nbuckets # Compute bucket counts. lo, hi = self.get_lo_hi() bucketwidth = self.get_bucketwidth() for x in self.data: i = int((x - lo) / bucketwidth) if i >= nbuckets: i = nbuckets - 1 elif i < 0: i = 0 buckets[i] += 1 # Print a histogram to stdout. # Also sets instance var nbuckets to the # of buckets, and # buckts to a list of nbuckets counts, but only if at least one # data point is in the collection. def display(self, nbuckets=None, WIDTH=61): if nbuckets is None: nbuckets = self.nbuckets if nbuckets <= 0: raise ValueError("nbuckets %g > 0 required" % nbuckets) self.compute_stats() n = self.n if n == 0: return print "%d items; mean %.2f; sdev %.2f" % (n, self.mean, self.sdev) print "-> min %g; median %g; max %g" % (self.min, self.median, self.max) pcts = ['%g%% %g' % x for x in self.pct] print "-> percentiles:", '; '.join(pcts) lo, hi = self.get_lo_hi() if lo > hi: return # hunit is how many items a * represents. A * is printed for # each hunit items, plus any non-zero fraction thereof. self.fill_buckets(nbuckets) biggest = max(self.buckets) hunit, r = divmod(biggest, WIDTH) if r: hunit += 1 print "* =", hunit, "items" # We need ndigits decimal digits to display the largest bucket count. ndigits = len(str(biggest)) # Displaying the bucket boundaries is more troublesome. 
bucketwidth = self.get_bucketwidth() whole_digits = max(len(str(int(lo))), len(str(int(hi - bucketwidth)))) frac_digits = 0 while bucketwidth < 1.0: # Incrementing by bucketwidth may not change the last displayed # digit, so display one more. frac_digits += 1 bucketwidth *= 10.0 format = ("%" + str(whole_digits + 1 + frac_digits) + '.' + str(frac_digits) + 'f %' + str(ndigits) + "d") bucketwidth = self.get_bucketwidth() for i in range(nbuckets): n = self.buckets[i] print format % (lo + i * bucketwidth, n), print '*' * ((n + hunit - 1) // hunit) --- NEW FILE: Options.py --- # Options.options is a globally shared options object. # XXX As this code is, option names must be unique across ini sections, # XXX and must not conflict with OptionsClass method names. import sys, os import StringIO import ConfigParser from sets import Set try: True, False, bool except NameError: # Maintain compatibility with Python 2.2 True, False = 1, 0 def bool(val): return not not val __all__ = ['options'] defaults = """ [Tokenizer] # If true, tokenizer.Tokenizer.tokenize_headers() will tokenize the # contents of each header field just like the text of the message # body, using the name of the header as a tag. Tokens look like # "header:word". The basic approach is simple and effective, but also # very sensitive to biases in the ham and spam collections. For # example, if the ham and spam were collected at different times, # several headers with date/time information will become the best # discriminators. (Not just Date, but Received and X-From_.) basic_header_tokenize: False # If true and basic_header_tokenize is also true, then # basic_header_tokenize is the only action performed. basic_header_tokenize_only: False # If basic_header_tokenize is true, then basic_header_skip is a set of # headers that should be skipped. basic_header_skip: received date x-.* # If true, the first few characters of application/octet-stream sections # are used, undecoded. 
What 'few' means is decided by octet_prefix_size. check_octets: False octet_prefix_size: 5 # Generate tokens just counting the number of instances of each kind of # header line, in a case-sensitive way. # # Depending on data collection, some headers are not safe to count. # For example, if ham is collected from a mailing list but spam from your # regular inbox traffic, the presence of a header like List-Info will be a # very strong ham clue, but a bogus one. In that case, set # count_all_header_lines to False, and adjust safe_headers instead. count_all_header_lines: False # When True, generate a "noheader:HEADERNAME" token for each header in # safe_headers (below) that *doesn't* appear in the headers. This helped # in various of Tim's python.org tests, but appeared to hurt a little in # Anthony Baxter's tests. record_header_absence: False # Like count_all_header_lines, but restricted to headers in this list. # safe_headers is ignored when count_all_header_lines is true, unless # record_header_absence is also true. safe_headers: abuse-reports-to date errors-to from importance in-reply-to message-id mime-version organization received reply-to return-path subject to user-agent x-abuse-info x-complaints-to x-face # A lot of clues can be gotten from IP addresses and names in Received: # headers. Again this can give spectacular results for bogus reasons # if your test corpora are from different sources. Else set this to true. mine_received_headers: False # Mine the following address headers. If you have mixed source corpuses # (as opposed to a mixed sauce walrus, which is delicious!) then you # probably don't want to use 'to' or 'cc') # Address headers will be decoded, and will generate charset tokens as # well as the real address. # others to consider: to, cc, reply-to, errors-to, sender, ... 
address_headers: from # If legitimate mail contains things that look like text to the tokenizer # and turning off this option helps (perhaps binary attachments get # 'defanged' by something upstream from this operation and thus look like # text), this may help, and should be an alert that perhaps the tokenizer is # broken. generate_long_skips: True # Try to capitalize on mail sent to multiple similar addresses. summarize_email_prefixes: False # # Length of words that triggers 'long skips'. Longer than this # triggers a skip. # skip_max_word_size: 12 # Generate tokens which resemble the posting time in 10-minute buckets: # 'time:' hour ':' minute//10 generate_time_buckets: False # Extract day of the week tokens from the Date: header. extract_dow: False # If true, replace high-bit characters (ord(c) >= 128) and control characters # with question marks. This allows non-ASCII character strings to be # identified with little training and small database burden. It's appropriate # only if your ham is plain 7-bit ASCII, or nearly so, so that the mere # presence of non-ASCII character strings is known in advance to be a strong # spam indicator. replace_nonascii_chars: False [TestDriver] # These control various displays in class TestDriver.Driver, and Tester.Test. # spam_cutoff and ham_cutoff are used in Python slice sense: # A msg is considered ham if its score is in 0:ham_cutoff # A msg is considered unsure if its score is in ham_cutoff:spam_cutoff # A msg is considered spam if its score is in spam_cutoff: # # So it's unsure iff ham_cutoff <= score < spam_cutoff. # For a binary classifier, make ham_cutoff == spam_cutoff. # ham_cutoff > spam_cutoff doesn't make sense. # # The defaults here (.2 and .9) may be appropriate for the default chi- # combining scheme. Cutoffs for chi-combining typically aren't touchy, # provided you're willing to settle for "really good" instead of "optimal".
# Tim found that .3 and .8 worked very well for well-trained systems on # his personal email, and his large comp.lang.python test. If just beginning # training, or extremely fearful of mistakes, 0.05 and 0.95 may be more # appropriate for you. # # Picking good values for gary-combining is much harder, and appears to be # corpus-dependent, and within a single corpus dependent on how much # training has been done. Values from 0.50 thru the low 0.60's have been # reported to work best by various testers on their data. ham_cutoff: 0.20 spam_cutoff: 0.90 # Number of buckets in histograms. nbuckets: 200 show_histograms: True # After the display of a ham+spam histogram pair, you can get a listing of # all the cutoff values (coinciding with histogram bucket boundaries) that # minimize # # best_cutoff_fp_weight * (# false positives) + # best_cutoff_fn_weight * (# false negatives) + # best_cutoff_unsure_weight * (# unsure msgs) # # This displays two cutoffs: hamc and spamc, where # # 0.0 <= hamc <= spamc <= 1.0 # # The idea is that if something scores < hamc, it's called ham; if # something scores >= spamc, it's called spam; and everything else is # called 'I am not sure' -- the middle ground. # # Note: You may wish to increase nbuckets, to give this scheme more cutoff # values to analyze. compute_best_cutoffs_from_histograms: True best_cutoff_fp_weight: 10.00 best_cutoff_fn_weight: 1.00 best_cutoff_unsure_weight: 0.20 # Histogram analysis also displays percentiles. For each percentile p # in the list, the score S such that p% of all scores are <= S is given. # Note that percentile 50 is the median, and is displayed (along with the # min score and max score) independent of this option. percentiles: 5 25 75 95 # Display spam when # show_spam_lo <= spamprob <= show_spam_hi # and likewise for ham. The defaults here do not show anything. 
show_spam_lo: 1.0 show_spam_hi: 0.0 show_ham_lo: 1.0 show_ham_hi: 0.0 show_false_positives: True show_false_negatives: False show_unsure: False # The maximum # of characters to display for a msg displayed due to the # show_xyz options above. show_charlimit: 3000 # If save_trained_pickles is true, Driver.train() saves a binary pickle # of the classifier after training. The file basename is given by # pickle_basename, the extension is .pik, and increasing integers are # appended to pickle_basename. By default (if save_trained_pickles is # true), the filenames are class1.pik, class2.pik, ... If a file of that # name already exists, it is overwritten. pickle_basename is ignored when # save_trained_pickles is false. # if save_histogram_pickles is true, Driver.train() saves a binary # pickle of the spam and ham histogram for "all test runs". The file # basename is given by pickle_basename, the suffix _spamhist.pik # or _hamhist.pik is appended to the basename. save_trained_pickles: False pickle_basename: class save_histogram_pickles: False # default locations for timcv and timtest - these get the set number # interpolated. spam_directories: Data/Spam/Set%d ham_directories: Data/Ham/Set%d [CV Driver] # A cross-validation driver takes N ham+spam sets, and builds N classifiers, # training each on N-1 sets, and the predicting against the set not trained # on. By default, it does this in a clever way, learning *and* unlearning # sets as it goes along, so that it never needs to train on N-1 sets in one # gulp after the first time. Setting this option true forces ''one gulp # from-scratch'' training every time. There used to be a set of combining # schemes that needed this, but now it is just in case you are paranoid . build_each_classifier_from_scratch: False [Classifier] # The maximum number of extreme words to look at in a msg, where "extreme" # means with spamprob farthest away from 0.5. 150 appears to work well # across all corpora tested. 
max_discriminators: 150 # These two control the prior assumption about word probabilities. # unknown_word_prob is essentially the probability given to a word that # has never been seen before. Nobody has reported an improvement via moving # it away from 1/2, although Tim has measured a mean spamprob of a bit over # 0.5 (0.51-0.55) in 3 well-trained classifiers. # # unknown_word_strength adjusts how much weight to give the prior assumption # relative to the probabilities estimated by counting. At 0, the counting # estimates are believed 100%, even to the extent of assigning certainty # (0 or 1) to a word that has appeared in only ham or only spam. This # is a disaster. # # As unknown_word_strength tends toward infinity, all probabilities tend # toward unknown_word_prob. All reports were that a value near 0.4 worked # best, so this does not seem to be corpus-dependent. unknown_word_prob: 0.5 unknown_word_strength: 0.45 # When scoring a message, ignore all words with # abs(word.spamprob - 0.5) < minimum_prob_strength. # This may be a hack, but it has proved to reduce error rates in many # tests. 0.1 appeared to work well across all corpora. minimum_prob_strength: 0.1 # The combining scheme currently detailed on the Robinson web page. # The middle ground here is touchy, varying across corpus, and within # a corpus across amounts of training data. It almost never gives extreme # scores (near 0.0 or 1.0), but the tail ends of the ham and spam # distributions overlap. use_gary_combining: False # For vectors of random, uniformly distributed probabilities, -2*sum(ln(p_i)) # follows the chi-squared distribution with 2*n degrees of freedom. This is # the "provably most-sensitive" test the original scheme was monotonic # with. Getting closer to the theoretical basis appears to give an excellent # combining method, usually very extreme in its judgment, yet finding a tiny # (in # of msgs, spread across a huge range of scores) middle ground where # lots of the mistakes live.
This is the best method so far. # One systematic benefit is immunity to "cancellation disease". One # systematic drawback is sensitivity to *any* deviation from a # uniform distribution, regardless of whether it is actually evidence of # ham or spam. Rob Hooft alleviated that by combining the final S and H # measures via (S-H+1)/2 instead of via S/(S+H). # In practice, it appears that setting ham_cutoff=0.05, and spam_cutoff=0.95, # does well across test sets; while these cutoffs are rarely optimal, they # get close to optimal. With more training data, Tim has had good luck # with ham_cutoff=0.30 and spam_cutoff=0.80 across three test data sets # (original c.l.p data, his own email, and newer general python.org traffic). use_chi_squared_combining: True # If the # of ham and spam in training data are out of balance, the # spamprob guesses can get stronger in the direction of the category with # more training msgs. In one sense this must be so, since the more data # we have of one flavor, the more we know about that flavor. But that # allows the accidental appearance of a strong word of that flavor in a msg # of the other flavor much more power than an accident in the other # direction. Enable experimental_ham_spam_imbalance_adjustment if you have # more ham than spam training data (or more spam than ham), and the # Bayesian probability adjustment won't 'believe' raw counts more than # min(# ham trained on, # spam trained on) justifies. I *expect* this # option will go away (and become the default), but people *with* strong # imbalance need to test it first. experimental_ham_spam_imbalance_adjustment: False [Hammie] # The name of the header that hammie adds to an E-mail in filter mode # It contains the "classification" of the mail, plus the score.
hammie_header_name: X-Spambayes-Classification # The three disposition names are added to the header as the following # Three words: header_spam_string: spam header_ham_string: ham header_unsure_string: unsure # Accuracy of the score in the header in decimal digits header_score_digits: 2 # Set this to "True", to augment scores of 1.00 or 0.00 by a logarithmic # "one-ness" or "zero-ness" score (basically it shows the "number of zeros" # or "number of nines" next to the score value). header_score_logarithm: False # Enable debugging information in the header. hammie_debug_header: False # Name of a debugging header for spambayes hackers, showing the strongest # clues that have resulted in the classification in the standard header. hammie_debug_header_name: X-Hammie-Debug # The range of clues that are added to the "debug" header in the E-mail # All clues that have their probability smaller than this number, or larger # than one minus this number are added to the header such that you can see # why spambayes thinks this is ham/spam or why it is unsure. The default is # to show all clues, but you can reduce that by setting showclue to a lower # value, such as 0.1 clue_mailheader_cutoff: 0.5 [hammiefilter] # hammiefilter can use either a database (quick to score one message) or # a pickle (quick to train on huge amounts of messages). Set this to # True to use a database by default. hammiefilter_persistent_use_database: True hammiefilter_persistent_storage_file: ~/.hammiedb [pop3proxy] # pop3proxy settings - pop3proxy also respects the options in the Hammie # section, with the exception of the extra header details at the moment. # The only mandatory option is pop3proxy_servers, eg. "pop3.my-isp.com:110", # or a comma-separated list of those. The ":110" is optional. If you # specify more than one server in pop3proxy_servers, you must specify the # same number of ports in pop3proxy_ports. 
pop3proxy_servers: pop3proxy_ports: pop3proxy_cache_use_gzip: False pop3proxy_cache_expiry_days: 7 pop3proxy_spam_cache: pop3proxy-spam-cache pop3proxy_ham_cache: pop3proxy-ham-cache pop3proxy_unknown_cache: pop3proxy-unknown-cache pop3proxy_persistent_use_database: False pop3proxy_persistent_storage_file: hammie.db # Deprecated - use pop3proxy_servers and pop3proxy_ports instead. pop3proxy_server_name: pop3proxy_server_port: 110 pop3proxy_port: 110 [html_ui] html_ui_port: 8880 html_ui_launch_browser: False [globals] verbose: False # What DBM storage type should we use? Must be best, db3hash, dbhash, # gdbm, dumbdbm. Windows folk should steer clear of dbhash. Default is # "best", which will pick the best DBM type available on your platform. dbm_type: best """ int_cracker = ('getint', None) float_cracker = ('getfloat', None) boolean_cracker = ('getboolean', bool) string_cracker = ('get', None) all_options = { 'Tokenizer': {'safe_headers': ('get', lambda s: Set(s.split())), 'address_headers': ('get', lambda s: Set(s.split())), 'count_all_header_lines': boolean_cracker, 'record_header_absence': boolean_cracker, 'generate_long_skips': boolean_cracker, 'summarize_email_prefixes': boolean_cracker, 'skip_max_word_size': int_cracker, 'extract_dow': boolean_cracker, 'generate_time_buckets': boolean_cracker, 'mine_received_headers': boolean_cracker, 'check_octets': boolean_cracker, 'octet_prefix_size': int_cracker, 'basic_header_tokenize': boolean_cracker, 'basic_header_tokenize_only': boolean_cracker, 'basic_header_skip': ('get', lambda s: Set(s.split())), 'replace_nonascii_chars': boolean_cracker, }, 'TestDriver': {'nbuckets': int_cracker, 'show_ham_lo': float_cracker, 'show_ham_hi': float_cracker, 'show_spam_lo': float_cracker, 'show_spam_hi': float_cracker, 'show_false_positives': boolean_cracker, 'show_false_negatives': boolean_cracker, 'show_unsure': boolean_cracker, 'show_histograms': boolean_cracker, 'percentiles': ('get', lambda s: map(float, s.split())), 
'save_trained_pickles': boolean_cracker, 'save_histogram_pickles': boolean_cracker, 'pickle_basename': string_cracker, 'show_charlimit': int_cracker, 'ham_cutoff': float_cracker, 'spam_cutoff': float_cracker, 'spam_directories': string_cracker, 'ham_directories': string_cracker, 'compute_best_cutoffs_from_histograms': boolean_cracker, 'best_cutoff_fp_weight': float_cracker, 'best_cutoff_fn_weight': float_cracker, 'best_cutoff_unsure_weight': float_cracker, }, 'CV Driver': {'build_each_classifier_from_scratch': boolean_cracker, }, 'Classifier': {'max_discriminators': int_cracker, 'unknown_word_prob': float_cracker, 'unknown_word_strength': float_cracker, 'minimum_prob_strength': float_cracker, 'use_gary_combining': boolean_cracker, 'use_chi_squared_combining': boolean_cracker, 'experimental_ham_spam_imbalance_adjustment': boolean_cracker, }, 'Hammie': {'hammie_header_name': string_cracker, 'clue_mailheader_cutoff': float_cracker, 'persistent_use_database': boolean_cracker, 'header_spam_string': string_cracker, 'header_unsure_string': string_cracker, 'header_ham_string': string_cracker, 'header_score_digits': int_cracker, 'header_score_logarithm': boolean_cracker, 'hammie_debug_header': boolean_cracker, 'hammie_debug_header_name': string_cracker, }, 'hammiefilter' : {'hammiefilter_persistent_use_database': boolean_cracker, 'hammiefilter_persistent_storage_file': string_cracker, }, 'pop3proxy': {'pop3proxy_servers': string_cracker, 'pop3proxy_ports': string_cracker, 'pop3proxy_server_name': string_cracker, 'pop3proxy_server_port': int_cracker, 'pop3proxy_port': int_cracker, 'pop3proxy_cache_use_gzip': boolean_cracker, 'pop3proxy_cache_expiry_days': int_cracker, 'pop3proxy_spam_cache': string_cracker, 'pop3proxy_ham_cache': string_cracker, 'pop3proxy_unknown_cache': string_cracker, 'pop3proxy_persistent_use_database': boolean_cracker, 'pop3proxy_persistent_storage_file': string_cracker, }, 'html_ui': {'html_ui_port': int_cracker, 'html_ui_launch_browser': 
boolean_cracker, }, 'globals': {'verbose': boolean_cracker, 'dbm_type': string_cracker, }, } def _warn(msg): print >> sys.stderr, msg class OptionsClass(object): def __init__(self): self._config = ConfigParser.ConfigParser() def mergefiles(self, fnamelist): self._config.read(fnamelist) self._update() def mergefilelike(self, filelike): self._config.readfp(filelike) self._update() def _update(self): nerrors = 0 c = self._config for section in c.sections(): if section not in all_options: _warn("config file has unknown section %r" % section) nerrors += 1 continue goodopts = all_options[section] for option in c.options(section): if option not in goodopts: _warn("config file has unknown option %r in " "section %r" % (option, section)) nerrors += 1 continue fetcher, converter = goodopts[option] value = getattr(c, fetcher)(section, option) if converter is not None: value = converter(value) setattr(options, option, value) if nerrors: raise ValueError("errors while parsing .ini file") def display(self): output = StringIO.StringIO() self._config.write(output) return output.getvalue() options = OptionsClass() d = StringIO.StringIO(defaults) options.mergefilelike(d) del d alternate = None if hasattr(os, 'getenv'): alternate = os.getenv('BAYESCUSTOMIZE') if alternate: options.mergefiles(alternate.split()) else: options.mergefiles(['bayescustomize.ini']) --- NEW FILE: TestDriver.py --- # Loop: # Optional: # # Set up a new base classifier for testing. # new_classifier(), or set_classifier() # # Run tests against (possibly variants of) this classifier. # Loop: # Loop: # Optional: # # train on more ham and spam # train(ham, spam) # Optional: # # Forget training for some subset of ham and spam. # untrain(ham, spam) # # Predict against other data. # Loop: # test(ham, spam) # # Display stats against all runs on this classifier variant. # # This also saves the trained classifer, if desired (option # # save_trained_pickles). # finishtest() # # Display stats against all runs. 
# alldone() from sets import Set import cPickle as pickle from heapq import heapreplace from spambayes.Options import options from spambayes import Tester from spambayes import classifier from spambayes.Histogram import Hist try: True, False except NameError: # Maintain compatibility with Python 2.2 True, False = 1, 0 def printhist(tag, ham, spam, nbuckets=options.nbuckets): print print "-> Ham scores for", tag, ham.display(nbuckets) print print "-> Spam scores for", tag, spam.display(nbuckets) if not options.compute_best_cutoffs_from_histograms: return if ham.n == 0 or spam.n == 0: return # Figure out "the best" ham & spam cutoff points, meaning the ones that # minimize # num_fp * fp_weight + num_fn + fn_weight + num_unsure * unsure_weight # the total number of misclassified msgs (other definitions are # certainly possible!). # At cutoff 0, everything is called spam, so there are no false negatives, # and every ham is a false positive. assert ham.nbuckets == spam.nbuckets n = ham.nbuckets FPW = options.best_cutoff_fp_weight FNW = options.best_cutoff_fn_weight UNW = options.best_cutoff_unsure_weight # Get running totals: {h,s}total[i] is # of ham/spam below bucket i htotal = [0] * (n+1) stotal = [0] * (n+1) for i in range(1, n+1): htotal[i] = htotal[i-1] + ham.buckets[i-1] stotal[i] = stotal[i-1] + spam.buckets[i-1] assert htotal[-1] == ham.n assert stotal[-1] == spam.n best_cost = 1e200 # infinity bests = [] # best h and s cutoffs for h in range(n+1): num_fn = stotal[h] fn_cost = num_fn * FNW for s in xrange(h, n+1): # ham 0:h correct # h:s unsure # s: FP # spam 0:h FN # h:s unsure # s: correct num_fp = htotal[-1] - htotal[s] num_un = htotal[s] - htotal[h] + stotal[s] - stotal[h] cost = num_fp * FPW + fn_cost + num_un * UNW if cost <= best_cost: if cost < best_cost: best_cost = cost bests = [] bests.append((h, s)) print '-> best cost for %s $%.2f' % (tag, best_cost) print '-> per-fp cost $%.2f; per-fn cost $%.2f; per-unsure cost $%.2f' % ( FPW, FNW, UNW) if 
len(bests) > 1: print '-> achieved at', len(bests), 'cutoff pairs' info = [('smallest ham & spam cutoffs', bests[0]), ('largest ham & spam cutoffs', bests[-1])] else: info = [('achieved at ham & spam cutoffs', bests[0])] for tag, (h, s) in info: print '-> %s %g & %g' % (tag, float(h)/n, float(s)/n) num_fn = stotal[h] num_fp = htotal[-1] - htotal[s] num_unh = htotal[s] - htotal[h] num_uns = stotal[s] - stotal[h] print '-> fp %d; fn %d; unsure ham %d; unsure spam %d' % ( num_fp, num_fn, num_unh, num_uns) print '-> fp rate %.3g%%; fn rate %.3g%%; unsure rate %.3g%%' % ( num_fp*1e2 / ham.n, num_fn*1e2 / spam.n, (num_unh + num_uns)*1e2 / (ham.n + spam.n)) return float(bests[0][0])/n,float(bests[0][1])/n def printmsg(msg, prob, clues): print msg.tag print "prob =", prob for clue in clues: print "prob(%r) = %g" % clue print guts = str(msg) if options.show_charlimit > 0: guts = guts[:options.show_charlimit] print guts class Driver: def __init__(self): self.falsepos = Set() self.falseneg = Set() self.unsure = Set() self.global_ham_hist = Hist() self.global_spam_hist = Hist() self.ntimes_finishtest_called = 0 self.new_classifier() from spambayes import CostCounter self.cc=CostCounter.default() def new_classifier(self): """Create and use a new, virgin classifier.""" self.set_classifier(classifier.Bayes()) def set_classifier(self, classifier): """Specify a classifier to be used for further testing.""" self.classifier = classifier self.tester = Tester.Test(classifier) self.trained_ham_hist = Hist() self.trained_spam_hist = Hist() def train(self, ham, spam): print "-> Training on", ham, "&", spam, "...", c = self.classifier nham, nspam = c.nham, c.nspam self.tester.train(ham, spam) print c.nham - nham, "hams &", c.nspam- nspam, "spams" def untrain(self, ham, spam): print "-> Forgetting", ham, "&", spam, "...", c = self.classifier nham, nspam = c.nham, c.nspam self.tester.untrain(ham, spam) print nham - c.nham, "hams &", nspam - c.nspam, "spams" def finishtest(self): if 
options.show_histograms: printhist("all in this training set:", self.trained_ham_hist, self.trained_spam_hist) self.global_ham_hist += self.trained_ham_hist self.global_spam_hist += self.trained_spam_hist self.trained_ham_hist = Hist() self.trained_spam_hist = Hist() self.ntimes_finishtest_called += 1 if options.save_trained_pickles: fname = "%s%d.pik" % (options.pickle_basename, self.ntimes_finishtest_called) print " saving pickle to", fname fp = file(fname, 'wb') pickle.dump(self.classifier, fp, 1) fp.close() def alldone(self): if options.show_histograms: besthamcut,bestspamcut = printhist("all runs:", self.global_ham_hist, self.global_spam_hist) else: besthamcut = options.ham_cutoff bestspamcut = options.spam_cutoff nham = self.global_ham_hist.n nspam = self.global_spam_hist.n nfp = len(self.falsepos) nfn = len(self.falseneg) nun = len(self.unsure) print "-> all runs false positives:", nfp print "-> all runs false negatives:", nfn print "-> all runs unsure:", nun print "-> all runs false positive %:", (nfp * 1e2 / nham) print "-> all runs false negative %:", (nfn * 1e2 / nspam) print "-> all runs unsure %:", (nun * 1e2 / (nham + nspam)) print "-> all runs cost: $%.2f" % ( nfp * options.best_cutoff_fp_weight + nfn * options.best_cutoff_fn_weight + nun * options.best_cutoff_unsure_weight) # Set back the options for the delayed calculations in self.cc options.ham_cutoff = besthamcut options.spam_cutoff = bestspamcut print self.cc if options.save_histogram_pickles: for f, h in (('ham', self.global_ham_hist), ('spam', self.global_spam_hist)): fname = "%s_%shist.pik" % (options.pickle_basename, f) print " saving %s histogram pickle to %s" %(f, fname) fp = file(fname, 'wb') pickle.dump(h, fp, 1) fp.close() def test(self, ham, spam): c = self.classifier t = self.tester local_ham_hist = Hist() local_spam_hist = Hist() def new_ham(msg, prob, lo=options.show_ham_lo, hi=options.show_ham_hi): local_ham_hist.add(prob * 100.0) self.cc.ham(prob) if lo <= prob <= hi: print print 
"Ham with prob =", prob prob, clues = c.spamprob(msg, True) printmsg(msg, prob, clues) def new_spam(msg, prob, lo=options.show_spam_lo, hi=options.show_spam_hi): local_spam_hist.add(prob * 100.0) self.cc.spam(prob) if lo <= prob <= hi: print print "Spam with prob =", prob prob, clues = c.spamprob(msg, True) printmsg(msg, prob, clues) t.reset_test_results() print "-> Predicting", ham, "&", spam, "..." t.predict(spam, True, new_spam) t.predict(ham, False, new_ham) print "-> tested", t.nham_tested, "hams &", t.nspam_tested, \ "spams against", c.nham, "hams &", c.nspam, "spams" print "-> false positive %:", t.false_positive_rate() print "-> false negative %:", t.false_negative_rate() print "-> unsure %:", t.unsure_rate() print "-> cost: $%.2f" % ( t.nham_wrong * options.best_cutoff_fp_weight + t.nspam_wrong * options.best_cutoff_fn_weight + (t.nham_unsure + t.nspam_unsure) * options.best_cutoff_unsure_weight) newfpos = Set(t.false_positives()) - self.falsepos self.falsepos |= newfpos print "-> %d new false positives" % len(newfpos) if newfpos: print " new fp:", [e.tag for e in newfpos] if not options.show_false_positives: newfpos = () for e in newfpos: print '*' * 78 prob, clues = c.spamprob(e, True) printmsg(e, prob, clues) newfneg = Set(t.false_negatives()) - self.falseneg self.falseneg |= newfneg print "-> %d new false negatives" % len(newfneg) if newfneg: print " new fn:", [e.tag for e in newfneg] if not options.show_false_negatives: newfneg = () for e in newfneg: print '*' * 78 prob, clues = c.spamprob(e, True) printmsg(e, prob, clues) newunsure = Set(t.unsures()) - self.unsure self.unsure |= newunsure print "-> %d new unsure" % len(newunsure) if newunsure: print " new unsure:", [e.tag for e in newunsure] if not options.show_unsure: newunsure = () for e in newunsure: print '*' * 78 prob, clues = c.spamprob(e, True) printmsg(e, prob, clues) if options.show_histograms: printhist("this pair:", local_ham_hist, local_spam_hist) self.trained_ham_hist += local_ham_hist 
self.trained_spam_hist += local_spam_hist --- NEW FILE: Tester.py --- from spambayes.Options import options try: True, False except NameError: # Maintain compatibility with Python 2.2 True, False = 1, 0 class Test: # Pass a classifier instance (an instance of Bayes). # Loop: # # Train the classifer with new ham and spam. # train(ham, spam) # this implies reset_test_results # Loop: # Optional: # # Possibly fiddle the classifier. # set_classifier() # # Forget smessages the classifier was trained on. # untrain(ham, spam) # this implies reset_test_results # Optional: # reset_test_results() # # Predict against (presumably new) examples. # predict(ham, spam) # Optional: # suck out the results, via instance vrbls and # false_negative_rate(), false_positive_rate(), # false_negatives(), and false_positives() def __init__(self, classifier): self.set_classifier(classifier) self.reset_test_results() # Tell the tester which classifier to use. def set_classifier(self, classifier): self.classifier = classifier def reset_test_results(self): # The number of ham and spam instances tested. self.nham_tested = self.nspam_tested = 0 # The number of test instances correctly and incorrectly classified. self.nham_right = 0 self.nham_wrong = 0 self.nham_unsure = 0; self.nspam_right = 0 self.nspam_wrong = 0 self.nspam_unsure = 0; # Lists of bad predictions. self.ham_wrong_examples = [] # False positives: ham called spam. self.spam_wrong_examples = [] # False negatives: spam called ham. self.unsure_examples = [] # ham and spam in middle ground # Train the classifier on streams of ham and spam. Updates probabilities # before returning, and resets test results. def train(self, hamstream=None, spamstream=None): self.reset_test_results() learn = self.classifier.learn if hamstream is not None: for example in hamstream: learn(example, False) if spamstream is not None: for example in spamstream: learn(example, True) # Untrain the classifier on streams of ham and spam. 
Updates # probabilities before returning, and resets test results. def untrain(self, hamstream=None, spamstream=None): self.reset_test_results() unlearn = self.classifier.unlearn if hamstream is not None: for example in hamstream: unlearn(example, False) if spamstream is not None: for example in spamstream: unlearn(example, True) # Run prediction on each sample in stream. You're swearing that stream # is entirely composed of spam (is_spam True), or of ham (is_spam False). # Note that mispredictions are saved, and can be retrieved later via # false_negatives (spam mistakenly called ham) and false_positives (ham # mistakenly called spam). For this reason, you may wish to wrap examples # in a little class that identifies the example in a useful way, and whose # __iter__ produces a token stream for the classifier. # # If specified, callback(msg, spam_probability) is called for each # msg in the stream, after the spam probability is computed. def predict(self, stream, is_spam, callback=None): guess = self.classifier.spamprob for example in stream: prob = guess(example) if callback: callback(example, prob) is_ham_guessed = prob < options.ham_cutoff is_spam_guessed = prob >= options.spam_cutoff if is_spam: self.nspam_tested += 1 if is_spam_guessed: self.nspam_right += 1 elif is_ham_guessed: self.nspam_wrong += 1 self.spam_wrong_examples.append(example) else: self.nspam_unsure += 1 self.unsure_examples.append(example) else: self.nham_tested += 1 if is_ham_guessed: self.nham_right += 1 elif is_spam_guessed: self.nham_wrong += 1 self.ham_wrong_examples.append(example) else: self.nham_unsure += 1 self.unsure_examples.append(example) assert (self.nham_right + self.nham_wrong + self.nham_unsure == self.nham_tested) assert (self.nspam_right + self.nspam_wrong + self.nspam_unsure == self.nspam_tested) def false_positive_rate(self): """Percentage of ham mistakenly identified as spam, in 0.0..100.0.""" return self.nham_wrong * 1e2 / (self.nham_tested or 1) def 
false_negative_rate(self): """Percentage of spam mistakenly identified as ham, in 0.0..100.0.""" return self.nspam_wrong * 1e2 / (self.nspam_tested or 1) def unsure_rate(self): return ((self.nham_unsure + self.nspam_unsure) * 1e2 / ((self.nham_tested + self.nspam_tested) or 1)) def false_positives(self): return self.ham_wrong_examples def false_negatives(self): return self.spam_wrong_examples def unsures(self): return self.unsure_examples class _Example: def __init__(self, name, words): self.name = name self.words = words def __iter__(self): return iter(self.words) _easy_test = """ >>> from spambayes.classifier import Bayes >>> from spambayes.Options import options >>> options.ham_cutoff = options.spam_cutoff = 0.5 >>> good1 = _Example('', ['a', 'b', 'c']) >>> good2 = _Example('', ['a', 'b']) >>> bad1 = _Example('', ['c', 'd']) >>> t = Test(Bayes()) >>> t.train([good1, good2], [bad1]) >>> t.predict([_Example('goodham', ['a', 'b']), ... _Example('badham', ['d']) # FP ... ], False) >>> t.predict([_Example('goodspam', ['d']), ... _Example('badspam1', ['a']), # FN ... _Example('badspam2', ['a', 'b']), # FN ... _Example('badspam3', ['d', 'a', 'b']) # FN ... ], True) >>> t.nham_tested 2 >>> t.nham_right, t.nham_wrong (1, 1) >>> t.false_positive_rate() 50.0 >>> [e.name for e in t.false_positives()] ['badham'] >>> t.nspam_tested 4 >>> t.nspam_right, t.nspam_wrong (1, 3) >>> t.false_negative_rate() 75.0 >>> [e.name for e in t.false_negatives()] ['badspam1', 'badspam2', 'badspam3'] >>> [e.name for e in t.unsures()] [] >>> t.unsure_rate() 0.0 """ __test__ = {'easy': _easy_test} def _test(): import doctest, Tester doctest.testmod(Tester) if __name__ == '__main__': _test() --- NEW FILE: __init__.py --- # package marker. --- NEW FILE: cdb.py --- #! 
/usr/bin/env python """ Dan Bernstein's CDB implemented in Python see http://cr.yp.to/cdb.html """ from __future__ import generators import os import struct import mmap import sys def uint32_unpack(buf): return struct.unpack('>= 8 u %= self.hslots u <<= 3 self.kpos = self.hpos + u while self.loop < self.hslots: buf = self.read(8, self.kpos) pos = uint32_unpack(buf[4:]) if not pos: raise KeyError self.loop += 1 self.kpos += 8 if self.kpos == self.hpos + (self.hslots << 3): self.kpos = self.hpos u = uint32_unpack(buf[:4]) if u == self.khash: buf = self.read(8, pos) u = uint32_unpack(buf[:4]) if u == len(key): if self.match(key, pos + 8): dlen = uint32_unpack(buf[4:]) dpos = pos + 8 + len(key) return self.read(dlen, dpos) raise KeyError def __getitem__(self, key): self.findstart() return self.findnext(key) def get(self, key, default=None): self.findstart() try: return self.findnext(key) except KeyError: return default def cdb_dump(infile): """dump a database in djb's cdbdump format""" db = Cdb(infile) for key,value in db.iteritems(): print "+%d,%d:%s->%s" % (len(key), len(value), key, value) print def cdb_make(outfile, items): pos = 2048 tables = {} # { h & 255 : [(h, p)] } # write keys and data outfile.seek(pos) for key, value in items: outfile.write(uint32_pack(len(key)) + uint32_pack(len(value))) h = cdb_hash(key) outfile.write(key) outfile.write(value) tables.setdefault(h & 255, []).append((h, pos)) pos += 8 + len(key) + len(value) final = '' # write hash tables for i in range(256): entries = tables.get(i, []) nslots = 2*len(entries) final += uint32_pack(pos) + uint32_pack(nslots) null = (0, 0) table = [null] * nslots for h, p in entries: n = (h >> 8) % nslots while table[n] is not null: n = (n + 1) % nslots table[n] = (h, p) for h, p in table: outfile.write(uint32_pack(h) + uint32_pack(p)) pos += 8 # write header (pointers to tables and their lengths) outfile.flush() outfile.seek(0) outfile.write(final) def test(): #db = Cdb(open("t")) #print db['one'] #print 
db['two'] #print db['foo'] #print db['us'] #print db.get('ec') #print db.get('notthere') db = open('test.cdb', 'wb') cdb_make(db, [('one', 'Hello'), ('two', 'Goodbye'), ('foo', 'Bar'), ('us', 'United States'), ]) db.close() db = Cdb(open("test.cdb", 'rb')) print db['one'] print db['two'] print db['foo'] print db['us'] print db.get('ec') print db.get('notthere') if __name__ == '__main__': test() --- NEW FILE: chi2.py --- import math as _math try: True, False except NameError: # Maintain compatibility with Python 2.2 True, False = 1, 0 def chi2Q(x2, v, exp=_math.exp, min=min): """Return prob(chisq >= x2, with v degrees of freedom). v must be even. """ assert v & 1 == 0 # XXX If x2 is very large, exp(-m) will underflow to 0. m = x2 / 2.0 sum = term = exp(-m) for i in range(1, v//2): term *= m / i sum += term # With small x2 and large v, accumulated roundoff error, plus error in # the platform exp(), can cause this to spill a few ULP above 1.0. For # example, chi2Q(100, 300) on my box has sum == 1.0 + 2.0**-52 at this # point. Returning a value even a teensy bit over 1.0 is no good. return min(sum, 1.0) def normZ(z, sqrt2pi=_math.sqrt(2.0*_math.pi), exp=_math.exp): "Return value of the unit Gaussian at z." return exp(-z*z/2.0) / sqrt2pi def normP(z): """Return area under the unit Gaussian from -inf to z. This is the probability that a zscore is <= z. """ # This is very accurate in a fixed-point sense. For negative z of # large magnitude (<= -8.3), it returns 0.0, essentially because # P(-z) is, to machine precision, indistiguishable from 1.0 then. # sum <- area from 0 to abs(z). a = abs(float(z)) if a >= 8.3: sum = 0.5 else: sum2 = term = a * normZ(a) z2 = a*a sum = 0.0 i = 1.0 while sum != sum2: sum = sum2 i += 2.0 term *= z2 / i sum2 += term if z >= 0: result = 0.5 + sum else: result = 0.5 - sum return result def normIQ(p, sqrt=_math.sqrt, ln=_math.log): """Return z such that the area under the unit Gaussian from z to +inf is p. Must have 0.0 <= p <= 1.0. 
""" assert 0.0 <= p <= 1.0 # This is a low-accuracy rational approximation from Abramowitz & Stegun. # The absolute error is bounded by 3e-3. flipped = False if p > 0.5: flipped = True p = 1.0 - p if p == 0.0: z = 8.3 else: t = sqrt(-2.0 * ln(p)) z = t - (2.30753 + .27061*t) / (1. + .99229*t + .04481*t**2) if flipped: z = -z return z def normIP(p): """Return z such that the area under the unit Gaussian from -inf to z is p. Must have 0.0 <= p <= 1.0. """ z = normIQ(1.0 - p) # One Newton step should double the # of good digits. return z + (p - normP(z)) / normZ(z) def main(): from spambayes.Histogram import Hist import sys class WrappedRandom: # There's no way W-H is equidistributed in 50 dimensions, so use # Marsaglia-wrapping to shuffle it more. def __init__(self, baserandom=random.random, tabsize=513): self.baserandom = baserandom self.n = tabsize self.tab = [baserandom() for i in range(tabsize)] self.next = baserandom() def random(self): result = self.next i = int(result * self.n) self.next = self.tab[i] self.tab[i] = self.baserandom() return result random = WrappedRandom().random #from uni import uni as random #print random def judge(ps, ln=_math.log, ln2=_math.log(2), frexp=_math.frexp): H = S = 1.0 Hexp = Sexp = 0 for p in ps: S *= 1.0 - p H *= p if S < 1e-200: S, e = frexp(S) Sexp += e if H < 1e-200: H, e = frexp(H) Hexp += e S = ln(S) + Sexp * ln2 H = ln(H) + Hexp * ln2 n = len(ps) S = 1.0 - chi2Q(-2.0 * S, 2*n) H = 1.0 - chi2Q(-2.0 * H, 2*n) return S, H, (S-H + 1.0) / 2.0 warp = 0 bias = 0.99 if len(sys.argv) > 1: warp = int(sys.argv[1]) if len(sys.argv) > 2: bias = float(sys.argv[2]) h = Hist(20, lo=0.0, hi=1.0) s = Hist(20, lo=0.0, hi=1.0) score = Hist(20, lo=0.0, hi=1.0) for i in range(5000): ps = [random() for j in range(50)] s1, h1, score1 = judge(ps + [bias] * warp) s.add(s1) h.add(h1) score.add(score1) print "Result for random vectors of 50 probs, +", warp, "forced to", bias # Should be uniformly distributed on all-random data. 
print print 'H', h.display() # Should be uniformly distributed on all-random data. print print 'S', s.display() # Distribution doesn't really matter. print print '(S-H+1)/2', score.display() def showscore(ps, ln=_math.log, ln2=_math.log(2), frexp=_math.frexp): H = S = 1.0 Hexp = Sexp = 0 for p in ps: S *= 1.0 - p H *= p if S < 1e-200: S, e = frexp(S) Sexp += e if H < 1e-200: H, e = frexp(H) Hexp += e S = ln(S) + Sexp * ln2 H = ln(H) + Hexp * ln2 n = len(ps) probS = chi2Q(-2*S, 2*n) probH = chi2Q(-2*H, 2*n) print "P(chisq >= %10g | v=%3d) = %10g" % (-2*S, 2*n, probS) print "P(chisq >= %10g | v=%3d) = %10g" % (-2*H, 2*n, probH) S = 1.0 - probS H = 1.0 - probH score = (S-H + 1.0) / 2.0 print "spam prob", S print " ham prob", H print "(S-H+1)/2", score if __name__ == '__main__': import random main() --- NEW FILE: classifier.py --- #! /usr/bin/env python # An implementation of a Bayes-like spam classifier. # # Paul Graham's original description: # # http://www.paulgraham.com/spam.html # # A highly fiddled version of that can be retrieved from our CVS repository, # via tag Last-Graham. This made many demonstrated improvements in error # rates over Paul's original description. # # This code implements Gary Robinson's suggestions, the core of which are # well explained on his webpage: # # http://radio.weblogs.com/0101454/stories/2002/09/16/spamDetection.html # # This is theoretically cleaner, and in testing has performed at least as # well as our highly tuned Graham scheme did, often slightly better, and # sometimes much better. It also has "a middle ground", which people like: # the scores under Paul's scheme were almost always very near 0 or very near # 1, whether or not the classification was correct. The false positives # and false negatives under Gary's basic scheme (use_gary_combining) generally # score in a narrow range around the corpus's best spam_cutoff value. # However, it doesn't appear possible to guess the best spam_cutoff value in # advance, and it's touchy. 
# # The chi-combining scheme used by default here gets closer to the theoretical # basis of Gary's combining scheme, and does give extreme scores, but also # has a very useful middle ground (small # of msgs spread across a large range # of scores, and good cutoff values aren't touchy). # # This implementation is due to Tim Peters et alia. import math from sets import Set from spambayes.Options import options from spambayes.chi2 import chi2Q try: True, False except NameError: # Maintain compatibility with Python 2.2 True, False = 1, 0 LN2 = math.log(2) # used frequently by chi-combining PICKLE_VERSION = 5 class WordInfo(object): # Invariant: For use in a classifier database, at least one of # spamcount and hamcount must be non-zero. def __init__(self): self.__setstate__((0, 0)) def __repr__(self): return "WordInfo%r" % repr((self.spamcount, self.hamcount)) def __getstate__(self): return (self.spamcount, self.hamcount) def __setstate__(self, t): (self.spamcount, self.hamcount) = t class Classifier: # Defining __slots__ here made Jeremy's life needlessly difficult when # trying to hook this all up to ZODB as a persistent object. There's # no space benefit worth getting from slots in this class; slots were # used solely to help catch errors earlier, when this code was changing # rapidly. #__slots__ = ('wordinfo', # map word to WordInfo record # 'nspam', # number of spam messages learn() has seen # 'nham', # number of non-spam messages learn() has seen # ) # allow a subclass to use a different class for WordInfo WordInfoClass = WordInfo def __init__(self): self.wordinfo = {} self.probcache = {} self.nspam = self.nham = 0 def __getstate__(self): return (PICKLE_VERSION, self.wordinfo, self.nspam, self.nham) def __setstate__(self, t): if t[0] != PICKLE_VERSION: raise ValueError("Can't unpickle -- version %s unknown" % t[0]) (self.wordinfo, self.nspam, self.nham) = t[1:] self.probcache = {} # spamprob() implementations. 
# One of the following is aliased to
    # spamprob, depending on option settings.

    def gary_spamprob(self, wordstream, evidence=False):
        """Return best-guess probability that wordstream is spam.

        wordstream is an iterable object producing words.
        The return value is a float in [0.0, 1.0].  It is exactly 0.5
        when self._getclues() yields no clues.

        If optional arg evidence is True, the return value is a pair
            probability, evidence
        where evidence is a list of (word, probability) pairs, sorted by
        increasing probability.
        """

        from math import frexp

        # This combination method is due to Gary Robinson; see
        # http://radio.weblogs.com/0101454/stories/2002/09/16/spamDetection.html

        # The real P = this P times 2**Pexp.  Likewise for Q.  We're
        # simulating unbounded dynamic float range by hand.  If this pans
        # out, *maybe* we should store logarithms in the database instead
        # and just add them here.  But I like keeping raw counts in the
        # database (they're easy to understand, manipulate and combine),
        # and there's no evidence that this simulation is a significant
        # expense.
        #
        # P accumulates the product of (1 - prob) and Q the product of prob
        # over all clues.
        P = Q = 1.0
        Pexp = Qexp = 0
        clues = self._getclues(wordstream)
        for prob, word, record in clues:
            P *= 1.0 - prob
            Q *= prob
            if P < 1e-200:  # move back into range
                P, e = frexp(P)
                Pexp += e
            if Q < 1e-200:  # move back into range
                Q, e = frexp(Q)
                Qexp += e

        # Split off the exponents one last time, so the root below is taken
        # of a mantissa and a power of 2 separately.
        P, e = frexp(P)
        Pexp += e
        Q, e = frexp(Q)
        Qexp += e

        num_clues = len(clues)
        if num_clues:
            #P = 1.0 - P**(1./num_clues)
            #Q = 1.0 - Q**(1./num_clues)
            #
            # (x*2**e)**n = x**n * 2**(e*n)
            #
            # IOW, P becomes 1 - (geometric mean of the 1-prob values) and
            # Q becomes 1 - (geometric mean of the prob values).
            n = 1.0 / num_clues
            P = 1.0 - P**n * 2.0**(Pexp * n)
            Q = 1.0 - Q**n * 2.0**(Qexp * n)

            # (P-Q)/(P+Q) is in -1 .. 1; scaling into 0 .. 1 gives
            # ((P-Q)/(P+Q)+1)/2 =
            # ((P-Q+P+Q)/(P+Q))/2 =
            # (2*P/(P+Q))/2 =
            # P/(P+Q)
            prob = P/(P+Q)
        else:
            prob = 0.5

        if evidence:
            # Drop the record from each clue and order by probability.
            clues = [(w, p) for p, w, r in clues]
            clues.sort(lambda a, b: cmp(a[1], b[1]))
            return prob, clues
        else:
            return prob

    if options.use_gary_combining:
        spamprob = gary_spamprob

    # Across vectors of length n, containing random uniformly-distributed
    # probabilities, -2*sum(ln(p_i)) follows the chi-squared distribution
    # with 2*n degrees of freedom.
# This has been proven (in some
    # appropriate sense) to be the most sensitive possible test for
    # rejecting the hypothesis that a vector of probabilities is uniformly
    # distributed.  Gary Robinson's original scheme was monotonic *with*
    # this test, but skipped the details.  Turns out that getting closer
    # to the theoretical roots gives a much sharper classification, with
    # a very small (in # of msgs), but also very broad (in range of scores),
    # "middle ground", where most of the mistakes live.  In particular,
    # this scheme seems immune to all forms of "cancellation disease":  if
    # there are many strong ham *and* spam clues, this reliably scores
    # close to 0.5.  Most other schemes are extremely certain then -- and
    # often wrong.
    def chi2_spamprob(self, wordstream, evidence=False):
        """Return best-guess probability that wordstream is spam.

        wordstream is an iterable object producing words.
        The return value is a float in [0.0, 1.0].  It is exactly 0.5
        when self._getclues() yields no clues.

        If optional arg evidence is True, the return value is a pair
            probability, evidence
        where evidence is a list of (word, probability) pairs, sorted by
        increasing probability and preceded by the two pseudo-clues
        ('*H*', H) and ('*S*', S), in that order.
        """

        from math import frexp, log as ln

        # We compute two chi-squared statistics, one for ham and one for
        # spam.  The sum-of-the-logs business is more sensitive to probs
        # near 0 than to probs near 1, so the spam measure uses 1-p (so
        # that high-spamprob words have greatest effect), and the ham
        # measure uses p directly (so that lo-spamprob words have greatest
        # effect).
        #
        # For optimization, sum-of-logs == log-of-product, and f.p.
        # multiplication is a lot cheaper than calling ln().  It's easy
        # to underflow to 0.0, though, so we simulate unbounded dynamic
        # range via frexp.  The real product H = this H * 2**Hexp, and
        # likewise the real product S = this S * 2**Sexp.
        H = S = 1.0
        Hexp = Sexp = 0

        clues = self._getclues(wordstream)
        for prob, word, record in clues:
            S *= 1.0 - prob
            H *= prob
            if S < 1e-200:  # prevent underflow
                S, e = frexp(S)
                Sexp += e
            if H < 1e-200:  # prevent underflow
                H, e = frexp(H)
                Hexp += e

        # Compute the natural log of the product = sum of the logs:
        # ln(x * 2**i) = ln(x) + i * ln(2).
        S = ln(S) + Sexp * LN2
        H = ln(H) + Hexp * LN2

        n = len(clues)
        if n:
            S = 1.0 - chi2Q(-2.0 * S, 2*n)
            H = 1.0 - chi2Q(-2.0 * H, 2*n)

            # How to combine these into a single spam score?  We originally
            # used (S-H)/(S+H) scaled into [0., 1.], which equals S/(S+H).  A
            # systematic problem is that we could end up being near-certain
            # a thing was (for example) spam, even if S was small, provided
            # that H was much smaller.
            # Rob Hooft stared at these problems and invented the measure
            # we use now, the simpler S-H, scaled into [0., 1.].
            prob = (S-H + 1.0) / 2.0
        else:
            # No clues:  S and H are still the log values computed above
            # (ln(1.0) + 0 * LN2), not probabilities.
            prob = 0.5

        if evidence:
            # Drop the record from each clue, order by probability, then
            # put the two summary values in front.
            clues = [(w, p) for p, w, r in clues]
            clues.sort(lambda a, b: cmp(a[1], b[1]))
            clues.insert(0, ('*S*', S))
            clues.insert(0, ('*H*', H))
            return prob, clues
        else:
            return prob

    if options.use_chi_squared_combining:
        spamprob = chi2_spamprob

    def learn(self, wordstream, is_spam):
        """Teach the classifier by example.

        wordstream is a word stream representing a message.  If is_spam is
        True, you're telling the classifier this message is definitely spam,
        else that it's definitely not spam.

        The work is delegated to self._add_msg().
        """

        self._add_msg(wordstream, is_spam)

    def unlearn(self, wordstream, is_spam):
        """In case of pilot error, call unlearn ASAP after screwing up.

        Pass the same arguments you passed to learn().

        The work is delegated to self._remove_msg().
        """

        self._remove_msg(wordstream, is_spam)

    def probability(self, record):
        """Compute, store, and return prob(msg is spam | msg contains word).

        This is the Graham calculation, but stripped of biases, and
        stripped of clamping into 0.01 thru 0.99.  The Bayesian
        adjustment following keeps them in a sane range, and one
        that naturally grows the more evidence there is to back up
        a probability.

        record supplies the word's spamcount and hamcount attributes.
        Results are cached in self.probcache, keyed first by spamcount
        and then by hamcount.
        """

        spamcount = record.spamcount
        hamcount = record.hamcount

        # Try the cache first
        try:
            return self.probcache[spamcount][hamcount]
        except KeyError:
            pass

        # A zero message count is treated as 1, which avoids dividing by 0.
        nham = float(self.nham or 1)
        nspam = float(self.nspam or 1)

        assert hamcount <= nham
        hamratio = hamcount / nham

        assert spamcount <= nspam
        spamratio = spamcount / nspam

        prob = spamratio / (hamratio + spamratio)

        if options.experimental_ham_spam_imbalance_adjustment:
            spam2ham = min(nspam / nham, 1.0)
            ham2spam = min(nham / nspam, 1.0)
        else:
            spam2ham = ham2spam = 1.0

        S = options.unknown_word_strength
        StimesX = S * options.unknown_word_prob

        # Now do Robinson's Bayesian adjustment.
        #
        #         s*x + n*p(w)
        # f(w) = --------------
        #           s + n
        #
        # I find this easier to reason about like so (equivalent when
        # s != 0):
        #
        #        x - p
        #  p +  -------
        #       1 + n/s
        #
        # IOW, it moves p a fraction of the distance from p to x, and
        # less so the larger n is, or the smaller s is.

        # Experimental:
        # Picking a good value for n is interesting:  how much empirical
        # evidence do we really have?  If nham == nspam,
        # hamcount + spamcount makes a lot of sense, and the code here
        # does that by default.
        # But if, e.g., nham is much larger than nspam, p(w) can get a
        # lot closer to 0.0 than it can get to 1.0.  That in turn makes
        # strong ham words (high hamcount) much stronger than strong
        # spam words (high spamcount), and that makes the accidental
        # appearance of a strong ham word in spam much more damaging than
        # the accidental appearance of a strong spam word in ham.
        # So we don't give hamcount full credit when nham > nspam (or
        # spamcount when nspam > nham):  instead we knock hamcount down
        # to what it would have been had nham been equal to nspam.  IOW,
        # we multiply hamcount by nspam/nham when nspam < nham; or, IOOW,
        # we don't "believe" any count to an extent more than
        # min(nspam, nham) justifies.

        n = hamcount * spam2ham + spamcount * ham2spam
        prob = (StimesX + n * prob) / (S + n)

        # Update the cache
        try:
            self.probcache[spamcount][hamcount] = prob
        except KeyError:
            # First entry for this spamcount:  create the inner dict.
            self.probcache[spamcount] = {hamcount: prob}

        return prob

    # NOTE:  Graham's scheme had a strange asymmetry:  when a word appeared
    # n>1 times in a single message, training added n to the word's hamcount
    # or spamcount, but predicting scored words only once.  Tests showed
    # that adding only 1 in training, or scoring more than once when
    # predicting, hurt under the Graham scheme.
    # This isn't so under Robinson's scheme, though:  results improve
    # if training also counts a word only once.  The mean ham score decreases
    # significantly and consistently, ham score variance decreases likewise,
    # mean spam score decreases (but less than mean ham score, so the spread
    # increases), and spam score variance increases.
    # I (Tim) speculate that adding n times under the Graham scheme helped
    # because it acted against the various ham biases, giving frequently
    # repeated spam words (like "Viagra") a quick ramp-up in spamprob; else,
    # adding only once in training, a word like that was simply ignored until
    # it appeared in 5 distinct training spams.  Without the ham-favoring
    # biases, though, and never ignoring words, counting n times introduces
    # a subtle and unhelpful bias.
    # There does appear to be some useful info in how many times a word
    # appears in a msg, but distorting spamprob doesn't appear a correct way
    # to exploit it.
def _add_msg(self, wordstream, is_spam): self.probcache = {} # nuke the prob cache if is_spam: self.nspam += 1 else: self.nham += 1 for word in Set(wordstream): record = self._wordinfoget(word) if record is None: record = self.WordInfoClass() if is_spam: record.spamcount += 1 else: record.hamcount += 1 self._wordinfoset(word, record) def _remove_msg(self, wordstream, is_spam): self.probcache = {} # nuke the prob cache if is_spam: if self.nspam <= 0: raise ValueError("spam count would go negative!") self.nspam -= 1 else: if self.nham <= 0: raise ValueError("non-spam count would go negative!") self.nham -= 1 for word in Set(wordstream): record = self._wordinfoget(word) if record is not None: if is_spam: if record.spamcount > 0: record.spamcount -= 1 else: if record.hamcount > 0: record.hamcount -= 1 if record.hamcount == 0 == record.spamcount: self._wordinfodel(word) else: self._wordinfoset(word, record) def _getclues(self, wordstream): mindist = options.minimum_prob_strength unknown = options.unknown_word_prob clues = [] # (distance, prob, word, record) tuples pushclue = clues.append for word in Set(wordstream): record = self._wordinfoget(word) if record is None: prob = unknown else: prob = self.probability(record) distance = abs(prob - 0.5) if distance >= mindist: pushclue((distance, prob, word, record)) clues.sort() if len(clues) > options.max_discriminators: del clues[0 : -options.max_discriminators] # Return (prob, word, record). return [t[1:] for t in clues] def _wordinfoget(self, word): return self.wordinfo.get(word) def _wordinfoset(self, word, record): self.wordinfo[word] = record def _wordinfodel(self, word): del self.wordinfo[word] Bayes = Classifier --- NEW FILE: dbmstorage.py --- """Wrapper to open an appropriate dbm storage type.""" from spambayes.Options import options import sys class error(Exception): pass def open_db3hash(*args): """Open a bsddb3 hash.""" import bsddb3 return bsddb3.hashopen(*args) def open_dbhash(*args): """Open a bsddb hash. 
Don't use this on Windows.""" import bsddb return bsddb.hashopen(*args) def open_gdbm(*args): """Open a gdbm database.""" import gdbm return gdbm.open(*args) def open_dumbdbm(*args): """Open a dumbdbm database.""" import dumbdbm return dumbdbm.open(*args) def open_best(*args): if sys.platform == "win32": funcs = [open_db3hash, open_gdbm, open_dumbdbm] else: funcs = [open_db3hash, open_dbhash, open_gdbm, open_dumbdbm] for f in funcs: try: return f(*args) except ImportError: pass raise error("No dbm modules available!") open_funcs = { "best": open_best, "db3hash": open_db3hash, "dbhash": open_dbhash, "gdbm": open_gdbm, "dumbdbm": open_dumbdbm, } def open(*args): dbm_type = options.dbm_type.lower() f = open_funcs.get(dbm_type) if not f: raise error("Unknown dbm type in options file") return f(*args) --- NEW FILE: hammie.py --- #! /usr/bin/env python from spambayes import mboxutils from spambayes import storage from spambayes.Options import options from spambayes.tokenizer import tokenize try: True, False except NameError: # Maintain compatibility with Python 2.2 True, False = 1, 0 class Hammie: """A spambayes mail filter. This implements the basic functionality needed to score, filter, or train. """ def __init__(self, bayes): self.bayes = bayes def _scoremsg(self, msg, evidence=False): """Score a Message. msg can be a string, a file object, or a Message object. Returns the probability the message is spam. If evidence is true, returns a tuple: (probability, clues), where clues is a list of the words which contributed to the score. """ return self.bayes.spamprob(tokenize(msg), evidence) def formatclues(self, clues, sep="; "): """Format the clues into something readable.""" return sep.join(["%r: %.2f" % (word, prob) for word, prob in clues if (word[0] == '*' or prob <= options.clue_mailheader_cutoff or prob >= 1.0 - options.clue_mailheader_cutoff)]) def score(self, msg, evidence=False): """Score (judge) a message. msg can be a string, a file object, or a Message object. 
Returns the probability the message is spam. If evidence is true, returns a tuple: (probability, clues), where clues is a list of the words which contributed to the score. """ return self._scoremsg(msg, evidence) def filter(self, msg, header=None, spam_cutoff=None, ham_cutoff=None, debugheader=None, debug=None): """Score (judge) a message and add a disposition header. msg can be a string, a file object, or a Message object. Optionally, set header to the name of the header to add, and/or spam_cutoff/ham_cutoff to the probability values which must be met or exceeded for a message to get a 'Spam' or 'Ham' classification. An extra debugging header can be added if 'debug' is set to True. The name of the debugging header is given as 'debugheader'. All defaults for optional parameters come from the Options file. Returns the same message with a new disposition header. """ if header == None: header = options.hammie_header_name if spam_cutoff == None: spam_cutoff = options.spam_cutoff if ham_cutoff == None: ham_cutoff = options.ham_cutoff if debugheader == None: debugheader = options.hammie_debug_header_name if debug == None: debug = options.hammie_debug_header msg = mboxutils.get_message(msg) try: del msg[header] except KeyError: pass prob, clues = self._scoremsg(msg, True) if prob < ham_cutoff: disp = options.header_ham_string elif prob > spam_cutoff: disp = options.header_spam_string else: disp = options.header_unsure_string disp += ("; %."+str(options.header_score_digits)+"f") % prob if options.header_score_logarithm: if prob<=0.005 and prob>0.0: import math x=-math.log10(prob) disp += " (%d)"%x if prob>=0.995 and prob<1.0: import math x=-math.log10(1.0-prob) disp += " (%d)"%x msg.add_header(header, disp) if debug: disp = self.formatclues(clues) msg.add_header(debugheader, disp) return msg.as_string(unixfrom=(msg.get_unixfrom() is not None)) def train(self, msg, is_spam): """Train bayes with a message. msg can be a string, a file object, or a Message object. 
is_spam should be 1 if the message is spam, 0 if not. """ self.bayes.learn(tokenize(msg), is_spam) def untrain(self, msg, is_spam): """Untrain bayes with a message. msg can be a string, a file object, or a Message object. is_spam should be 1 if the message is spam, 0 if not. """ self.bayes.unlearn(tokenize(msg), is_spam) def train_ham(self, msg): """Train bayes with ham. msg can be a string, a file object, or a Message object. """ self.train(msg, False) def train_spam(self, msg): """Train bayes with spam. msg can be a string, a file object, or a Message object. """ self.train(msg, True) def untrain_ham(self, msg): """Untrain bayes with ham. msg can be a string, a file object, or a Message object. """ self.untrain(msg, False) def train_spam(self, msg): """Untrain bayes with spam. msg can be a string, a file object, or a Message object. """ self.untrain(msg, True) def store(self): """Write out the persistent store. This makes sure the persistent store reflects what is currently in memory. You would want to do this after a write and before exiting. """ self.bayes.store() def open(filename, usedb=True, mode='r'): """Open a file, returning a Hammie instance. If usedb is False, open as a pickle instead of a DBDict. mode is used as the flag to open DBDict objects. 'c' for read-write (create if needed), 'r' for read-only, 'w' for read-write. """ if usedb: b = storage.DBDictClassifier(filename, mode) else: b = storage.PickledClassifier(filename) return Hammie(b) if __name__ == "__main__": # Everybody's used to running hammie.py. Why mess with success? ;) import hammiebulk hammiebulk.main() --- NEW FILE: hammiebulk.py --- #! /usr/bin/env python """Usage: %(program)s [-D|-d] [options] Where: -h show usage and exit -d use the DBM store. A DBM file is larger than the pickle and creating it is slower, but loading it is much faster, especially for large word databases. Recommended for use with hammiefilter or any procmail-based filter. -D use the pickle store. 
A pickle is smaller and faster to create, but much slower to load. Recommended for use with pop3proxy and hammiesrv. -p FILE use file as the persistent store. loads data from this file if it exists, and saves data to this file at the end. Default: %(DEFAULTDB)s -f run as a filter: read a single message from stdin, add a new header, and write it to stdout. If you want to run from procmail, this is your option. -g PATH mbox or directory of known good messages (non-spam) to train on. Can be specified more than once, or use - for stdin. -s PATH mbox or directory of known spam messages to train on. Can be specified more than once, or use - for stdin. -u PATH mbox of unknown messages. A ham/spam decision is reported for each. Can be specified more than once. -r reverse the meaning of the check (report ham instead of spam). Only meaningful with the -u option. """ try: True, False except NameError: # Maintain compatibility with Python 2.2 True, False = 1, 0 def bool(val): return not not val import sys import os import types import getopt import mailbox import glob import email import errno import cPickle as pickle from spambayes.Options import options from spambayes import classifier, mboxutils, hammie, Corpus Corpus.Verbose = True program = sys.argv[0] # For usage(); referenced by docstring above # Default database name DEFAULTDB = os.path.expanduser(options.hammiefilter_persistent_storage_file) # Probability at which a message is considered spam SPAM_THRESHOLD = options.spam_cutoff HAM_THRESHOLD = options.ham_cutoff def train(h, msgs, is_spam): """Train bayes with all messages from a mailbox.""" mbox = mboxutils.getmbox(msgs) i = 0 for msg in mbox: i += 1 sys.stdout.write("\r%6d" % i) sys.stdout.flush() h.train(msg, is_spam) print def score(h, msgs, reverse=0): """Score (judge) all messages from a mailbox.""" # XXX The reporting needs work! 
mbox = mboxutils.getmbox(msgs) i = 0 spams = hams = 0 for msg in mbox: i += 1 prob, clues = h.score(msg, True) if hasattr(msg, '_mh_msgno'): msgno = msg._mh_msgno else: msgno = i isspam = (prob >= SPAM_THRESHOLD) if isspam: spams += 1 if not reverse: print "%6s %4.2f %1s" % (msgno, prob, isspam and "S" or "."), print h.formatclues(clues) else: hams += 1 if reverse: print "%6s %4.2f %1s" % (msgno, prob, isspam and "S" or "."), print h.formatclues(clues) return (spams, hams) def usage(code, msg=''): """Print usage message and sys.exit(code).""" if msg: print >> sys.stderr, msg print >> sys.stderr print >> sys.stderr, __doc__ % globals() sys.exit(code) def main(): """Main program; parse options and go.""" try: opts, args = getopt.getopt(sys.argv[1:], 'hdDfg:s:p:u:r') except getopt.error, msg: usage(2, msg) if not opts: usage(2, "No options given") pck = DEFAULTDB good = [] spam = [] unknown = [] reverse = 0 do_filter = False usedb = None mode = 'r' for opt, arg in opts: if opt == '-h': usage(0) elif opt == '-g': good.append(arg) mode = 'c' elif opt == '-s': spam.append(arg) mode = 'c' elif opt == '-p': pck = arg elif opt == "-d": usedb = True elif opt == "-D": usedb = False elif opt == "-f": do_filter = True elif opt == '-u': unknown.append(arg) elif opt == '-r': reverse = 1 if args: usage(2, "Positional arguments not allowed") if usedb == None: usage(2, "Must specify one of -d or -D") save = False h = hammie.open(pck, usedb, mode) for g in good: print "Training ham (%s):" % g train(h, g, False) save = True for s in spam: print "Training spam (%s):" % s train(h, s, True) save = True if save: h.store() if do_filter: msg = sys.stdin.read() filtered = h.filter(msg) sys.stdout.write(filtered) if unknown: (spams, hams) = (0, 0) for u in unknown: if len(unknown) > 1: print "Scoring", u s, g = score(h, u, reverse) spams += s hams += g print "Total %d spam, %d ham" % (spams, hams) if __name__ == "__main__": main() --- NEW FILE: mboxutils.py --- #! 
/usr/bin/env python """Utilities for dealing with various types of mailboxes. This is mostly a wrapper around the various useful classes in the standard mailbox module, to do some intelligent guessing of the mailbox type given a mailbox argument. +foo -- MH mailbox +foo +foo,bar -- MH mailboxes +foo and +bar concatenated +ALL -- a shortcut for *all* MH mailboxes /foo/bar -- (existing file) a Unix-style mailbox /foo/bar/ -- (existing directory) a directory full of .txt and .lorien files /foo/bar/ -- (existing directory with a cur/ subdirectory) Maildir mailbox /foo/Mail/bar/ -- (existing directory with /Mail/ in its path) alternative way of spelling an MH mailbox """ from __future__ import generators import os import sys import glob import email import mailbox import email.Message import re class DirOfTxtFileMailbox: """Mailbox directory consisting of .txt and .lorien files.""" def __init__(self, dirname, factory): self.names = (glob.glob(os.path.join(dirname, "*.txt")) + glob.glob(os.path.join(dirname, "*.lorien"))) self.names.sort() self.factory = factory def __iter__(self): for name in self.names: try: f = open(name) except IOError: continue yield self.factory(f) f.close() def _cat(seqs): for seq in seqs: for item in seq: yield item def getmbox(name): """Return an mbox iterator given a file/directory/folder name.""" if name == "-": return [get_message(sys.stdin)] if name.startswith("+"): # MH folder name: +folder, +f1,f2,f2, or +ALL name = name[1:] import mhlib mh = mhlib.MH() if name == "ALL": names = mh.listfolders() elif ',' in name: names = name.split(',') else: names = [name] mboxes = [] mhpath = mh.getpath() for name in names: filename = os.path.join(mhpath, name) mbox = mailbox.MHMailbox(filename, get_message) mboxes.append(mbox) if len(mboxes) == 1: return iter(mboxes[0]) else: return _cat(mboxes) if os.path.isdir(name): # XXX Bogus: use a Maildir if /cur is a subdirectory, else a MHMailbox # if the pathname contains /Mail/, else a DirOfTxtFileMailbox. 
if os.path.exists(os.path.join(name, 'cur')): mbox = mailbox.Maildir(name, get_message) elif name.find("/Mail/") >= 0: mbox = mailbox.MHMailbox(name, get_message) else: mbox = DirOfTxtFileMailbox(name, get_message) else: fp = open(name, "rb") mbox = mailbox.PortableUnixMailbox(fp, get_message) return iter(mbox) def get_message(obj): """Return an email Message object. The argument may be a Message object already, in which case it's returned as-is. If the argument is a string or file-like object (supports read()), the email package is used to create a Message object from it. This can fail if the message is malformed. In that case, the headers (everything through the first blank line) are thrown out, and the rest of the text is wrapped in a bare email.Message.Message. """ if isinstance(obj, email.Message.Message): return obj # Create an email Message object. if hasattr(obj, "read"): obj = obj.read() try: msg = email.message_from_string(obj) except email.Errors.MessageParseError: # Wrap the raw text in a bare Message object. Since the # headers are most likely damaged, we can't use the email # package to parse them, so just get rid of them first. headers = extract_headers(obj) obj = obj[len(headers):] msg = email.Message.Message() msg.set_payload(obj) return msg header_break_re = re.compile(r"\r?\n(\r?\n)") def extract_headers(text): """Very simple-minded header extraction: prefix of text up to blank line. A blank line is recognized via two adjacent line-ending sequences, where a line-ending sequence is a newline optionally preceded by a carriage return. If no blank line is found, all of text is considered to be a potential header section. If a blank line is found, the text up to (but not including) the blank line is considered to be a potential header section. The potential header section is returned, unless it doesn't contain a colon, in which case an empty string is returned. 
>>> extract_headers("abc") '' >>> extract_headers("abc\\n\\n\\n") # no colon '' >>> extract_headers("abc: xyz\\n\\n\\n") 'abc: xyz\\n' >>> extract_headers("abc: xyz\\r\\n\\r\\n\\r\\n") 'abc: xyz\\r\\n' >>> extract_headers("a: b\\ngibberish\\n\\nmore gibberish") 'a: b\\ngibberish\\n' """ m = header_break_re.search(text) if m: eol = m.start(1) text = text[:eol] if ':' not in text: text = "" return text def _test(): import doctest, mboxutils return doctest.testmod(mboxutils) if __name__ == "__main__": _test() --- NEW FILE: msgs.py --- from __future__ import generators import os import random from spambayes.tokenizer import tokenize HAMTEST = None SPAMTEST = None HAMTRAIN = None SPAMTRAIN = None SEED = random.randrange(2000000000) class Msg(object): __slots__ = 'tag', 'guts' def __init__(self, dir, name): path = dir + "/" + name self.tag = path f = open(path, 'rb') self.guts = f.read() f.close() def __iter__(self): return tokenize(self.guts) # Compare msgs by their paths; this is appropriate for sets of msgs. def __hash__(self): return hash(self.tag) def __eq__(self, other): return self.tag == other.tag def __str__(self): return self.guts # The iterator yields a stream of Msg objects, taken from a list of # directories. class MsgStream(object): __slots__ = 'tag', 'directories', 'keep' def __init__(self, tag, directories, keep=None): self.tag = tag self.directories = directories self.keep = keep def __str__(self): return self.tag def produce(self): if self.keep is None: for directory in self.directories: for fname in os.listdir(directory): yield Msg(directory, fname) return # We only want part of the msgs. Shuffle each directory list, but # in such a way that we'll get the same result each time this is # called on the same directory list. for directory in self.directories: all = os.listdir(directory) random.seed(hash(max(all)) ^ SEED) # reproducible across calls random.shuffle(all) del all[self.keep:] all.sort() # seems to speed access on Win98! 
for fname in all: yield Msg(directory, fname) def __iter__(self): return self.produce() class HamStream(MsgStream): def __init__(self, tag, directories, train=0): if train: MsgStream.__init__(self, tag, directories, HAMTRAIN) else: MsgStream.__init__(self, tag, directories, HAMTEST) class SpamStream(MsgStream): def __init__(self, tag, directories, train=0): if train: MsgStream.__init__(self, tag, directories, SPAMTRAIN) else: MsgStream.__init__(self, tag, directories, SPAMTEST) def setparms(hamtrain, spamtrain, hamtest=None, spamtest=None, seed=None): """Set HAMTEST/TRAIN and SPAMTEST/TRAIN. If seed is not None, also set SEED. If (ham|spam)test are not set, set to the same as the (ham|spam)train numbers (backwards compat option). """ global HAMTEST, SPAMTEST, HAMTRAIN, SPAMTRAIN, SEED HAMTRAIN, SPAMTRAIN = hamtrain, spamtrain if hamtest is None: HAMTEST = HAMTRAIN else: HAMTEST = hamtest if spamtest is None: SPAMTEST = SPAMTRAIN else: SPAMTEST = spamtest if seed is not None: SEED = seed --- NEW FILE: optimize.py --- # __version__ = '$Id: optimize.py,v 1.1.2.1 2003/01/10 10:41:08 anthonybaxter Exp $' # # Optimize any parametric function. 
#
import copy
import Numeric

def SimplexMaximize(var, err, func, convcrit = 0.001, minerr = 0.001):
    """Search for a point that maximizes func, using a simplex of points.

    var is the starting point and err gives the per-coordinate offset used
    to build the other initial vertices (vertex i is var with err[i] added
    to coordinate i).  func is called with one vertex and returns the value
    to maximize.

    Returns the best vertex found once either the best and worst values
    differ by no more than convcrit, or the summed absolute distance
    between the worst vertex and the average of the others is no more
    than minerr.
    """
    var = Numeric.array(var)
    simplex = [var]
    for i in range(len(var)):
        var2 = copy.copy(var)
        var2[i] = var[i] + err[i]
        simplex.append(var2)
    # value[i] caches func(simplex[i]) so each vertex is evaluated once.
    value = []
    for i in range(len(simplex)):
        value.append(func(simplex[i]))
    while 1:
        # Determine worst and best
        wi = 0
        bi = 0
        for i in range(len(simplex)):
            if value[wi] > value[i]:
                wi = i
            if value[bi] < value[i]:
                bi = i
        # Test for convergence
        #print "worst, best are",wi,bi,"with",value[wi],value[bi]
        if abs(value[bi] - value[wi]) <= convcrit:
            return simplex[bi]
        # Calculate average of non-worst
        ave=Numeric.zeros(len(var), 'd')
        for i in range(len(simplex)):
            if i != wi:
                ave = ave + simplex[i]
        ave = ave / (len(simplex) - 1)
        worst = Numeric.array(simplex[wi])
        # Check for too-small simplex
        simsize = Numeric.add.reduce(Numeric.absolute(ave - worst))
        if simsize <= minerr:
            #print "Size of simplex too small:",simsize
            return simplex[bi]
        # Invert worst
        # (reflect the worst vertex through the average of the others)
        new = 2 * ave - simplex[wi]
        newv = func(new)
        if newv <= value[wi]:
            # Even worse. Shrink instead
            #print "Shrunk simplex"
            #print "ave=",repr(ave)
            #print "wi=",repr(worst)
            # (move to the midpoint between the average and the worst)
            new = 0.5 * ave + 0.5 * worst
            newv = func(new)
        elif newv > value[bi]:
            # Better than the best. Expand
            # (try a point twice as far out along the same direction)
            new2 = 3 * ave - 2 * worst
            newv2 = func(new2)
            if newv2 > newv:
                # Accept
                #print "Expanded simplex"
                new = new2
                newv = newv2
        # The chosen point always replaces the worst vertex.
        simplex[wi] = new
        value[wi] = newv

def DoubleSimplexMaximize(var, err, func, convcrit=0.001, minerr=0.001):
    """Run SimplexMaximize twice: a coarse pass, then a refining pass.

    The first pass uses tolerances five times looser than requested.  The
    second restarts from its result with the offsets scaled to 0.4 * err
    and the requested tolerances.
    """
    err = Numeric.array(err)
    var = SimplexMaximize(var, err, func, convcrit*5, minerr*5)
    return SimplexMaximize(var, 0.4 * err, func, convcrit, minerr)

--- NEW FILE: storage.py ---
#! /usr/bin/env python

'''storage.py - Spambayes database management framework.

Classes:
    PickledClassifier - Classifier that uses a pickle db
    DBDictClassifier - Classifier that uses a shelve db
    Trainer - Classifier training observer
    SpamTrainer - Trainer for spam
    HamTrainer - Trainer for ham

Abstract:
    *Classifier are subclasses of Classifier (classifier.Classifier)
    that add automatic state store/restore function to the Classifier class.

    PickledClassifier is a Classifier class that uses a cPickle
    datastore.  This database is relatively small, but slower than other
    databases.

    DBDictClassifier is a Classifier class that uses a database
    store.

    Trainer is concrete class that observes a Corpus and trains a
    Classifier object based upon movement of messages between corpora  When
    an add message notification is received, the trainer trains the
    database with the message, as spam or ham as appropriate given the
    type of trainer (spam or ham).  When a remove message notification
    is received, the trainer untrains the database as appropriate.

    SpamTrainer and HamTrainer are convenience subclasses of Trainer, that
    initialize as the appropriate type of Trainer

To Do:
    o ZODBClassifier
    o Would Trainer.trainall really want to train with the whole corpus,
        or just a random subset?
    o Suggestions?

    '''

# This module is part of the spambayes project, which is Copyright 2002
# The Python Software Foundation and is covered by the Python Software
# Foundation license.

__author__ = "Neale Pickett , \
Tim Stone "
__credits__ = "All the spambayes contributors."

try:
    True, False
except NameError:
    # Maintain compatibility with Python 2.2
    True, False = 1, 0
    def bool(val):
        return not not val

from spambayes import classifier
from spambayes.Options import options
import cPickle as pickle
import errno
import shelve
from spambayes import dbmstorage

# Make shelve use binary pickles by default.
oldShelvePickler = shelve.Pickler def binaryDefaultPickler(f, binary=1): return oldShelvePickler(f, binary) shelve.Pickler = binaryDefaultPickler PICKLE_TYPE = 1 NO_UPDATEPROBS = False # Probabilities will not be autoupdated with training UPDATEPROBS = True # Probabilities will be autoupdated with training class PickledClassifier(classifier.Classifier): '''Classifier object persisted in a pickle''' def __init__(self, db_name): classifier.Classifier.__init__(self) self.db_name = db_name self.load() def load(self): '''Load this instance from the pickle.''' # This is a bit strange, because the loading process # creates a temporary instance of PickledClassifier, from which # this object's state is copied. This is a nuance of the way # that pickle does its job if options.verbose: print 'Loading state from',self.db_name,'pickle' tempbayes = None try: fp = open(self.db_name, 'rb') except IOError, e: if e.errno != errno.ENOENT: raise else: tempbayes = pickle.load(fp) fp.close() # XXX: why not self.__setstate__(tempbayes.__getstate__())? 
if tempbayes: self.wordinfo = tempbayes.wordinfo self.nham = tempbayes.nham self.nspam = tempbayes.nspam if options.verbose: print '%s is an existing pickle, with %d ham and %d spam' \ % (self.db_name, self.nham, self.nspam) else: # new pickle if options.verbose: print self.db_name,'is a new pickle' self.wordinfo = {} self.nham = 0 self.nspam = 0 def store(self): '''Store self as a pickle''' if options.verbose: print 'Persisting',self.db_name,'as a pickle' fp = open(self.db_name, 'wb') pickle.dump(self, fp, PICKLE_TYPE) fp.close() class DBDictClassifier(classifier.Classifier): '''Classifier object persisted in a caching database''' def __init__(self, db_name, mode='c'): '''Constructor(database name)''' classifier.Classifier.__init__(self) self.wordcache = {} self.statekey = "saved state" self.mode = mode self.db_name = db_name self.load() def load(self): '''Load state from database''' if options.verbose: print 'Loading state from',self.db_name,'database' self.dbm = dbmstorage.open(self.db_name, self.mode) self.db = shelve.Shelf(self.dbm) if self.db.has_key(self.statekey): t = self.db[self.statekey] if t[0] != classifier.PICKLE_VERSION: raise ValueError("Can't unpickle -- version %s unknown" % t[0]) (self.nspam, self.nham) = t[1:] if options.verbose: print '%s is an existing database, with %d spam and %d ham' \ % (self.db_name, self.nspam, self.nham) else: # new database if options.verbose: print self.db_name,'is a new database' self.nspam = 0 self.nham = 0 self.wordinfo = {} def store(self): '''Place state into persistent store''' if options.verbose: print 'Persisting',self.db_name,'state in database' # Must use .keys() since we modify the dict in the loop for key in self.wordinfo.keys(): val = self.wordinfo[key] if val == None: del self.wordinfo[key] try: del self.db[key] except KeyError: pass else: self.db[key] = val.__getstate__() self.db[self.statekey] = (classifier.PICKLE_VERSION, self.nspam, self.nham) self.db.sync() def _wordinfoget(self, word): ret = 
self.wordinfo.get(word) if not ret: r = self.db.get(word) if r: ret = self.WordInfoClass() ret.__setstate__(r) self.wordinfo[word] = ret return ret # _wordinfoset is the same def _wordinfodel(self, word): self.wordinfo[word] = None class Trainer: '''Associates a Classifier object and one or more Corpora, \ is an observer of the corpora''' def __init__(self, bayes, is_spam, updateprobs=NO_UPDATEPROBS): '''Constructor(Classifier, is_spam(True|False), updprobs(True|False)''' self.bayes = bayes self.is_spam = is_spam self.updateprobs = updateprobs def onAddMessage(self, message): '''A message is being added to an observed corpus.''' self.train(message) def train(self, message): '''Train the database with the message''' if options.verbose: print 'training with',message.key() self.bayes.learn(message.tokenize(), self.is_spam) # self.updateprobs) def onRemoveMessage(self, message): '''A message is being removed from an observed corpus.''' self.untrain(message) def untrain(self, message): '''Untrain the database with the message''' if options.verbose: print 'untraining with',message.key() self.bayes.unlearn(message.tokenize(), self.is_spam) # self.updateprobs) # can raise ValueError if database is fouled. If this is the case, # then retraining is the only recovery option. def trainAll(self, corpus): '''Train all the messages in the corpus''' for msg in corpus: self.train(msg) def untrainAll(self, corpus): '''Untrain all the messages in the corpus''' for msg in corpus: self.untrain(msg) class SpamTrainer(Trainer): '''Trainer for spam''' def __init__(self, bayes, updateprobs=NO_UPDATEPROBS): '''Constructor''' Trainer.__init__(self, bayes, True, updateprobs) class HamTrainer(Trainer): '''Trainer for ham''' def __init__(self, bayes, updateprobs=NO_UPDATEPROBS): '''Constructor''' Trainer.__init__(self, bayes, False, updateprobs) if __name__ == '__main__': print >>sys.stderr, __doc__ --- NEW FILE: tokenizer.py --- #! 
/usr/bin/env python """Module to tokenize email messages for spam filtering.""" from __future__ import generators import email import email.Message import email.Header import email.Utils import email.Errors import re import math import time import os from sets import Set from spambayes.Options import options from spambayes.mboxutils import get_message [...1305 lines suppressed...] text, tokens = cracker(text) for t in tokens: yield t # Remove HTML/XML tags. Also  . text = text.replace(' ', ' ') text = html_re.sub(' ', text) # Tokenize everything in the body. for w in text.split(): n = len(w) # Make sure this range matches in tokenize_word(). if 3 <= n <= maxword: yield w elif n >= 3: for t in tokenize_word(w): yield t tokenize = Tokenizer().tokenize From anthonybaxter at users.sourceforge.net Sun Jan 12 19:25:58 2003 From: anthonybaxter at users.sourceforge.net (Anthony Baxter) Date: Sun Jan 12 22:26:00 2003 Subject: [Spambayes-checkins] website/pics gutter-hi.png,NONE,1.1 gutter.png,NONE,1.1 logo.png,NONE,1.1 Message-ID: Update of /cvsroot/spambayes/website/pics In directory sc8-pr-cvs1:/tmp/cvs-serv30303 Added Files: gutter-hi.png gutter.png logo.png Log Message: cleanup of website lnf. --- NEW FILE: gutter-hi.png --- (This appears to be a binary file; contents omitted.) --- NEW FILE: gutter.png --- (This appears to be a binary file; contents omitted.) --- NEW FILE: logo.png --- (This appears to be a binary file; contents omitted.) 
From anthonybaxter at users.sourceforge.net Sun Jan 12 19:28:35 2003 From: anthonybaxter at users.sourceforge.net (Anthony Baxter) Date: Sun Jan 12 22:28:37 2003 Subject: [Spambayes-checkins] website/images - New directory Message-ID: Update of /cvsroot/spambayes/website/images In directory sc8-pr-cvs1:/tmp/cvs-serv31058/images Log Message: Directory /cvsroot/spambayes/website/images added to the repository From anthonybaxter at users.sourceforge.net Sun Jan 12 19:29:05 2003 From: anthonybaxter at users.sourceforge.net (Anthony Baxter) Date: Sun Jan 12 22:29:07 2003 Subject: [Spambayes-checkins] website/images banner.png,NONE,1.1 graham_graph.png,NONE,1.1 gutter-hi.png,NONE,1.1 gutter.png,NONE,1.1 logo.png,NONE,1.1 Message-ID: Update of /cvsroot/spambayes/website/images In directory sc8-pr-cvs1:/tmp/cvs-serv31165/images Added Files: banner.png graham_graph.png gutter-hi.png gutter.png logo.png Log Message: 'pics' directory seems to have hosed perms --- NEW FILE: banner.png --- (This appears to be a binary file; contents omitted.) --- NEW FILE: graham_graph.png --- (This appears to be a binary file; contents omitted.) --- NEW FILE: gutter-hi.png --- (This appears to be a binary file; contents omitted.) --- NEW FILE: gutter.png --- (This appears to be a binary file; contents omitted.) --- NEW FILE: logo.png --- (This appears to be a binary file; contents omitted.) From anthonybaxter at users.sourceforge.net Sun Jan 12 19:58:00 2003 From: anthonybaxter at users.sourceforge.net (Anthony Baxter) Date: Sun Jan 12 22:58:04 2003 Subject: [Spambayes-checkins] website background.ht,1.3,1.4 index.ht,1.3,1.4 links.h,1.3,1.4 style.css,1.1.1.1,1.2 Message-ID: Update of /cvsroot/spambayes/website In directory sc8-pr-cvs1:/tmp/cvs-serv5318 Modified Files: background.ht index.ht links.h style.css Log Message: major re-working of the website to use style sheets, and generally look a bit nicer than the default python.org lnf. 
Index: background.ht =================================================================== RCS file: /cvsroot/spambayes/website/background.ht,v retrieving revision 1.3 retrieving revision 1.4 diff -C2 -d -r1.3 -r1.4 *** background.ht 4 Nov 2002 09:57:59 -0000 1.3 --- background.ht 13 Jan 2003 03:57:58 -0000 1.4 *************** *** 1,4 **** --- 1,5 ---- Title: SpamBayes: Background Reading Author-Email: spambayes@python.org + Author: spambayes

    Background Reading

    Index: index.ht =================================================================== RCS file: /cvsroot/spambayes/website/index.ht,v retrieving revision 1.3 retrieving revision 1.4 diff -C2 -d -r1.3 -r1.4 *** index.ht 30 Dec 2002 07:38:53 -0000 1.3 --- index.ht 13 Jan 2003 03:57:58 -0000 1.4 *************** *** 6,10 ****

    This project is developing a Bayesian anti-spam classifier, initially ! based on the work of Paul Graham.

    --- 6,10 ----

    This project is developing a Bayesian anti-spam classifier, initially ! based on the work of Paul Graham. A major difference between this project and many others that are doing similar work is the emphasis on testing and trialing newer and different approaches to scoring messages. While most projects are still working with the initial Graham approach, we found that a number of different approaches yielded a much more useful response. An attempt at documenting this is on the background page.

    *************** *** 20,23 **** --- 20,25 ---- Applications page.

    +

    We're currently working on packaging these up for end-user use - stay + tuned!

    Mailing list

    Index: links.h =================================================================== RCS file: /cvsroot/spambayes/website/links.h,v retrieving revision 1.3 retrieving revision 1.4 diff -C2 -d -r1.3 -r1.4 *** links.h 1 Nov 2002 04:50:19 -0000 1.3 --- links.h 13 Jan 2003 03:57:58 -0000 1.4 *************** *** 1,3 **** !

    SpamBayes

  • Home page
  • Background --- 1,3 ---- !

    About the Project

  • Home page
  • Background Index: style.css =================================================================== RCS file: /cvsroot/spambayes/website/style.css,v retrieving revision 1.1.1.1 retrieving revision 1.2 diff -C2 -d -r1.1.1.1 -r1.2 *** style.css 19 Sep 2002 08:40:55 -0000 1.1.1.1 --- style.css 13 Jan 2003 03:57:58 -0000 1.2 *************** *** 5,8 **** --- 5,9 ---- color: #484848; margin-right: 15%; + font-family: geneva, verdana, arial, "ms sans serif", sans-serif; } *************** *** 21,26 **** TD:navtitle { color: #f4a560; font-size: xx-large; ! text-style: bold; } IMG { border: 0; } --- 22,38 ---- TD:navtitle { color: #f4a560; font-size: xx-large; ! font-weight: bold; } IMG { border: 0; } + + TABLE.sidebar { background-image: url("./images/gutter.png"); } + + TD.normalSidebar a:link { color: #222222; } + TD.normalSidebar a:visited { color: #333333; } + TD.normalSidebar a:active { color: #449944; } + TD.normalSidebar a:hover { color: #005500; } + + TD.headerSidebar { color: #222222; + font-weight: bold;} + From anthonybaxter at users.sourceforge.net Sun Jan 12 19:58:00 2003 From: anthonybaxter at users.sourceforge.net (Anthony Baxter) Date: Sun Jan 12 22:58:06 2003 Subject: [Spambayes-checkins] website/scripts/ht2html Sidebar.py,1.1.1.1,1.2 SpamBayesGenerator.py,1.2,1.3 Message-ID: Update of /cvsroot/spambayes/website/scripts/ht2html In directory sc8-pr-cvs1:/tmp/cvs-serv5318/scripts/ht2html Modified Files: Sidebar.py SpamBayesGenerator.py Log Message: major re-working of the website to use style sheets, and generally look a bit nicer than the default python.org lnf. 
Index: Sidebar.py =================================================================== RCS file: /cvsroot/spambayes/website/scripts/ht2html/Sidebar.py,v retrieving revision 1.1.1.1 retrieving revision 1.2 diff -C2 -d -r1.1.1.1 -r1.2 *** Sidebar.py 19 Sep 2002 08:40:55 -0000 1.1.1.1 --- Sidebar.py 13 Jan 2003 03:57:58 -0000 1.2 *************** *** 52,56 **** print '' print '' % self.get_bgcolor() def __finish(self): --- 52,56 ---- print '' print '
    ' % self.get_bgcolor() def __finish(self): *************** *** 64,73 **** if done_one: # get some separation between header and last item ! print '' --- 64,71 ---- if done_one: # get some separation between header and last item ! print '' *************** *** 82,86 **** else: s = '%s' % (url, text) ! print '' --- 80,84 ---- else: s = '%s' % (url, text) ! print '' Index: SpamBayesGenerator.py =================================================================== RCS file: /cvsroot/spambayes/website/scripts/ht2html/SpamBayesGenerator.py,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** SpamBayesGenerator.py 19 Sep 2002 18:16:11 -0000 1.2 --- SpamBayesGenerator.py 13 Jan 2003 03:57:58 -0000 1.3 *************** *** 16,25 **** sitelinks = [ - ('%(rootdir)s/', 'Home'), ('http://sourceforge.net/projects/spambayes/', 'SF Project Page'), ] ! class SpamBayesGenerator(Skeleton, Sidebar, Banner): AUTHOR = 'spambayes@python.org' --- 16,25 ---- sitelinks = [ ('http://sourceforge.net/projects/spambayes/', 'SF Project Page'), ] + class SpamBayesSidebar(Sidebar): pass ! class SpamBayesGenerator(Skeleton, SpamBayesSidebar, Banner): AUTHOR = 'spambayes@python.org' *************** *** 43,47 **** ''' % self.__d)) self.__linkfixer.massage(p.sidebar, self.__d) ! Sidebar.__init__(self, p.sidebar) # # fix up our site links, no relthis because the site links are --- 43,47 ---- ''' % self.__d)) self.__linkfixer.massage(p.sidebar, self.__d) ! SpamBayesSidebar.__init__(self, p.sidebar) # # fix up our site links, no relthis because the site links are *************** *** 80,84 **** if self.__parser.get('wide-page', 'no').lower() == 'yes': return None ! return Sidebar.get_sidebar(self) def get_banner(self): --- 80,84 ---- if self.__parser.get('wide-page', 'no').lower() == 'yes': return None ! return SpamBayesSidebar.get_sidebar(self) def get_banner(self): *************** *** 94,98 ****
    !
    ''' % \ self.__d --- 94,98 ----
    !
    ''' % \ self.__d *************** *** 119,131 **** self.__body = text # python.org color scheme overrides def get_lightshade(self): ! return '#c7c7d7' def get_mediumshade(self): ! return '#867272' def get_darkshade(self): ! return '#635d5d' def get_charset(self): --- 119,138 ---- self.__body = text + def getSidebarNormalAttrs(self): + return 'class="normalSidebar" background="images/gutter.png"' + def getSidebarHeaderAttrs(self): + return 'class="headerSidebar" background="images/gutter-hi.png"' + # python.org color scheme overrides def get_lightshade(self): ! "used in sidebar normal items" ! return '' def get_mediumshade(self): ! return '' def get_darkshade(self): ! "used in sidebar header items" ! return '' def get_charset(self): From anthonybaxter at users.sourceforge.net Sun Jan 12 20:08:45 2003 From: anthonybaxter at users.sourceforge.net (Anthony Baxter) Date: Sun Jan 12 23:08:49 2003 Subject: [Spambayes-checkins] website/images chi2_graph.png,NONE,1.1 robinson_graph.png,NONE,1.1 Message-ID: Update of /cvsroot/spambayes/website/images In directory sc8-pr-cvs1:/tmp/cvs-serv7859 Added Files: chi2_graph.png robinson_graph.png Log Message: some sample graphs. --- NEW FILE: chi2_graph.png --- (This appears to be a binary file; contents omitted.) --- NEW FILE: robinson_graph.png --- (This appears to be a binary file; contents omitted.) From anthonybaxter at users.sourceforge.net Sun Jan 12 20:44:58 2003 From: anthonybaxter at users.sourceforge.net (Anthony Baxter) Date: Sun Jan 12 23:45:00 2003 Subject: [Spambayes-checkins] website background.ht,1.4,1.5 style.css,1.2,1.3 Message-ID: Update of /cvsroot/spambayes/website In directory sc8-pr-cvs1:/tmp/cvs-serv17660 Modified Files: background.ht style.css Log Message: updated background with some sample plots. If someone in the set of (Tim, Gary, Rob) could review this and point out the obvious stupids, that would be good. (Or anyone else who understands the math...) 
Index: background.ht =================================================================== RCS file: /cvsroot/spambayes/website/background.ht,v retrieving revision 1.4 retrieving revision 1.5 diff -C2 -d -r1.4 -r1.5 *** background.ht 13 Jan 2003 03:57:58 -0000 1.4 --- background.ht 13 Jan 2003 04:44:55 -0000 1.5 *************** *** 30,36 ****

    Combining and Scoring

    The next part of the system is the scoring and combining part. This is where the hairy mathematics and statistics come in.

    !

    Initially we started with Paul Graham's original combining scheme - this has a number of "magic numbers" and "fuzz factors" built into it. The Graham combining scheme has a number of problems, aside from the --- 30,38 ----

    Combining and Scoring

    +

    The next part of the system is the scoring and combining part. This is where the hairy mathematics and statistics come in.

    !

    Initially we started with Paul Graham's original combining scheme - ! a "Naive Bayes" scheme, of sorts - this has a number of "magic numbers" and "fuzz factors" built into it. The Graham combining scheme has a number of problems, aside from the *************** *** 38,41 **** --- 40,72 ---- either 1 or 0, and there's a very small middle ground in between - it doesn't often claim to be "unsure", and gets it wrong because of this. + The following plot shows the problem:

    +

    Note: In each of these plots, the X + axis shows the 'score' of the message, scaled to 0-100. + (where 0 is "definitely ham", and 100 is "definitely spam"), and + the Y axis shows the number of messages with that score (scaled + logarithmically). Note also that the plots aren't from the same data + set :-( but are "typical" plots that you'd see from a given technique. + One day, if I have enough time, I'll go back and re-do all the plots + using a single fixed data set.

    +

    +

    In this plot, you can see that most of the spam gets scores near to + 1, while most of the ham gets scores near to zero. This is all good. + Unfortunately, there's also a significant number of hams that score + close to 1, and spams close to 0 - which means the system is not just scoring + them wrong, but it's completely confident in its (wrong) score. +

    +

    Add more here - cancellation disease, fudge factors, &c

    + +

    Gary Robinson's + essay turned up around this time, and Gary turned up on the mailing + list a short time after that.

    +

    Gary's initial technique produced scoring like this plot:

    + +

    This produces a very different result - but this plot shows + a different issue - there's a large overlap between the ham scores and + spam scores. Choosing the best 'cutoff' value for "everything higher than this + is spam" turned out to be an incredibly delicate operation, and was + highly dependent on the user's data.

    +

    There's a number of discussions back and forth between Tim Peters and Gary Robinson on this subject in the mailing list archives - I'll try *************** *** 43,54 ****

    Gary produced a number of alternative approaches to combining and scoring word probabilities. The initial one, after much back and forth ! in the mailing list, is in the code today as 'gary_combining'. A couple ! of other approaches, using the Central Limit Theorem, were also tried. They produced interesting output - but histograms of the ham and spam ! distributions had a disturbingly large overlap in the middle. There was also an issue with incremental training and untraining of messages that made it harder to use in the "real world". These two central limit approaches were dropped after Tim, Gary and Rob Hooft produced a combining ! scheme using chi-squared probabilities. This is now the default combining scheme.

    The chi-squared approach produces two numbers - a "ham probability" ("*H*") --- 74,88 ----

    Gary produced a number of alternative approaches to combining and scoring word probabilities. The initial one, after much back and forth ! in the mailing list, is in the code today as 'gary_combining', and is ! the second plot, above. A couple ! of other approaches, using the Central Limit Theorem (or this, for the serious math geeks), were also tried.

    !

    todo: do some plots for these

    !

    They produced interesting output - but histograms of the ham and spam ! distributions still had a disturbingly large overlap in the middle. There was also an issue with incremental training and untraining of messages that made it harder to use in the "real world". These two central limit approaches were dropped after Tim, Gary and Rob Hooft produced a combining ! scheme using chi-squared probabilities. This is now the default combining scheme.

    The chi-squared approach produces two numbers - a "ham probability" ("*H*") *************** *** 61,70 **** basically that the message looks very much like ham, but also very much like spam. In this case spambayes is also unsure where the message ! should be classified, and the final score will be near 0.5.

    So at the end of the processing, you end up with three possible results - "Spam", "Ham", or "Unsure". It's possible to tweak the high and low cutoffs for the Unsure window - this trades off ! unsure messages vs possible false positives or negatives.

    Training

    --- 95,108 ---- basically that the message looks very much like ham, but also very much like spam. In this case spambayes is also unsure where the message ! should be classified, and the final score will be near 0.5. The following plot ! shows this quite clearly. It also shows the quite dramatic results we get ! from this technique.

    !

    So at the end of the processing, you end up with three possible results - "Spam", "Ham", or "Unsure". It's possible to tweak the high and low cutoffs for the Unsure window - this trades off ! unsure messages vs possible false positives or negatives. In the chi-squared ! results, the "unsure" window can be quite large, and still result in very small numbers of "unsure" messages.

    Training

    Index: style.css =================================================================== RCS file: /cvsroot/spambayes/website/style.css,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** style.css 13 Jan 2003 03:57:58 -0000 1.2 --- style.css 13 Jan 2003 04:44:55 -0000 1.3 *************** *** 13,16 **** --- 13,18 ---- P:footer { font-size: x-small; } + P.todo { font-style: italic ; } + P.note { font-style: italic ; margin-left: 5%; margin-right: 5%; } TABLE:navigation { width: 100%; From anthonybaxter at users.sourceforge.net Sun Jan 12 22:45:42 2003 From: anthonybaxter at users.sourceforge.net (Anthony Baxter) Date: Mon Jan 13 01:45:45 2003 Subject: [Spambayes-checkins] spambayes/22compat heapq.py,1.1.2.1,NONE sets.py,1.1.2.1,NONE Message-ID: Update of /cvsroot/spambayes/spambayes/22compat In directory sc8-pr-cvs1:/tmp/cvs-serv2675/22compat Removed Files: Tag: reorg-branch heapq.py sets.py Log Message: distutils ate my brain. Rather than continue down the horrible horrible path of distutils madness, take a less nuanced approach to the sets.py and heapq.py modules. --- heapq.py DELETED --- --- sets.py DELETED --- From anthonybaxter at users.sourceforge.net Sun Jan 12 22:45:42 2003 From: anthonybaxter at users.sourceforge.net (Anthony Baxter) Date: Mon Jan 13 01:45:47 2003 Subject: [Spambayes-checkins] spambayes setup.py,1.10.2.1,1.10.2.2 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv2675 Modified Files: Tag: reorg-branch setup.py Log Message: distutils ate my brain. Rather than continue down the horrible horrible path of distutils madness, take a less nuanced approach to the sets.py and heapq.py modules. 
Index: setup.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/setup.py,v retrieving revision 1.10.2.1 retrieving revision 1.10.2.2 diff -C2 -d -r1.10.2.1 -r1.10.2.2 *** setup.py 10 Jan 2003 10:41:06 -0000 1.10.2.1 --- setup.py 13 Jan 2003 06:45:40 -0000 1.10.2.2 *************** *** 7,16 **** sys.exit(0) - compat_mods = [] - try: - import sets, heapq - except ImportError: - compat_mods = [ '22compat/sets.py', '22compat/heapq.py', ] - # TODO. Figure distutils magic out here. setup( --- 7,10 ---- From anthonybaxter at users.sourceforge.net Sun Jan 12 22:45:42 2003 From: anthonybaxter at users.sourceforge.net (Anthony Baxter) Date: Mon Jan 13 01:45:48 2003 Subject: [Spambayes-checkins] spambayes/pspam/pspam options.py,1.2,1.2.4.1 Message-ID: Update of /cvsroot/spambayes/spambayes/pspam/pspam In directory sc8-pr-cvs1:/tmp/cvs-serv2675/pspam/pspam Modified Files: Tag: reorg-branch options.py Log Message: distutils ate my brain. Rather than continue down the horrible horrible path of distutils madness, take a less nuanced approach to the sets.py and heapq.py modules. Index: options.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/pspam/pspam/options.py,v retrieving revision 1.2 retrieving revision 1.2.4.1 diff -C2 -d -r1.2 -r1.2.4.1 *** options.py 12 Nov 2002 07:03:20 -0000 1.2 --- options.py 13 Jan 2003 06:45:40 -0000 1.2.4.1 *************** *** 1,5 **** from Options import options, all_options, \ boolean_cracker, float_cracker, int_cracker, string_cracker ! from sets import Set all_options["Score"] = {'max_ham': float_cracker, --- 1,8 ---- from Options import options, all_options, \ boolean_cracker, float_cracker, int_cracker, string_cracker ! try: ! from sets import Set ! except ImportError: ! 
from spambayes.compatsets import Set all_options["Score"] = {'max_ham': float_cracker, From anthonybaxter at users.sourceforge.net Sun Jan 12 22:45:42 2003 From: anthonybaxter at users.sourceforge.net (Anthony Baxter) Date: Mon Jan 13 01:45:51 2003 Subject: [Spambayes-checkins] spambayes/testtools mboxtest.py,1.1.2.1,1.1.2.2 Message-ID: Update of /cvsroot/spambayes/spambayes/testtools In directory sc8-pr-cvs1:/tmp/cvs-serv2675/testtools Modified Files: Tag: reorg-branch mboxtest.py Log Message: distutils ate my brain. Rather than continue down the horrible horrible path of distutils madness, take a less nuanced approach to the sets.py and heapq.py modules. Index: mboxtest.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/testtools/Attic/mboxtest.py,v retrieving revision 1.1.2.1 retrieving revision 1.1.2.2 diff -C2 -d -r1.1.2.1 -r1.1.2.2 *** mboxtest.py 10 Jan 2003 10:41:08 -0000 1.1.2.1 --- mboxtest.py 13 Jan 2003 06:45:40 -0000 1.1.2.2 *************** *** 25,29 **** import random import re ! from sets import Set import sys --- 25,32 ---- import random import re ! try: ! from sets import Set ! except ImportError: ! from spambayes.compatsets import Set import sys From anthonybaxter at users.sourceforge.net Sun Jan 12 22:45:42 2003 From: anthonybaxter at users.sourceforge.net (Anthony Baxter) Date: Mon Jan 13 01:45:53 2003 Subject: [Spambayes-checkins] spambayes/spambayes compatheapq.py,NONE,1.1.2.1 compatsets.py,NONE,1.1.2.1 Options.py,1.1.2.1,1.1.2.2 TestDriver.py,1.1.2.1,1.1.2.2 classifier.py,1.1.2.1,1.1.2.2 tokenizer.py,1.1.2.1,1.1.2.2 Message-ID: Update of /cvsroot/spambayes/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv2675/spambayes Modified Files: Tag: reorg-branch Options.py TestDriver.py classifier.py tokenizer.py Added Files: Tag: reorg-branch compatheapq.py compatsets.py Log Message: distutils ate my brain. 
Rather than continue down the horrible horrible path of distutils madness, take a less nuanced approach to the sets.py and heapq.py modules. --- NEW FILE: compatheapq.py --- # -*- coding: Latin-1 -*- """Heap queue algorithm (a.k.a. priority queue). Heaps are arrays for which a[k] <= a[2*k+1] and a[k] <= a[2*k+2] for all k, counting elements from 0. For the sake of comparison, non-existing elements are considered to be infinite. The interesting property of a heap is that a[0] is always its smallest element. Usage: heap = [] # creates an empty heap heappush(heap, item) # pushes a new item on the heap item = heappop(heap) # pops the smallest item from the heap item = heap[0] # smallest item on the heap without popping it heapify(x) # transforms list into a heap, in-place, in linear time item = heapreplace(heap, item) # pops and returns smallest item, and adds # new item; the heap size is unchanged Our API differs from textbook heap algorithms as follows: - We use 0-based indexing. This makes the relationship between the index for a node and the indexes for its children slightly less obvious, but is more suitable since Python uses 0-based indexing. - Our heappop() method returns the smallest item, not the largest. These two make it possible to view the heap as a regular Python list without surprises: heap[0] is the smallest item, and heap.sort() maintains the heap invariant! """ # Original code by Kevin O'Connor, augmented by Tim Peters __about__ = """Heap queues [explanation by François Pinard] Heaps are arrays for which a[k] <= a[2*k+1] and a[k] <= a[2*k+2] for all k, counting elements from 0. For the sake of comparison, non-existing elements are considered to be infinite. The interesting property of a heap is that a[0] is always its smallest element. The strange invariant above is meant to be an efficient memory representation for a tournament. 
The numbers below are `k', not a[k]: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 In the tree above, each cell `k' is topping `2*k+1' and `2*k+2'. In an usual binary tournament we see in sports, each cell is the winner over the two cells it tops, and we can trace the winner down the tree to see all opponents s/he had. However, in many computer applications of such tournaments, we do not need to trace the history of a winner. To be more memory efficient, when a winner is promoted, we try to replace it by something else at a lower level, and the rule becomes that a cell and the two cells it tops contain three different items, but the top cell "wins" over the two topped cells. If this heap invariant is protected at all time, index 0 is clearly the overall winner. The simplest algorithmic way to remove it and find the "next" winner is to move some loser (let's say cell 30 in the diagram above) into the 0 position, and then percolate this new 0 down the tree, exchanging values, until the invariant is re-established. This is clearly logarithmic on the total number of items in the tree. By iterating over all items, you get an O(n ln n) sort. A nice feature of this sort is that you can efficiently insert new items while the sort is going on, provided that the inserted items are not "better" than the last 0'th element you extracted. This is especially useful in simulation contexts, where the tree holds all incoming events, and the "win" condition means the smallest scheduled time. When an event schedule other events for execution, they are scheduled into the future, so they can easily go into the heap. So, a heap is a good structure for implementing schedulers (this is what I used for my MIDI sequencer :-). Various structures for implementing schedulers have been extensively studied, and heaps are good for this, as they are reasonably speedy, the speed is almost constant, and the worst case is not much different than the average case. 
However, there are other representations which are more efficient overall, yet the worst cases might be terrible. Heaps are also very useful in big disk sorts. You most probably all know that a big sort implies producing "runs" (which are pre-sorted sequences, which size is usually related to the amount of CPU memory), followed by a merging passes for these runs, which merging is often very cleverly organised[1]. It is very important that the initial sort produces the longest runs possible. Tournaments are a good way to that. If, using all the memory available to hold a tournament, you replace and percolate items that happen to fit the current run, you'll produce runs which are twice the size of the memory for random input, and much better for input fuzzily ordered. Moreover, if you output the 0'th item on disk and get an input which may not fit in the current tournament (because the value "wins" over the last output value), it cannot fit in the heap, so the size of the heap decreases. The freed memory could be cleverly reused immediately for progressively building a second heap, which grows at exactly the same rate the first heap is melting. When the first heap completely vanishes, you switch heaps and start a new run. Clever and quite effective! In a word, heaps are useful memory structures to know. I use them in a few applications, and I think it is good to keep a `heap' module around. :-) -------------------- [1] The disk balancing algorithms which are current, nowadays, are more annoying than clever, and this is a consequence of the seeking capabilities of the disks. On devices which cannot seek, like big tape drives, the story was quite different, and one had to be very clever to ensure (far in advance) that each tape movement will be the most effective possible (that is, will best participate at "progressing" the merge). Some tapes were even able to read backwards, and this was also used to avoid the rewinding time. 
Believe me, real good tape sorts were quite spectacular to watch! >From all times, sorting has always been a Great Art! :-) """ def heappush(heap, item): """Push item onto heap, maintaining the heap invariant.""" heap.append(item) _siftdown(heap, 0, len(heap)-1) def heappop(heap): """Pop the smallest item off the heap, maintaining the heap invariant.""" lastelt = heap.pop() # raises appropriate IndexError if heap is empty if heap: returnitem = heap[0] heap[0] = lastelt _siftup(heap, 0) else: returnitem = lastelt return returnitem def heapreplace(heap, item): """Pop and return the current smallest value, and add the new item. This is more efficient than heappop() followed by heappush(), and can be more appropriate when using a fixed-size heap. Note that the value returned may be larger than item! That constrains reasonable uses of this routine. """ returnitem = heap[0] # raises appropriate IndexError if heap is empty heap[0] = item _siftup(heap, 0) return returnitem def heapify(x): """Transform list into a heap, in-place, in O(len(heap)) time.""" n = len(x) # Transform bottom-up. The largest index there's any point to looking at # is the largest with a child index in-range, so must have 2*i + 1 < n, # or i < (n-1)/2. If n is even = 2*j, this is (2*j-1)/2 = j-1/2 so # j-1 is the largest, which is n//2 - 1. If n is odd = 2*j+1, this is # (2*j+1-1)/2 = j so j-1 is the largest, and that's again n//2-1. for i in xrange(n//2 - 1, -1, -1): _siftup(x, i) # 'heap' is a heap at all indices >= startpos, except possibly for pos. pos # is the index of a leaf with a possibly out-of-order value. Restore the # heap invariant. def _siftdown(heap, startpos, pos): newitem = heap[pos] # Follow the path to the root, moving parents down until finding a place # newitem fits. 
while pos > startpos: parentpos = (pos - 1) >> 1 parent = heap[parentpos] if parent <= newitem: break heap[pos] = parent pos = parentpos heap[pos] = newitem # The child indices of heap index pos are already heaps, and we want to make # a heap at index pos too. We do this by bubbling the smaller child of # pos up (and so on with that child's children, etc) until hitting a leaf, # then using _siftdown to move the oddball originally at index pos into place. # # We *could* break out of the loop as soon as we find a pos where newitem <= # both its children, but turns out that's not a good idea, and despite that # many books write the algorithm that way. During a heap pop, the last array # element is sifted in, and that tends to be large, so that comparing it # against values starting from the root usually doesn't pay (= usually doesn't # get us out of the loop early). See Knuth, Volume 3, where this is # explained and quantified in an exercise. # # Cutting the # of comparisons is important, since these routines have no # way to extract "the priority" from an array element, so that intelligence # is likely to be hiding in custom __cmp__ methods, or in array elements # storing (priority, record) tuples. Comparisons are thus potentially # expensive. # # On random arrays of length 1000, making this change cut the number of # comparisons made by heapify() a little, and those made by exhaustive # heappop() a lot, in accord with theory. Here are typical results from 3 # runs (3 just to demonstrate how small the variance is): # # Compares needed by heapify Compares needed by 1000 heapppops # -------------------------- --------------------------------- # 1837 cut to 1663 14996 cut to 8680 # 1855 cut to 1659 14966 cut to 8678 # 1847 cut to 1660 15024 cut to 8703 # # Building the heap by using heappush() 1000 times instead required # 2198, 2148, and 2219 compares: heapify() is more efficient, when # you can use it. 
# # The total compares needed by list.sort() on the same lists were 8627, # 8627, and 8632 (this should be compared to the sum of heapify() and # heappop() compares): list.sort() is (unsurprisingly!) more efficient # for sorting. def _siftup(heap, pos): endpos = len(heap) startpos = pos newitem = heap[pos] # Bubble up the smaller child until hitting a leaf. childpos = 2*pos + 1 # leftmost child position while childpos < endpos: # Set childpos to index of smaller child. rightpos = childpos + 1 if rightpos < endpos and heap[rightpos] <= heap[childpos]: childpos = rightpos # Move the smaller child up. heap[pos] = heap[childpos] pos = childpos childpos = 2*pos + 1 # The leaf at pos is empty now. Put newitem there, and and bubble it up # to its final resting place (by sifting its parents down). heap[pos] = newitem _siftdown(heap, startpos, pos) if __name__ == "__main__": # Simple sanity test heap = [] data = [1, 3, 5, 7, 9, 2, 4, 6, 8, 0] for item in data: heappush(heap, item) sort = [] while heap: sort.append(heappop(heap)) print sort --- NEW FILE: compatsets.py --- """Classes to represent arbitrary sets (including sets of sets). This module implements sets using dictionaries whose values are ignored. The usual operations (union, intersection, deletion, etc.) are provided as both methods and operators. Important: sets are not sequences! While they support 'x in s', 'len(s)', and 'for x in s', none of those operations are unique for sequences; for example, mappings support all three as well. The characteristic operation for sequences is subscripting with small integers: s[i], for i in range(len(s)). Sets don't support subscripting at all. Also, sequences allow multiple occurrences and their elements have a definite order; sets on the other hand don't record multiple occurrences and don't remember the order of element insertion (which is why they don't support s[i]). 
The following classes are provided: BaseSet -- All the operations common to both mutable and immutable sets. This is an abstract class, not meant to be directly instantiated. Set -- Mutable sets, subclass of BaseSet; not hashable. ImmutableSet -- Immutable sets, subclass of BaseSet; hashable. An iterable argument is mandatory to create an ImmutableSet. _TemporarilyImmutableSet -- Not a subclass of BaseSet: just a wrapper around a Set, hashable, giving the same hash value as the immutable set equivalent would have. Do not use this class directly. Only hashable objects can be added to a Set. In particular, you cannot really add a Set as an element to another Set; if you try, what is actually added is an ImmutableSet built from it (it compares equal to the one you tried adding). When you ask if `x in y' where x is a Set and y is a Set or ImmutableSet, x is wrapped into a _TemporarilyImmutableSet z, and what's tested is actually `z in y'. """ # Code history: # # - Greg V. Wilson wrote the first version, using a different approach # to the mutable/immutable problem, and inheriting from dict. # # - Alex Martelli modified Greg's version to implement the current # Set/ImmutableSet approach, and make the data an attribute. # # - Guido van Rossum rewrote much of the code, made some API changes, # and cleaned up the docstrings. # # - Raymond Hettinger added a number of speedups and other # improvements. __all__ = ['BaseSet', 'Set', 'ImmutableSet'] try: True, False except NameError: # Maintain compatibility with Python 2.2 True, False = 1, 0 class BaseSet(object): """Common base class for mutable and immutable sets.""" __slots__ = ['_data'] # Constructor def __init__(self): """This is an abstract class.""" # Don't call this from a concrete subclass! if self.__class__ is BaseSet: raise TypeError, ("BaseSet is an abstract class. 
" "Use Set or ImmutableSet.") # Standard protocols: __len__, __repr__, __str__, __iter__ def __len__(self): """Return the number of elements of a set.""" return len(self._data) def __repr__(self): """Return string representation of a set. This looks like 'Set([])'. """ return self._repr() # __str__ is the same as __repr__ __str__ = __repr__ def _repr(self, sorted=False): elements = self._data.keys() if sorted: elements.sort() return '%s(%r)' % (self.__class__.__name__, elements) def __iter__(self): """Return an iterator over the elements or a set. This is the keys iterator for the underlying dict. """ return self._data.iterkeys() # Equality comparisons using the underlying dicts def __eq__(self, other): self._binary_sanity_check(other) return self._data == other._data def __ne__(self, other): self._binary_sanity_check(other) return self._data != other._data # Copying operations def copy(self): """Return a shallow copy of a set.""" result = self.__class__() result._data.update(self._data) return result __copy__ = copy # For the copy module def __deepcopy__(self, memo): """Return a deep copy of a set; used by copy module.""" # This pre-creates the result and inserts it in the memo # early, in case the deep copy recurses into another reference # to this same set. A set can't be an element of itself, but # it can certainly contain an object that has a reference to # itself. from copy import deepcopy result = self.__class__() memo[id(self)] = result data = result._data value = True for elt in self: data[deepcopy(elt, memo)] = value return result # Standard set operations: union, intersection, both differences. # Each has an operator version (e.g. __or__, invoked with |) and a # method version (e.g. union). # Subtle: Each pair requires distinct code so that the outcome is # correct when the type of other isn't suitable. 
For example, if # we did "union = __or__" instead, then Set().union(3) would return # NotImplemented instead of raising TypeError (albeit that *why* it # raises TypeError as-is is also a bit subtle). def __or__(self, other): """Return the union of two sets as a new set. (I.e. all elements that are in either set.) """ if not isinstance(other, BaseSet): return NotImplemented result = self.__class__() result._data = self._data.copy() result._data.update(other._data) return result def union(self, other): """Return the union of two sets as a new set. (I.e. all elements that are in either set.) """ return self | other def __and__(self, other): """Return the intersection of two sets as a new set. (I.e. all elements that are in both sets.) """ if not isinstance(other, BaseSet): return NotImplemented if len(self) <= len(other): little, big = self, other else: little, big = other, self common = filter(big._data.has_key, little._data.iterkeys()) return self.__class__(common) def intersection(self, other): """Return the intersection of two sets as a new set. (I.e. all elements that are in both sets.) """ return self & other def __xor__(self, other): """Return the symmetric difference of two sets as a new set. (I.e. all elements that are in exactly one of the sets.) """ if not isinstance(other, BaseSet): return NotImplemented result = self.__class__() data = result._data value = True selfdata = self._data otherdata = other._data for elt in selfdata: if elt not in otherdata: data[elt] = value for elt in otherdata: if elt not in selfdata: data[elt] = value return result def symmetric_difference(self, other): """Return the symmetric difference of two sets as a new set. (I.e. all elements that are in exactly one of the sets.) """ return self ^ other def __sub__(self, other): """Return the difference of two sets as a new Set. (I.e. all elements that are in this set and not in the other.) 
""" if not isinstance(other, BaseSet): return NotImplemented result = self.__class__() data = result._data otherdata = other._data value = True for elt in self: if elt not in otherdata: data[elt] = value return result def difference(self, other): """Return the difference of two sets as a new Set. (I.e. all elements that are in this set and not in the other.) """ return self - other # Membership test def __contains__(self, element): """Report whether an element is a member of a set. (Called in response to the expression `element in self'.) """ try: return element in self._data except TypeError: transform = getattr(element, "_as_temporarily_immutable", None) if transform is None: raise # re-raise the TypeError exception we caught return transform() in self._data # Subset and superset test def issubset(self, other): """Report whether another set contains this set.""" self._binary_sanity_check(other) if len(self) > len(other): # Fast check for obvious cases return False otherdata = other._data for elt in self: if elt not in otherdata: return False return True def issuperset(self, other): """Report whether this set contains another set.""" self._binary_sanity_check(other) if len(self) < len(other): # Fast check for obvious cases return False selfdata = self._data for elt in other: if elt not in selfdata: return False return True # Inequality comparisons using the is-subset relation. __le__ = issubset __ge__ = issuperset def __lt__(self, other): self._binary_sanity_check(other) return len(self) < len(other) and self.issubset(other) def __gt__(self, other): self._binary_sanity_check(other) return len(self) > len(other) and self.issuperset(other) # Assorted helpers def _binary_sanity_check(self, other): # Check that the other argument to a binary operation is also # a set, raising a TypeError otherwise. 
if not isinstance(other, BaseSet): raise TypeError, "Binary operation only permitted between sets" def _compute_hash(self): # Calculate hash code for a set by xor'ing the hash codes of # the elements. This ensures that the hash code does not depend # on the order in which elements are added to the set. This is # not called __hash__ because a BaseSet should not be hashable; # only an ImmutableSet is hashable. result = 0 for elt in self: result ^= hash(elt) return result def _update(self, iterable): # The main loop for update() and the subclass __init__() methods. data = self._data # Use the fast update() method when a dictionary is available. if isinstance(iterable, BaseSet): data.update(iterable._data) return if isinstance(iterable, dict): data.update(iterable) return value = True it = iter(iterable) while True: try: for element in it: data[element] = value return except TypeError: transform = getattr(element, "_as_immutable", None) if transform is None: raise # re-raise the TypeError exception we caught data[transform()] = value class ImmutableSet(BaseSet): """Immutable set class.""" __slots__ = ['_hashcode'] # BaseSet + hashing def __init__(self, iterable=None): """Construct an immutable set from an optional iterable.""" self._hashcode = None self._data = {} if iterable is not None: self._update(iterable) def __hash__(self): if self._hashcode is None: self._hashcode = self._compute_hash() return self._hashcode class Set(BaseSet): """ Mutable set class.""" __slots__ = [] # BaseSet + operations requiring mutability; no hashing def __init__(self, iterable=None): """Construct a set from an optional iterable.""" self._data = {} if iterable is not None: self._update(iterable) def __hash__(self): """A Set cannot be hashed.""" # We inherit object.__hash__, so we must deny this explicitly raise TypeError, "Can't hash a Set, only an ImmutableSet." # In-place union, intersection, differences. 
# Subtle: The xyz_update() functions deliberately return None, # as do all mutating operations on built-in container types. # The __xyz__ spellings have to return self, though. def __ior__(self, other): """Update a set with the union of itself and another.""" self._binary_sanity_check(other) self._data.update(other._data) return self def union_update(self, other): """Update a set with the union of itself and another.""" self |= other def __iand__(self, other): """Update a set with the intersection of itself and another.""" self._binary_sanity_check(other) self._data = (self & other)._data return self def intersection_update(self, other): """Update a set with the intersection of itself and another.""" self &= other def __ixor__(self, other): """Update a set with the symmetric difference of itself and another.""" self._binary_sanity_check(other) data = self._data value = True for elt in other: if elt in data: del data[elt] else: data[elt] = value return self def symmetric_difference_update(self, other): """Update a set with the symmetric difference of itself and another.""" self ^= other def __isub__(self, other): """Remove all elements of another set from this set.""" self._binary_sanity_check(other) data = self._data for elt in other: if elt in data: del data[elt] return self def difference_update(self, other): """Remove all elements of another set from this set.""" self -= other # Python dict-like mass mutations: update, clear def update(self, iterable): """Add all values from an iterable (such as a list or file).""" self._update(iterable) def clear(self): """Remove all elements from this set.""" self._data.clear() # Single-element mutations: add, remove, discard def add(self, element): """Add an element to a set. This has no effect if the element is already present. 
""" try: self._data[element] = True except TypeError: transform = getattr(element, "_as_immutable", None) if transform is None: raise # re-raise the TypeError exception we caught self._data[transform()] = True def remove(self, element): """Remove an element from a set; it must be a member. If the element is not a member, raise a KeyError. """ try: del self._data[element] except TypeError: transform = getattr(element, "_as_temporarily_immutable", None) if transform is None: raise # re-raise the TypeError exception we caught del self._data[transform()] def discard(self, element): """Remove an element from a set if it is a member. If the element is not a member, do nothing. """ try: self.remove(element) except KeyError: pass def pop(self): """Remove and return an arbitrary set element.""" return self._data.popitem()[0] def _as_immutable(self): # Return a copy of self as an immutable set return ImmutableSet(self) def _as_temporarily_immutable(self): # Return self wrapped in a temporarily immutable set return _TemporarilyImmutableSet(self) class _TemporarilyImmutableSet(BaseSet): # Wrap a mutable set as if it was temporarily immutable. # This only supplies hashing and equality comparisons. def __init__(self, set): self._set = set self._data = set._data # Needed by ImmutableSet.__eq__() def __hash__(self): return self._set._compute_hash() Index: Options.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/spambayes/Attic/Options.py,v retrieving revision 1.1.2.1 retrieving revision 1.1.2.2 diff -C2 -d -r1.1.2.1 -r1.1.2.2 *** Options.py 10 Jan 2003 10:41:08 -0000 1.1.2.1 --- Options.py 13 Jan 2003 06:45:40 -0000 1.1.2.2 *************** *** 7,11 **** import StringIO import ConfigParser ! from sets import Set try: --- 7,15 ---- import StringIO import ConfigParser ! try: ! from sets import Set ! except ImportError: ! from spambayes.compatsets import Set ! 
try: Index: TestDriver.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/spambayes/Attic/TestDriver.py,v retrieving revision 1.1.2.1 retrieving revision 1.1.2.2 diff -C2 -d -r1.1.2.1 -r1.1.2.2 *** TestDriver.py 10 Jan 2003 10:41:08 -0000 1.1.2.1 --- TestDriver.py 13 Jan 2003 06:45:40 -0000 1.1.2.2 *************** *** 22,28 **** # alldone() ! from sets import Set import cPickle as pickle ! from heapq import heapreplace from spambayes.Options import options --- 22,36 ---- # alldone() ! try: ! from sets import Set ! except ImportError: ! from spambayes.compatsets import Set ! import cPickle as pickle ! ! try: ! from heapq import heapreplace ! except ImportError: ! from spambayes.compatheapq import heapreplace from spambayes.Options import options Index: classifier.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/spambayes/Attic/classifier.py,v retrieving revision 1.1.2.1 retrieving revision 1.1.2.2 diff -C2 -d -r1.1.2.1 -r1.1.2.2 *** classifier.py 10 Jan 2003 10:41:08 -0000 1.1.2.1 --- classifier.py 13 Jan 2003 06:45:40 -0000 1.1.2.2 *************** *** 33,37 **** import math ! from sets import Set from spambayes.Options import options --- 33,40 ---- import math ! try: ! from sets import Set ! except ImportError: ! from spambayes.compatsets import Set from spambayes.Options import options Index: tokenizer.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/spambayes/Attic/tokenizer.py,v retrieving revision 1.1.2.1 retrieving revision 1.1.2.2 diff -C2 -d -r1.1.2.1 -r1.1.2.2 *** tokenizer.py 10 Jan 2003 10:41:08 -0000 1.1.2.1 --- tokenizer.py 13 Jan 2003 06:45:40 -0000 1.1.2.2 *************** *** 13,17 **** import time import os ! from sets import Set from spambayes.Options import options --- 13,21 ---- import time import os ! try: ! from sets import Set ! except ImportError: ! 
from spambayes.compatsets import Set ! from spambayes.Options import options From anthonybaxter at users.sourceforge.net Sun Jan 12 22:48:23 2003 From: anthonybaxter at users.sourceforge.net (Anthony Baxter) Date: Mon Jan 13 01:48:25 2003 Subject: [Spambayes-checkins] spambayes/utilities HistToGNU.py,1.1.2.1,1.1.2.2 mboxcount.py,1.1.2.1,1.1.2.2 split.py,1.1.2.1,1.1.2.2 splitn.py,1.1.2.1,1.1.2.2 splitndirs.py,1.1.2.1,1.1.2.2 Message-ID: Update of /cvsroot/spambayes/spambayes/utilities In directory sc8-pr-cvs1:/tmp/cvs-serv3761 Modified Files: Tag: reorg-branch HistToGNU.py mboxcount.py split.py splitn.py splitndirs.py Log Message: import cleanups. Index: HistToGNU.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/utilities/Attic/HistToGNU.py,v retrieving revision 1.1.2.1 retrieving revision 1.1.2.2 diff -C2 -d -r1.1.2.1 -r1.1.2.2 *** HistToGNU.py 10 Jan 2003 10:41:08 -0000 1.1.2.1 --- HistToGNU.py 13 Jan 2003 06:48:20 -0000 1.1.2.2 *************** *** 21,26 **** dataSetOptions="smooth unique" ! from Options import options ! from TestDriver import Hist import sys --- 21,26 ---- dataSetOptions="smooth unique" ! from spambayes.Options import options ! from spambayes.TestDriver import Hist import sys Index: mboxcount.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/utilities/Attic/mboxcount.py,v retrieving revision 1.1.2.1 retrieving revision 1.1.2.2 diff -C2 -d -r1.1.2.1 -r1.1.2.2 *** mboxcount.py 10 Jan 2003 10:41:08 -0000 1.1.2.1 --- mboxcount.py 13 Jan 2003 06:48:21 -0000 1.1.2.2 *************** *** 34,38 **** import glob ! from mboxutils import get_message try: --- 34,38 ---- import glob ! 
from spambayes.mboxutils import get_message try: Index: split.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/utilities/Attic/split.py,v retrieving revision 1.1.2.1 retrieving revision 1.1.2.2 diff -C2 -d -r1.1.2.1 -r1.1.2.2 *** split.py 10 Jan 2003 10:41:08 -0000 1.1.2.1 --- split.py 13 Jan 2003 06:48:21 -0000 1.1.2.2 *************** *** 32,36 **** import getopt ! import mboxutils program = sys.argv[0] --- 32,36 ---- import getopt ! from spambayes import mboxutils program = sys.argv[0] Index: splitn.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/utilities/Attic/splitn.py,v retrieving revision 1.1.2.1 retrieving revision 1.1.2.2 diff -C2 -d -r1.1.2.1 -r1.1.2.2 *** splitn.py 10 Jan 2003 10:41:08 -0000 1.1.2.1 --- splitn.py 13 Jan 2003 06:48:21 -0000 1.1.2.2 *************** *** 46,50 **** import getopt ! import mboxutils try: --- 46,50 ---- import getopt ! from spambayes import mboxutils try: Index: splitndirs.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/utilities/Attic/splitndirs.py,v retrieving revision 1.1.2.1 retrieving revision 1.1.2.2 diff -C2 -d -r1.1.2.1 -r1.1.2.2 *** splitndirs.py 10 Jan 2003 10:41:08 -0000 1.1.2.1 --- splitndirs.py 13 Jan 2003 06:48:21 -0000 1.1.2.2 *************** *** 53,57 **** import glob ! import mboxutils try: --- 53,57 ---- import glob ! from spambayes import mboxutils try: From anthonybaxter at users.sourceforge.net Sun Jan 12 23:55:29 2003 From: anthonybaxter at users.sourceforge.net (Anthony Baxter) Date: Mon Jan 13 02:55:32 2003 Subject: [Spambayes-checkins] website background.ht,1.5,1.6 Message-ID: Update of /cvsroot/spambayes/website In directory sc8-pr-cvs1:/tmp/cvs-serv31450 Modified Files: background.ht Log Message: some added verbiage. 
Index: background.ht =================================================================== RCS file: /cvsroot/spambayes/website/background.ht,v retrieving revision 1.5 retrieving revision 1.6 diff -C2 -d -r1.5 -r1.6 *** background.ht 13 Jan 2003 04:44:55 -0000 1.5 --- background.ht 13 Jan 2003 07:55:26 -0000 1.6 *************** *** 51,58 ****

    In this plot, you can see that most of the spam gets scores near to ! 1, while most of the ham gets scores near to zero. This is all good. Unfortunately, there's also a significant number of hams that score ! close to 1, and spams close to 0 - which means the system is not just scoring ! them wrong, but it's completely confident in it's (wrong) score.

    Add more here - cancellation disease, fudge factors, &c

    --- 51,58 ----

    In this plot, you can see that most of the spam gets scores near to ! 100, while most of the ham gets scores near to 0. This is all good. Unfortunately, there's also a significant number of hams that score ! close to 100, and spams close to 0 - which means the system is not just scoring ! them wrong, but it's completely confident in its (wrong) score. (Note that the difference isn't as apparent as it could be - it's a logarithmic scale graph!)

    Add more here - cancellation disease, fudge factors, &c

    From anthonybaxter at users.sourceforge.net Mon Jan 13 00:06:21 2003 From: anthonybaxter at users.sourceforge.net (Anthony Baxter) Date: Mon Jan 13 03:06:24 2003 Subject: [Spambayes-checkins] website background.ht,1.6,1.7 Message-ID: Update of /cvsroot/spambayes/website In directory sc8-pr-cvs1:/tmp/cvs-serv2707 Modified Files: background.ht Log Message: copied in gary's note about bayesianess, because the first two people I asked to review the page both said "graham's approach isn't bayesian" Index: background.ht =================================================================== RCS file: /cvsroot/spambayes/website/background.ht,v retrieving revision 1.6 retrieving revision 1.7 diff -C2 -d -r1.6 -r1.7 *** background.ht 13 Jan 2003 07:55:26 -0000 1.6 --- background.ht 13 Jan 2003 08:06:18 -0000 1.7 *************** *** 34,40 **** is where the hairy mathematics and statistics come in.

    Initially we started with Paul Graham's original combining scheme - ! a "Naive Bayes" scheme, of sorts - this has a number of "magic numbers" and "fuzz factors" built into it. ! The Graham combining scheme has a number of problems, aside from the magic in the internal fudge factors - it tends to produce scores of either 1 or 0, and there's a very small middle ground in between - it --- 34,46 ---- is where the hairy mathematics and statistics come in.

    Initially we started with Paul Graham's original combining scheme - ! a "Naive Bayes" scheme, of sorts - this has a number of "magic numbers" and "fuzz factors" built into it. !

    Gary's essay, linked above, has this to say on the 'Bayesianess' ! of the original Graham scheme:
    ! ! Paul's approach has become fairly famous for filtering spam in a Bayesian way. That's only true if we make fairly large leaps of the imagination. Originally after reading his essay I thought that it was in no way Bayesian, but I have since noticed that if and only if a particular less-than-optimal assumption is made, part of it could be viewed as Bayesian through a very obscure argument. But it's a pretty remote case for Bayesianness. In any case, there's no need to dwell on Bayesianness or non-Bayesianness; we have bigger fish to fry. (Note: Tim Peters of spambayes fame has posted another way of looking at Paul's approach as Bayesian, although to do so he needs to make the unrealistic assumption that spams and non-spams are equally likely.) !

    ! !

    The Graham combining scheme has a number of problems, aside from the magic in the internal fudge factors - it tends to produce scores of either 1 or 0, and there's a very small middle ground in between - it From anthonybaxter at users.sourceforge.net Mon Jan 13 00:09:01 2003 From: anthonybaxter at users.sourceforge.net (Anthony Baxter) Date: Mon Jan 13 03:09:03 2003 Subject: [Spambayes-checkins] website related.ht,1.6,1.7 background.ht,1.7,1.8 Message-ID: Update of /cvsroot/spambayes/website In directory sc8-pr-cvs1:/tmp/cvs-serv3401 Modified Files: related.ht background.ht Log Message: change to a clickable (and harvestable) email address. I fear no spammer :) Index: related.ht =================================================================== RCS file: /cvsroot/spambayes/website/related.ht,v retrieving revision 1.6 retrieving revision 1.7 diff -C2 -d -r1.6 -r1.7 *** related.ht 9 Jan 2003 02:13:21 -0000 1.6 --- related.ht 13 Jan 2003 08:08:58 -0000 1.7 *************** *** 18,20 **** !

    (got more? email anthony at interlink.com.au and I'll add links, or correct descriptions.) --- 18,20 ---- !

    (got more? email anthony@interlink.com.au and I'll add links, or correct descriptions.) Index: background.ht =================================================================== RCS file: /cvsroot/spambayes/website/background.ht,v retrieving revision 1.7 retrieving revision 1.8 diff -C2 -d -r1.7 -r1.8 *** background.ht 13 Jan 2003 08:06:18 -0000 1.7 --- background.ht 13 Jan 2003 08:08:58 -0000 1.8 *************** *** 14,18 **** suggesting some improvements to Graham's original approach. !

    more links? mail anthony at interlink.com.au

    Overall Approach

    --- 14,19 ---- suggesting some improvements to Graham's original approach. !

    more links? mail ! anthony@interlink.com.au

    Overall Approach

    From anthonybaxter at users.sourceforge.net Mon Jan 13 21:34:58 2003 From: anthonybaxter at users.sourceforge.net (Anthony Baxter) Date: Tue Jan 14 00:35:04 2003 Subject: [Spambayes-checkins] spambayes/spambayes .cvsignore,NONE,1.1.2.1 Message-ID: Update of /cvsroot/spambayes/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv32276 Added Files: Tag: reorg-branch .cvsignore Log Message: --- NEW FILE: .cvsignore --- *.pyc From anthonybaxter at users.sourceforge.net Mon Jan 13 21:35:22 2003 From: anthonybaxter at users.sourceforge.net (Anthony Baxter) Date: Tue Jan 14 00:35:25 2003 Subject: [Spambayes-checkins] spambayes/spambayes __init__.py,1.1.2.1,1.1.2.2 Message-ID: Update of /cvsroot/spambayes/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv32378/spambayes Modified Files: Tag: reorg-branch __init__.py Log Message: final bits of setup.py tweaking. Index: __init__.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/spambayes/Attic/__init__.py,v retrieving revision 1.1.2.1 retrieving revision 1.1.2.2 diff -C2 -d -r1.1.2.1 -r1.1.2.2 *** __init__.py 10 Jan 2003 10:41:08 -0000 1.1.2.1 --- __init__.py 14 Jan 2003 05:35:19 -0000 1.1.2.2 *************** *** 1 **** --- 1,3 ---- # package marker. + + __version__ = '1.0a0' From anthonybaxter at users.sourceforge.net Mon Jan 13 21:35:21 2003 From: anthonybaxter at users.sourceforge.net (Anthony Baxter) Date: Tue Jan 14 00:35:26 2003 Subject: [Spambayes-checkins] spambayes setup.py,1.10.2.2,1.10.2.3 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv32378 Modified Files: Tag: reorg-branch setup.py Log Message: final bits of setup.py tweaking. 
Index: setup.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/setup.py,v retrieving revision 1.10.2.2 retrieving revision 1.10.2.3 diff -C2 -d -r1.10.2.2 -r1.10.2.3 *** setup.py 13 Jan 2003 06:45:40 -0000 1.10.2.2 --- setup.py 14 Jan 2003 05:35:19 -0000 1.10.2.3 *************** *** 1,13 **** from distutils.core import setup ! import sys, email if email.__version__ < '2.4.3': print "Error: email package version < 2.4.3 found - need newer version" ! print "See INTEGRATION.txt for download information" sys.exit(0) setup( name='spambayes', scripts=['unheader.py', 'hammie.py', --- 1,35 ---- + #!/usr/bin/env python + + import sys + if sys.version < '2.2': + print "Error: Python version too old. You need at least Python 2.2 to use this package." + print "(you're running version %s)"%sys.version + sys.exit(0) + + # Install from distutils.core import setup ! import email if email.__version__ < '2.4.3': print "Error: email package version < 2.4.3 found - need newer version" ! print "See INTEGRATION.txt for download information for email package" sys.exit(0) + # patch distutils if it can't cope with the "classifiers" keyword. + # this just makes it ignore it. + if sys.version < '2.2.3': + from distutils.dist import DistributionMetadata + DistributionMetadata.classifiers = None + + + from spambayes import __version__ setup( name='spambayes', + version = __version__, + description = "Spam classification system", + author = "the spambayes project", + author_email = "spambayes@python.org", + url = "http://spambayes.sourceforge.net", scripts=['unheader.py', 'hammie.py', *************** *** 18,22 **** 'pop3proxy.py', ], ! ! packages = [ 'spambayes', ] ) --- 40,56 ---- 'pop3proxy.py', ], ! packages = [ 'spambayes', ], ! classifiers = [ ! 'Development Status :: 4 - Beta', ! 'Environment :: Console', ! 'License :: OSI Approved :: Python Software Foundation License', ! 'Operating System :: POSIX', ! 
'Operating System :: MacOS :: MacOS X', ! 'Operating System :: Microsoft :: Windows :: Windows 95/98/2000', ! 'Operating System :: Microsoft :: Windows :: Windows NT/2000', ! 'Programming Language :: Python', ! 'Intended Audience :: End Users/Desktop', ! 'Topic :: Communications :: Email :: Filters', ! 'Topic :: Communications :: Email :: Post-Office :: POP3', ! ] ) From anthonybaxter at users.sourceforge.net Mon Jan 13 21:38:22 2003 From: anthonybaxter at users.sourceforge.net (Anthony Baxter) Date: Tue Jan 14 00:38:26 2003 Subject: [Spambayes-checkins] spambayes/pspam/pspam options.py,1.2,1.3 profile.py,1.6,1.7 Message-ID: Update of /cvsroot/spambayes/spambayes/pspam/pspam In directory sc8-pr-cvs1:/tmp/cvs-serv389/pspam/pspam Modified Files: options.py profile.py Log Message: merging reorg-branch into the trunk. Index: options.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/pspam/pspam/options.py,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** options.py 12 Nov 2002 07:03:20 -0000 1.2 --- options.py 14 Jan 2003 05:38:20 -0000 1.3 *************** *** 1,5 **** from Options import options, all_options, \ boolean_cracker, float_cracker, int_cracker, string_cracker ! from sets import Set all_options["Score"] = {'max_ham': float_cracker, --- 1,8 ---- from Options import options, all_options, \ boolean_cracker, float_cracker, int_cracker, string_cracker ! try: ! from sets import Set ! except ImportError: ! from spambayes.compatsets import Set all_options["Score"] = {'max_ham': float_cracker, Index: profile.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/pspam/pspam/profile.py,v retrieving revision 1.6 retrieving revision 1.7 diff -C2 -d -r1.6 -r1.7 *** profile.py 26 Nov 2002 17:16:35 -0000 1.6 --- profile.py 14 Jan 2003 05:38:20 -0000 1.7 *************** *** 6,11 **** from BTrees.OOBTree import OOBTree ! 
import classifier ! from tokenizer import tokenize from pspam.folder import Folder --- 6,11 ---- from BTrees.OOBTree import OOBTree ! from spambayes import classifier ! from spambayes.tokenizer import tokenize from pspam.folder import Folder From anthonybaxter at users.sourceforge.net Mon Jan 13 21:38:22 2003 From: anthonybaxter at users.sourceforge.net (Anthony Baxter) Date: Tue Jan 14 00:38:28 2003 Subject: [Spambayes-checkins] spambayes/Outlook2000 manager.py,1.40,1.41 msgstore.py,1.36,1.37 train.py,1.22,1.23 Message-ID: Update of /cvsroot/spambayes/spambayes/Outlook2000 In directory sc8-pr-cvs1:/tmp/cvs-serv389/Outlook2000 Modified Files: manager.py msgstore.py train.py Log Message: merging reorg-branch into the trunk. Index: manager.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/manager.py,v retrieving revision 1.40 retrieving revision 1.41 diff -C2 -d -r1.40 -r1.41 *** manager.py 10 Jan 2003 00:33:10 -0000 1.40 --- manager.py 14 Jan 2003 05:38:19 -0000 1.41 *************** *** 44,48 **** os.environ["BAYESCUSTOMIZE"] = ini_filename try: ! import classifier except ImportError: parent = os.path.abspath(os.path.join(os.path.dirname(this_filename), --- 44,48 ---- os.environ["BAYESCUSTOMIZE"] = ini_filename try: ! from spambayes import classifier except ImportError: parent = os.path.abspath(os.path.join(os.path.dirname(this_filename), *************** *** 50,55 **** sys.path.insert(0, parent) ! import classifier ! from tokenizer import tokenize bayes_classifier = classifier bayes_tokenize = tokenize --- 50,55 ---- sys.path.insert(0, parent) ! from spambayes import classifier ! 
from spambayes.tokenizer import tokenize bayes_classifier = classifier bayes_tokenize = tokenize Index: msgstore.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/msgstore.py,v retrieving revision 1.36 retrieving revision 1.37 diff -C2 -d -r1.36 -r1.37 *** msgstore.py 25 Nov 2002 05:57:41 -0000 1.36 --- msgstore.py 14 Jan 2003 05:38:20 -0000 1.37 *************** *** 431,435 **** # Note we *dont* look in plain text attachments, which we arguably # should. ! import mboxutils self._EnsureObject() --- 431,435 ---- # Note we *dont* look in plain text attachments, which we arguably # should. ! from spambayes import mboxutils self._EnsureObject() Index: train.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/train.py,v retrieving revision 1.22 retrieving revision 1.23 diff -C2 -d -r1.22 -r1.23 *** train.py 16 Dec 2002 04:12:00 -0000 1.22 --- train.py 14 Jan 2003 05:38:20 -0000 1.23 *************** *** 34,38 **** # If re-classified AND rescore = True, then a new score will # be written to the message (so the user can see some effects) ! from tokenizer import tokenize was_spam = mgr.message_db.get(msg.searchkey) --- 34,38 ---- # If re-classified AND rescore = True, then a new score will # be written to the message (so the user can see some effects) ! from spambayes.tokenizer import tokenize was_spam = mgr.message_db.get(msg.searchkey) *************** *** 63,67 **** # False == was_ham def untrain_message(msg, mgr): ! from tokenizer import tokenize stream = msg.GetEmailPackageObject() if been_trained_as_spam(msg, mgr): --- 63,67 ---- # False == was_ham def untrain_message(msg, mgr): ! 
from spambayes.tokenizer import tokenize stream = msg.GetEmailPackageObject() if been_trained_as_spam(msg, mgr): From anthonybaxter at users.sourceforge.net Mon Jan 13 21:38:22 2003 From: anthonybaxter at users.sourceforge.net (Anthony Baxter) Date: Tue Jan 14 00:38:30 2003 Subject: [Spambayes-checkins] spambayes/spambayes .cvsignore,1.1,1.2 Corpus.py,1.1,1.2 CostCounter.py,1.1,1.2 FileCorpus.py,1.1,1.2 Histogram.py,1.1,1.2 Options.py,1.1,1.2 TestDriver.py,1.1,1.2 Tester.py,1.1,1.2 __init__.py,1.1,1.2 cdb.py,1.1,1.2 chi2.py,1.1,1.2 classifier.py,1.1,1.2 compatheapq.py,1.1,1.2 compatsets.py,1.1,1.2 dbmstorage.py,1.1,1.2 hammie.py,1.1,1.2 hammiebulk.py,1.1,1.2 mboxutils.py,1.1,1.2 msgs.py,1.1,1.2 optimize.py,1.1,1.2 storage.py,1.1,1.2 tokenizer.py,1.1,1.2 Message-ID: Update of /cvsroot/spambayes/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv389/spambayes Added Files: .cvsignore Corpus.py CostCounter.py FileCorpus.py Histogram.py Options.py TestDriver.py Tester.py __init__.py cdb.py chi2.py classifier.py compatheapq.py compatsets.py dbmstorage.py hammie.py hammiebulk.py mboxutils.py msgs.py optimize.py storage.py tokenizer.py Log Message: merging reorg-branch into the trunk. 
From anthonybaxter at users.sourceforge.net Mon Jan 13 21:38:22 2003 From: anthonybaxter at users.sourceforge.net (Anthony Baxter) Date: Tue Jan 14 00:38:32 2003 Subject: [Spambayes-checkins] spambayes OptionConfig.py,1.1,1.2 hammie.py,1.45,1.46 hammiefilter.py,1.5,1.6 hammiesrv.py,1.10,1.11 mailsort.py,1.1,1.2 mboxtrain.py,1.2,1.3 pop3graph.py,1.1,1.2 pop3proxy.py,1.32,1.33 setup.py,1.10,1.11 unheader.py,1.8,1.9 Corpus.py,1.9,NONE CostCounter.py,1.5,NONE FileCorpus.py,1.9,NONE HistToGNU.py,1.7,NONE Histogram.py,1.7,NONE Options.py,1.80,NONE TestDriver.py,1.31,NONE Tester.py,1.9,NONE cdb.py,1.4,NONE chi2.py,1.8,NONE classifier.py,1.62,NONE cmp.py,1.17,NONE dbmstorage.py,1.1,NONE fpfn.py,1.1,NONE hammiebulk.py,1.6,NONE heapq.py,1.1,NONE loosecksum.py,1.4,NONE mboxcount.py,1.3,NONE mboxtest.py,1.11,NONE mboxutils.py,1.7,NONE msgs.py,1.6,NONE optimize.py,1.2,NONE rates.py,1.8,NONE rebal.py,1.9,NONE sets.py,1.2,NONE simplexloop.py,1.2,NONE split.py,1.2,NONE splitn.py,1.4,NONE splitndirs.py,1.7,NONE storage.py,1.8,NONE table.py,1.5,NONE timcv.py,1.12,NONE timtest.py,1.30,NONE tokenizer.py,1.72,NONE weaktest.py,1.6,NONE Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv389 Modified Files: OptionConfig.py hammie.py hammiefilter.py hammiesrv.py mailsort.py mboxtrain.py pop3graph.py pop3proxy.py setup.py unheader.py Removed Files: Corpus.py CostCounter.py FileCorpus.py HistToGNU.py Histogram.py Options.py TestDriver.py Tester.py cdb.py chi2.py classifier.py cmp.py dbmstorage.py fpfn.py hammiebulk.py heapq.py loosecksum.py mboxcount.py mboxtest.py mboxutils.py msgs.py optimize.py rates.py rebal.py sets.py simplexloop.py split.py splitn.py splitndirs.py storage.py table.py timcv.py timtest.py tokenizer.py weaktest.py Log Message: merging reorg-branch into the trunk. 
Index: OptionConfig.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/OptionConfig.py,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** OptionConfig.py 1 Dec 2002 04:11:39 -0000 1.1 --- OptionConfig.py 14 Jan 2003 05:38:18 -0000 1.2 *************** *** 30,34 **** import SmarterHTTPServer import BaseHTTPServer ! from Options import options import re from cStringIO import StringIO --- 30,34 ---- import SmarterHTTPServer import BaseHTTPServer ! from spambayes.Options import options import re from cStringIO import StringIO Index: hammie.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/hammie.py,v retrieving revision 1.45 retrieving revision 1.46 diff -C2 -d -r1.45 -r1.46 *** hammie.py 2 Dec 2002 04:43:37 -0000 1.45 --- hammie.py 14 Jan 2003 05:38:19 -0000 1.46 *************** *** 1,212 **** ! #! /usr/bin/env python ! ! ! import mboxutils ! import storage ! from Options import options ! from tokenizer import tokenize ! ! try: ! True, False ! except NameError: ! # Maintain compatibility with Python 2.2 ! True, False = 1, 0 ! ! ! class Hammie: ! """A spambayes mail filter. ! ! This implements the basic functionality needed to score, filter, or ! train. ! ! """ ! ! def __init__(self, bayes): ! self.bayes = bayes ! ! def _scoremsg(self, msg, evidence=False): ! """Score a Message. ! ! msg can be a string, a file object, or a Message object. ! ! Returns the probability the message is spam. If evidence is ! true, returns a tuple: (probability, clues), where clues is a ! list of the words which contributed to the score. ! ! """ ! ! return self.bayes.spamprob(tokenize(msg), evidence) ! ! def formatclues(self, clues, sep="; "): ! """Format the clues into something readable.""" ! ! return sep.join(["%r: %.2f" % (word, prob) ! for word, prob in clues ! if (word[0] == '*' or ! prob <= options.clue_mailheader_cutoff or ! 
prob >= 1.0 - options.clue_mailheader_cutoff)]) ! ! def score(self, msg, evidence=False): ! """Score (judge) a message. ! ! msg can be a string, a file object, or a Message object. ! ! Returns the probability the message is spam. If evidence is ! true, returns a tuple: (probability, clues), where clues is a ! list of the words which contributed to the score. ! ! """ ! ! return self._scoremsg(msg, evidence) ! ! def filter(self, msg, header=None, spam_cutoff=None, ! ham_cutoff=None, debugheader=None, ! debug=None): ! """Score (judge) a message and add a disposition header. ! ! msg can be a string, a file object, or a Message object. ! ! Optionally, set header to the name of the header to add, and/or ! spam_cutoff/ham_cutoff to the probability values which must be met ! or exceeded for a message to get a 'Spam' or 'Ham' classification. ! ! An extra debugging header can be added if 'debug' is set to True. ! The name of the debugging header is given as 'debugheader'. ! ! All defaults for optional parameters come from the Options file. ! ! Returns the same message with a new disposition header. ! ! """ ! ! if header == None: ! header = options.hammie_header_name ! if spam_cutoff == None: ! spam_cutoff = options.spam_cutoff ! if ham_cutoff == None: ! ham_cutoff = options.ham_cutoff ! if debugheader == None: ! debugheader = options.hammie_debug_header_name ! if debug == None: ! debug = options.hammie_debug_header ! ! msg = mboxutils.get_message(msg) ! try: ! del msg[header] ! except KeyError: ! pass ! prob, clues = self._scoremsg(msg, True) ! if prob < ham_cutoff: ! disp = options.header_ham_string ! elif prob > spam_cutoff: ! disp = options.header_spam_string ! else: ! disp = options.header_unsure_string ! disp += ("; %."+str(options.header_score_digits)+"f") % prob ! if options.header_score_logarithm: ! if prob<=0.005 and prob>0.0: ! import math ! x=-math.log10(prob) ! disp += " (%d)"%x ! if prob>=0.995 and prob<1.0: ! import math ! x=-math.log10(1.0-prob) ! 
disp += " (%d)"%x ! msg.add_header(header, disp) ! if debug: ! disp = self.formatclues(clues) ! msg.add_header(debugheader, disp) ! return msg.as_string(unixfrom=(msg.get_unixfrom() is not None)) ! ! def train(self, msg, is_spam): ! """Train bayes with a message. ! ! msg can be a string, a file object, or a Message object. ! ! is_spam should be 1 if the message is spam, 0 if not. ! ! """ ! ! self.bayes.learn(tokenize(msg), is_spam) ! ! def untrain(self, msg, is_spam): ! """Untrain bayes with a message. ! ! msg can be a string, a file object, or a Message object. ! ! is_spam should be 1 if the message is spam, 0 if not. ! ! """ ! ! self.bayes.unlearn(tokenize(msg), is_spam) ! ! def train_ham(self, msg): ! """Train bayes with ham. ! ! msg can be a string, a file object, or a Message object. ! ! """ ! ! self.train(msg, False) ! ! def train_spam(self, msg): ! """Train bayes with spam. ! ! msg can be a string, a file object, or a Message object. ! ! """ ! ! self.train(msg, True) ! ! def untrain_ham(self, msg): ! """Untrain bayes with ham. ! ! msg can be a string, a file object, or a Message object. ! ! """ ! ! self.untrain(msg, False) ! ! def train_spam(self, msg): ! """Untrain bayes with spam. ! ! msg can be a string, a file object, or a Message object. ! ! """ ! ! self.untrain(msg, True) ! ! def store(self): ! """Write out the persistent store. ! ! This makes sure the persistent store reflects what is currently ! in memory. You would want to do this after a write and before ! exiting. ! ! """ ! ! self.bayes.store() ! ! ! def open(filename, usedb=True, mode='r'): ! """Open a file, returning a Hammie instance. ! ! If usedb is False, open as a pickle instead of a DBDict. mode is ! ! used as the flag to open DBDict objects. 'c' for read-write (create ! if needed), 'r' for read-only, 'w' for read-write. ! ! """ ! ! if usedb: ! b = storage.DBDictClassifier(filename, mode) ! else: ! b = storage.PickledClassifier(filename) ! return Hammie(b) ! if __name__ == "__main__": ! 
# Everybody's used to running hammie.py. Why mess with success? ;) ! import hammiebulk ! ! hammiebulk.main() --- 1,5 ---- ! #!/usr/bin/env python if __name__ == "__main__": ! import spambayes.hammiebulk ! spambayes.hammiebulk.main() Index: hammiefilter.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/hammiefilter.py,v retrieving revision 1.5 retrieving revision 1.6 diff -C2 -d -r1.5 -r1.6 *** hammiefilter.py 2 Dec 2002 06:02:34 -0000 1.5 --- hammiefilter.py 14 Jan 2003 05:38:19 -0000 1.6 *************** *** 40,46 **** import sys import getopt ! import hammie ! import Options ! import StringIO # See Options.py for explanations of these properties --- 40,44 ---- import sys import getopt ! from spambayes import hammie, Options, StringIO # See Options.py for explanations of these properties Index: hammiesrv.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/hammiesrv.py,v retrieving revision 1.10 retrieving revision 1.11 diff -C2 -d -r1.10 -r1.11 *** hammiesrv.py 7 Nov 2002 22:30:06 -0000 1.10 --- hammiesrv.py 14 Jan 2003 05:38:19 -0000 1.11 *************** *** 28,32 **** import traceback import xmlrpclib ! import hammie try: --- 28,32 ---- import traceback import xmlrpclib ! from spambayes import hammie try: Index: mailsort.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/mailsort.py,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** mailsort.py 29 Nov 2002 00:57:23 -0000 1.1 --- mailsort.py 14 Jan 2003 05:38:19 -0000 1.2 *************** *** 23,31 **** import socket import email ! import mboxutils ! import cdb ! from tokenizer import tokenize ! import classifier --- 23,31 ---- import socket import email ! from spambayes import mboxutils ! from spambayes import cdb ! from spambayes.tokenizer import tokenize ! 
from spambayes import classifier Index: mboxtrain.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/mboxtrain.py,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** mboxtrain.py 11 Dec 2002 16:21:15 -0000 1.2 --- mboxtrain.py 14 Jan 2003 05:38:19 -0000 1.3 *************** *** 35,43 **** """ ! import mboxutils ! import getopt ! import hammie ! import sys ! import os program = sys.argv[0] --- 35,40 ---- """ ! import sys, os, getopt ! from spambayes import hammie, mboxutils program = sys.argv[0] Index: pop3graph.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/pop3graph.py,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** pop3graph.py 20 Nov 2002 12:30:16 -0000 1.1 --- pop3graph.py 14 Jan 2003 05:38:19 -0000 1.2 *************** *** 5,11 **** from __future__ import division ! import sys, mboxutils ! from FileCorpus import FileCorpus, FileMessageFactory, GzipFileMessageFactory ! from Options import options def main(): --- 5,12 ---- from __future__ import division ! import sys ! from spambayes import mboxutils ! from spambayes.FileCorpus import FileCorpus, FileMessageFactory, GzipFileMessageFactory ! from spambayes.Options import options def main(): Index: pop3proxy.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/pop3proxy.py,v retrieving revision 1.32 retrieving revision 1.33 diff -C2 -d -r1.32 -r1.33 *** pop3proxy.py 3 Dec 2002 21:22:22 -0000 1.32 --- pop3proxy.py 14 Jan 2003 05:38:19 -0000 1.33 *************** *** 141,149 **** import os, sys, re, operator, errno, getopt, string, cStringIO, time, bisect import socket, asyncore, asynchat, cgi, urlparse, webbrowser ! import mailbox, storage, tokenizer, mboxutils, email.Header ! from FileCorpus import FileCorpus, ExpiryFileCorpus ! 
from FileCorpus import FileMessageFactory, GzipFileMessageFactory from email.Iterators import typed_subpart_iterator ! from Options import options # HEADER_EXAMPLE is the longest possible header - the length of this one --- 141,150 ---- import os, sys, re, operator, errno, getopt, string, cStringIO, time, bisect import socket, asyncore, asynchat, cgi, urlparse, webbrowser ! import mailbox, email.Header ! from spambayes import storage, tokenizer, mboxutils ! from spambayes.FileCorpus import FileCorpus, ExpiryFileCorpus ! from spambayes.FileCorpus import FileMessageFactory, GzipFileMessageFactory from email.Iterators import typed_subpart_iterator ! from spambayes.Options import options # HEADER_EXAMPLE is the longest possible header - the length of this one Index: setup.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/setup.py,v retrieving revision 1.10 retrieving revision 1.11 diff -C2 -d -r1.10 -r1.11 *** setup.py 11 Dec 2002 04:56:30 -0000 1.10 --- setup.py 14 Jan 2003 05:38:19 -0000 1.11 *************** *** 1,37 **** from distutils.core import setup setup( name='spambayes', scripts=['unheader.py', 'hammie.py', 'hammiecli.py', 'hammiesrv.py', ! 'loosecksum.py', ! 'timtest.py', ! 'timcv.py', ! 'splitndirs.py', ! 'runtest.sh', ! 'rebal.py', ! 'HistToGNU.py', ! 'mboxcount.py', ! 'mboxtest.py', ! 'cmp.py', ! 'table.py', ! 'rates.py', ], ! py_modules=['classifier', ! 'tokenizer', ! 'hammie', ! 'msgs', ! 'storage', ! 'dbmstorage', ! 'Corpus', ! 'hammiebulk', ! 'chi2', ! 'Histogram', ! 'Options', ! 'Tester', ! 'TestDriver', ! 'mboxutils', ! ] ) --- 1,56 ---- + #!/usr/bin/env python + + import sys + if sys.version < '2.2': + print "Error: Python version too old. You need at least Python 2.2 to use this package." 
+ print "(you're running version %s)"%sys.version + sys.exit(0) + + # Install from distutils.core import setup + import email + if email.__version__ < '2.4.3': + print "Error: email package version < 2.4.3 found - need newer version" + print "See INTEGRATION.txt for download information for email package" + sys.exit(0) + + # patch distutils if it can't cope with the "classifiers" keyword. + # this just makes it ignore it. + if sys.version < '2.2.3': + from distutils.dist import DistributionMetadata + DistributionMetadata.classifiers = None + + + from spambayes import __version__ + setup( name='spambayes', + version = __version__, + description = "Spam classification system", + author = "the spambayes project", + author_email = "spambayes@python.org", + url = "http://spambayes.sourceforge.net", scripts=['unheader.py', 'hammie.py', 'hammiecli.py', 'hammiesrv.py', ! 'hammiefilter.py', ! 'pop3graph.py', ! 'pop3proxy.py', ], ! packages = [ 'spambayes', ], ! classifiers = [ ! 'Development Status :: 4 - Beta', ! 'Environment :: Console', ! 'License :: OSI Approved :: Python Software Foundation License', ! 'Operating System :: POSIX', ! 'Operating System :: MacOS :: MacOS X', ! 'Operating System :: Microsoft :: Windows :: Windows 95/98/2000', ! 'Operating System :: Microsoft :: Windows :: Windows NT/2000', ! 'Programming Language :: Python', ! 'Intended Audience :: End Users/Desktop', ! 'Topic :: Communications :: Email :: Filters', ! 'Topic :: Communications :: Email :: Post-Office :: POP3', ! ] ) Index: unheader.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/unheader.py,v retrieving revision 1.8 retrieving revision 1.9 diff -C2 -d -r1.8 -r1.9 *** unheader.py 27 Oct 2002 22:10:28 -0000 1.8 --- unheader.py 14 Jan 2003 05:38:19 -0000 1.9 *************** *** 1,3 **** --- 1,11 ---- #!/usr/bin/env python + """ + unheader.py: cleans headers from email messages. 
By default, this + removes SpamAssassin headers, specify a pattern with -p to supply + new headers to remove. + + This is often needed because existing spamassassin headers can + provide killer spam clues, for all the wrong reasons. + """ import re --- Corpus.py DELETED --- --- CostCounter.py DELETED --- --- FileCorpus.py DELETED --- --- HistToGNU.py DELETED --- --- Histogram.py DELETED --- --- Options.py DELETED --- --- TestDriver.py DELETED --- --- Tester.py DELETED --- --- cdb.py DELETED --- --- chi2.py DELETED --- --- classifier.py DELETED --- --- cmp.py DELETED --- --- dbmstorage.py DELETED --- --- fpfn.py DELETED --- --- hammiebulk.py DELETED --- --- heapq.py DELETED --- --- loosecksum.py DELETED --- --- mboxcount.py DELETED --- --- mboxtest.py DELETED --- --- mboxutils.py DELETED --- --- msgs.py DELETED --- --- optimize.py DELETED --- --- rates.py DELETED --- --- rebal.py DELETED --- --- sets.py DELETED --- --- simplexloop.py DELETED --- --- split.py DELETED --- --- splitn.py DELETED --- --- splitndirs.py DELETED --- --- storage.py DELETED --- --- table.py DELETED --- --- timcv.py DELETED --- --- timtest.py DELETED --- --- tokenizer.py DELETED --- --- weaktest.py DELETED --- From anthonybaxter at users.sourceforge.net Mon Jan 13 21:48:14 2003 From: anthonybaxter at users.sourceforge.net (Anthony Baxter) Date: Tue Jan 14 00:48:17 2003 Subject: [Spambayes-checkins] spambayes/spambayes Options.py,1.2,1.3 Message-ID: Update of /cvsroot/spambayes/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv3042/spambayes Modified Files: Options.py Log Message: ham_cutoff and spam_cutoff moved. This Will Break Your BayesCustomize.Ini Files. 
Index: Options.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/spambayes/Options.py,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** Options.py 14 Jan 2003 05:38:20 -0000 1.2 --- Options.py 14 Jan 2003 05:48:12 -0000 1.3 *************** *** 133,138 **** replace_nonascii_chars: False ! [TestDriver] ! # These control various displays in class TestDriver.Driver, and Tester.Test. # spam_cutoff and ham_cutoff are used in Python slice sense: --- 133,138 ---- replace_nonascii_chars: False ! [Categorization] ! # These options control how a message is categorized # spam_cutoff and ham_cutoff are used in Python slice sense: *************** *** 160,163 **** --- 160,167 ---- spam_cutoff: 0.90 + [TestDriver] + # These control various displays in class TestDriver.Driver, and Tester.Test. + + # Number of buckets in histograms. nbuckets: 200 *************** *** 409,412 **** --- 413,419 ---- 'replace_nonascii_chars': boolean_cracker, }, + 'Categorization': { 'ham_cutoff': float_cracker, + 'spam_cutoff': float_cracker, + }, 'TestDriver': {'nbuckets': int_cracker, 'show_ham_lo': float_cracker, *************** *** 423,428 **** 'pickle_basename': string_cracker, 'show_charlimit': int_cracker, - 'ham_cutoff': float_cracker, - 'spam_cutoff': float_cracker, 'spam_directories': string_cracker, 'ham_directories': string_cracker, --- 430,433 ---- From montanaro at users.sourceforge.net Tue Jan 14 09:11:12 2003 From: montanaro at users.sourceforge.net (Skip Montanaro) Date: Tue Jan 14 12:11:15 2003 Subject: [Spambayes-checkins] spambayes pop3graph.py,1.2,1.3 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv21813 Modified Files: pop3graph.py Log Message: add #! 
line and grok -h/--help flags Index: pop3graph.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/pop3graph.py,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** pop3graph.py 14 Jan 2003 05:38:19 -0000 1.2 --- pop3graph.py 14 Jan 2003 17:10:57 -0000 1.3 *************** *** 1,2 **** --- 1,4 ---- + #!/usr/bin/env python + """Analyse the pop3proxy's caches and produce a graph of how accurate classifier has been over time. Only really meaningful if you started *************** *** 6,14 **** import sys from spambayes import mboxutils from spambayes.FileCorpus import FileCorpus, FileMessageFactory, GzipFileMessageFactory from spambayes.Options import options ! def main(): # Create the corpuses and the factory that reads the messages. if options.pop3proxy_cache_use_gzip: --- 8,27 ---- import sys + import getopt + from spambayes import mboxutils from spambayes.FileCorpus import FileCorpus, FileMessageFactory, GzipFileMessageFactory from spambayes.Options import options ! def usage(): ! print __doc__ ! ! def main(argv): ! opts, args = getopt.getopt(argv, "h", ["help"]) ! for opt, arg in opts: ! if opt in ("-h", "--help"): ! usage() ! return ! # Create the corpuses and the factory that reads the messages. if options.pop3proxy_cache_use_gzip: *************** *** 73,75 **** if __name__ == '__main__': ! main() --- 86,88 ---- if __name__ == '__main__': ! 
main(sys.argv[1:]) From montanaro at users.sourceforge.net Tue Jan 14 09:13:05 2003 From: montanaro at users.sourceforge.net (Skip Montanaro) Date: Tue Jan 14 12:13:08 2003 Subject: [Spambayes-checkins] spambayes/spambayes hammiebulk.py,1.2,1.3 Message-ID: Update of /cvsroot/spambayes/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv23615 Modified Files: hammiebulk.py Log Message: support untraining and print # of unsures at the end Index: hammiebulk.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/spambayes/hammiebulk.py,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** hammiebulk.py 14 Jan 2003 05:38:20 -0000 1.2 --- hammiebulk.py 14 Jan 2003 17:12:53 -0000 1.3 *************** *** 19,23 **** exists, and saves data to this file at the end. Default: %(DEFAULTDB)s ! -f run as a filter: read a single message from stdin, add a new --- 19,25 ---- exists, and saves data to this file at the end. Default: %(DEFAULTDB)s ! -U ! Untrain instead of train. The interpretation of -g and -s remains ! the same. -f run as a filter: read a single message from stdin, add a new *************** *** 82,85 **** --- 84,98 ---- print + def untrain(h, msgs, is_spam): + """Untrain bayes with all messages from a mailbox.""" + mbox = mboxutils.getmbox(msgs) + i = 0 + for msg in mbox: + i += 1 + sys.stdout.write("\r%6d" % i) + sys.stdout.flush() + h.untrain(msg, is_spam) + print + def score(h, msgs, reverse=0): """Score (judge) all messages from a mailbox.""" *************** *** 87,91 **** mbox = mboxutils.getmbox(msgs) i = 0 ! spams = hams = 0 for msg in mbox: i += 1 --- 100,104 ---- mbox = mboxutils.getmbox(msgs) i = 0 ! 
spams = hams = unsures = 0 for msg in mbox: i += 1 *************** *** 96,99 **** --- 109,113 ---- msgno = i isspam = (prob >= SPAM_THRESHOLD) + isham = (prob <= HAM_THRESHOLD) if isspam: spams += 1 *************** *** 101,110 **** print "%6s %4.2f %1s" % (msgno, prob, isspam and "S" or "."), print h.formatclues(clues) ! else: hams += 1 if reverse: ! print "%6s %4.2f %1s" % (msgno, prob, isspam and "S" or "."), print h.formatclues(clues) ! return (spams, hams) def usage(code, msg=''): --- 115,128 ---- print "%6s %4.2f %1s" % (msgno, prob, isspam and "S" or "."), print h.formatclues(clues) ! elif isham: hams += 1 if reverse: ! print "%6s %4.2f %1s" % (msgno, prob, isham and "S" or "."), print h.formatclues(clues) ! else: ! unsures += 1 ! print "%6s %4.2f U" % (msgno, prob), ! print h.formatclues(clues) ! return (spams, hams, unsures) def usage(code, msg=''): *************** *** 119,123 **** """Main program; parse options and go.""" try: ! opts, args = getopt.getopt(sys.argv[1:], 'hdDfg:s:p:u:r') except getopt.error, msg: usage(2, msg) --- 137,141 ---- """Main program; parse options and go.""" try: ! opts, args = getopt.getopt(sys.argv[1:], 'hdDUfg:s:p:u:r') except getopt.error, msg: usage(2, msg) *************** *** 131,134 **** --- 149,153 ---- unknown = [] reverse = 0 + untrain = 0 do_filter = False usedb = None *************** *** 153,156 **** --- 172,177 ---- elif opt == '-u': unknown.append(arg) + elif opt == '-U': + untrain = 1 elif opt == '-r': reverse = 1 *************** *** 165,178 **** h = hammie.open(pck, usedb, mode) ! for g in good: ! print "Training ham (%s):" % g ! train(h, g, False) ! save = True ! for s in spam: ! print "Training spam (%s):" % s ! train(h, s, True) ! save = True if save: h.store() --- 186,210 ---- h = hammie.open(pck, usedb, mode) ! if not untrain: ! for g in good: ! print "Training ham (%s):" % g ! train(h, g, False) ! save = True ! for s in spam: ! print "Training spam (%s):" % s ! train(h, s, True) ! save = True ! else: ! 
for g in good: ! print "Untraining ham (%s):" % g ! untrain(h, g, False) ! save = True + for s in spam: + print "Untraining spam (%s):" % s + untrain(h, s, True) + save = True + if save: h.store() *************** *** 184,195 **** if unknown: ! (spams, hams) = (0, 0) for u in unknown: if len(unknown) > 1: print "Scoring", u ! s, g = score(h, u, reverse) spams += s hams += g ! print "Total %d spam, %d ham" % (spams, hams) if __name__ == "__main__": --- 216,228 ---- if unknown: ! spams = hams = unsures = 0 for u in unknown: if len(unknown) > 1: print "Scoring", u ! s, g, u = score(h, u, reverse) spams += s hams += g ! unsures += u ! print "Total %d spam, %d ham, %d unsure" % (spams, hams, unsures) if __name__ == "__main__": From anthonybaxter at users.sourceforge.net Tue Jan 14 19:37:05 2003 From: anthonybaxter at users.sourceforge.net (Anthony Baxter) Date: Tue Jan 14 22:37:09 2003 Subject: [Spambayes-checkins] website index.ht,1.4,1.5 background.ht,1.8,1.9 Message-ID: Update of /cvsroot/spambayes/website In directory sc8-pr-cvs1:/tmp/cvs-serv27893 Modified Files: index.ht background.ht Log Message: more documentation on "how it works" Index: index.ht =================================================================== RCS file: /cvsroot/spambayes/website/index.ht,v retrieving revision 1.4 retrieving revision 1.5 diff -C2 -d -r1.4 -r1.5 *** index.ht 13 Jan 2003 03:57:58 -0000 1.4 --- index.ht 15 Jan 2003 03:37:02 -0000 1.5 *************** *** 6,17 ****

    This project is developing a Bayesian anti-spam classifier, initially ! based on the work of Paul Graham. A major difference between this project and many others that are doing similar work is the emphasis on testing and trialing newer and different approaches to scoring messages. While most projects are still working with the initial Graham approach, we found that a number of different approaches yielded a much more useful response. An attempt at documenting this is on the background page.

    Code is currently available via CVS - ! note that it's not yet ! suitable for non-technical end-users, but for people interested ! in experimenting.

    --- 6,28 ----

    This project is developing a Bayesian anti-spam classifier, initially ! based on the work of Paul Graham. A major difference between this project and many others that are doing similar work is the emphasis on testing and trialling newer and different approaches to scoring messages. While most projects are still working with the initial Graham approach, we found that a number of different approaches yielded a much more useful response. An attempt at documenting this is on the background page.

    +

    That's great, but what is SpamBayes?

    +

    (the non-technical hand-waving answer)

    +

    Spambayes attempts to classify incoming email messages as 'spam', 'ham' (good, non-spam email) or 'unsure'. It does this by first being trained on sample spam and ham messages. Think of this process as showing spambayes "this pile is email that I like, and this pile is email that I don't like". These messages are analyzed for words, clues from the mailer headers, and other esoteric hints, so that the system can figure out what makes the piles different. The system then uses these clues to examine new messages.

    +

    For instance, the word "Nigeria" is a common one in spam - so you could simply try a spam filter where 'anything with the word "Nigeria" in it is spam'. This approach has a few pitfalls - what if your business involves writing a guidebook for Nigeria, for example? Over time, spammers will adapt, and will no longer use words like 'Nigeria' (or 'Lose weight fast', or any of the other hucksterish lines). Ideally, we want something that adapts as the spammers adapt.

    +

    So, this is what spambayes does. It looks at the spam, and looks at the ham, and calculates probabilities. For instance, for me, the word "weight" almost never occurs in legitimate email, but it occurs all the time in 'lose weight fast' spam. So for my setup, the word "weight" is a good indication of spamminess. Spambayes looks at incoming email messages, extracts the most significant clues (ones that occur mostly in spam, or mostly in ham), and combines the probabilities to produce an overall rating of 'spamminess'.

    + +

    How is SpamBayes different?

    +

    There are a number of similar projects to spambayes - most are just using the original Paul Graham algorithm. Examining the Graham technique with careful testing showed that it did a remarkably good job, but there was considerable room for improvement - in particular, when it got something wrong, it got it completely wrong. The spambayes team (primarily Tim Peters with a supporting cast providing testing, heckling, and different ideas) tinkered with new algorithms, tweaked existing algorithms, and, most importantly, did enormous test runs, slamming tens of thousands of messages against tens of thousands of messages, in an attempt to quantify whether or not a change to the system was beneficial.

    +

    The new algorithm is a combination of work from Gary Robinson and Tim Peters, and provides not just a 'spam' and 'ham' rating, but also an 'unsure' rating, for those messages where it just can't work out how to rate the message.

    +

    For more on this, see the background page.

    + +

    Code is currently available via CVS - ! note that it's not quite yet suitable for non-technical end-users, but ! for people interested in experimenting.

    Index: background.ht =================================================================== RCS file: /cvsroot/spambayes/website/background.ht,v retrieving revision 1.8 retrieving revision 1.9 diff -C2 -d -r1.8 -r1.9 *** background.ht 13 Jan 2003 08:08:58 -0000 1.8 --- background.ht 15 Jan 2003 03:37:02 -0000 1.9 *************** *** 18,32 ****

    Overall Approach

    ! Please note that I (Anthony) am writing this based on memory and ! limited understanding of some of the subtler points of the maths. Gentle ! corrections are welcome, or even encouraged.

    Tokenizing

    The architecture of the spambayes system has a couple of distinct parts. The first, and most obvious, is the tokenizer. This takes ! a mail message and breaks it up into a series of tokens. At the moment ! it splits words out of the text parts of a message, there's a variety ! of header tokenization that goes on as well. The code in tokenizer.py ! and the comments in the Tokenizer section of Options.py contain more ! information about various approaches to tokenizing.

    Combining and Scoring

    --- 18,36 ----

    Overall Approach

    !

    Please note that I (Anthony) am writing this based on memory and limited understanding of some of the subtler points of the maths. Gentle corrections are welcome, or even encouraged.

    !

    Tokenizing

    The architecture of the spambayes system has a couple of distinct parts. The first, and most obvious, is the tokenizer. This takes ! a mail message and breaks it up into a series of tokens (words). At the ! moment it splits words out of the text parts of a message, stripping out ! various HTML snippets and other bits of junk, such as images. In addition, ! there's a variety of mail header interpretation and tokenization that goes ! on as well. The code in tokenizer.py and the comments in the Tokenizer ! section of Options.py contain more information about various approaches ! to tokenizing, as well as various things that have been tried and found ! to make little or no difference.

    ! !

    Combining and Scoring

    *************** *** 34,54 ****

    The next part of the system is the scoring and combining part. This is where the hairy mathematics and statistics come in.

    Initially we started with Paul Graham's original combining scheme - a "Naive Bayes" scheme, of sorts - this has a number of "magic numbers" and "fuzz factors" built into it. -

    Gary's essay, linked above, has this to say on the 'Bayesianess' - of the original Graham scheme:
    - - Paul's approach has become fairly famous for filtering spam in a Bayesian way. That's only true if we make fairly large leaps of the imagination. Originally after reading his issay I thought that it was in no way Bayesian, but I have since noticed that if and only if a particular less-than-optimal assumption is made, part of it could be viewed as Bayesian through a very obscure argument. But it's a pretty remote case for Bayesianness. In any case, there's no need to dwell on Bayesianness or non-Bayesianness; we have bigger fish to fry. (Note: Tim Peters of spambayes fame has posted another way of looking at Paul's approach as Bayesian, although to do so he needs to make the unrealistic assumption that spams and non-spams are equally likely.) -

    -

    The Graham combining scheme has a number of problems, aside from the magic in the internal fudge factors - it tends to produce scores of ! either 1 or 0, and there's a very small middle ground in between - it doesn't often claim to be "unsure", and gets it wrong because of this. The following plot shows the problem:

    Note:In each of these plots, the X axis shows the 'score' of the message, scaled to 0-100. ! (where 0 is "definately ham", and 100 is "definately spam"), and the Y axis shows the number of messages with that score (scaled logarithmically). Note also that the plots aren't from the same data --- 38,54 ----

    The next part of the system is the scoring and combining part. This is where the hairy mathematics and statistics come in.

    +

    Initially we started with Paul Graham's original combining scheme - a "Naive Bayes" scheme, of sorts - this has a number of "magic numbers" and "fuzz factors" built into it.

    The Graham combining scheme has a number of problems, aside from the magic in the internal fudge factors - it tends to produce scores of ! either 1 (definite spam) or 0 (definite ham), and there's a very small ! middle ground in between - it doesn't often claim to be "unsure", and gets it wrong because of this. The following plot shows the problem:

    Note:In each of these plots, the X axis shows the 'score' of the message, scaled to 0-100. ! (where 0 is "definitely ham", and 100 is "definitely spam"), and the Y axis shows the number of messages with that score (scaled logarithmically). Note also that the plots aren't from the same data *************** *** 114,118 ****

    Training

    !

    TBD

    Mailing list archives

    --- 114,132 ----

    Training

    !

    TBD

    ! !

    Testing

    !

    TBD, describe test setup

    !

    One big difference between spambayes and many other open source projects is ! that there's a large amount of testing done. Before any change to the tokenizer ! or the algorithms was checked in, it was necessary to show that the change ! actually produced an improvement. Many fine ideas that seemed like ! they should make a positive difference to the results actually turned out to ! have no impact, or even made things worse. About the only "general rule" that ! kept showing up was Stupid beats smart. That is, a ! fiddly and delicate piece of tokenization magic would often produce worse ! results than something that just took a brute force "just grab everything" ! approach.

    !

    Mailing list archives

    From mhammond at users.sourceforge.net Tue Jan 14 19:52:11 2003 From: mhammond at users.sourceforge.net (Mark Hammond) Date: Tue Jan 14 22:52:15 2003 Subject: [Spambayes-checkins] spambayes/Outlook2000/dialogs FolderSelector.py,1.9,1.10 Message-ID: Update of /cvsroot/spambayes/spambayes/Outlook2000/dialogs In directory sc8-pr-cvs1:/tmp/cvs-serv28992 Modified Files: FolderSelector.py Log Message: Ignore folders that fail when querying their standard properties - as reported by Tony Meyer. Index: FolderSelector.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/dialogs/FolderSelector.py,v retrieving revision 1.9 retrieving revision 1.10 diff -C2 -d -r1.9 -r1.10 *** FolderSelector.py 7 Nov 2002 22:30:10 -0000 1.9 --- FolderSelector.py 15 Jan 2003 03:52:09 -0000 1.10 *************** *** 111,123 **** ## - An Outlook object model version def _BuildFolderTreeOutlook(session, parent): children = [] for i in range(parent.Folders.Count): folder = parent.Folders[i+1] ! spec = FolderSpec((folder.StoreID, folder.EntryID), ! folder.Name.encode("mbcs", "replace")) ! if folder.Folders: ! spec.children = _BuildFolderTreeOutlook(session, folder) ! children.append(spec) return children --- 111,129 ---- ## - An Outlook object model version + import pythoncom def _BuildFolderTreeOutlook(session, parent): children = [] for i in range(parent.Folders.Count): folder = parent.Folders[i+1] ! try: ! spec = FolderSpec((folder.StoreID, folder.EntryID), ! folder.Name.encode("mbcs", "replace")) ! except pythoncom.error: ! # Something strange with this folder - just ignore it ! spec = None ! if spec is not None: ! if folder.Folders: ! spec.children = _BuildFolderTreeOutlook(session, folder) ! 
children.append(spec) return children From montanaro at users.sourceforge.net Wed Jan 15 13:23:06 2003 From: montanaro at users.sourceforge.net (Skip Montanaro) Date: Wed Jan 15 16:23:09 2003 Subject: [Spambayes-checkins] spambayes hammiefilter.py,1.6,1.7 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv14180 Modified Files: hammiefilter.py Log Message: zap bogus StringIO import Index: hammiefilter.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/hammiefilter.py,v retrieving revision 1.6 retrieving revision 1.7 diff -C2 -d -r1.6 -r1.7 *** hammiefilter.py 14 Jan 2003 05:38:19 -0000 1.6 --- hammiefilter.py 15 Jan 2003 21:23:04 -0000 1.7 *************** *** 40,44 **** import sys import getopt ! from spambayes import hammie, Options, StringIO # See Options.py for explanations of these properties --- 40,44 ---- import sys import getopt ! from spambayes import hammie, Options # See Options.py for explanations of these properties From montanaro at users.sourceforge.net Wed Jan 15 14:39:45 2003 From: montanaro at users.sourceforge.net (Skip Montanaro) Date: Wed Jan 15 17:39:47 2003 Subject: [Spambayes-checkins] spambayes OptionConfig.py,1.2,1.3 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv27526 Modified Files: OptionConfig.py Log Message: delete unused cStringIO import Index: OptionConfig.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/OptionConfig.py,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** OptionConfig.py 14 Jan 2003 05:38:18 -0000 1.2 --- OptionConfig.py 15 Jan 2003 22:39:42 -0000 1.3 *************** *** 32,36 **** from spambayes.Options import options import re - from cStringIO import StringIO import os import ConfigParser --- 32,35 ---- From montanaro at users.sourceforge.net Wed Jan 15 14:41:17 2003 From: montanaro at 
users.sourceforge.net (Skip Montanaro) Date: Wed Jan 15 17:41:20 2003 Subject: [Spambayes-checkins] spambayes hammiefilter.py,1.7,1.8 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv27803 Modified Files: hammiefilter.py Log Message: minor tweak to the usage message. add --help as a valid option for people used to GNU long options Index: hammiefilter.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/hammiefilter.py,v retrieving revision 1.7 retrieving revision 1.8 diff -C2 -d -r1.7 -r1.8 *** hammiefilter.py 15 Jan 2003 21:23:04 -0000 1.7 --- hammiefilter.py 15 Jan 2003 22:41:15 -0000 1.8 *************** *** 17,20 **** --- 17,29 ---- """Usage: %(program)s [OPTION] + A hammie front-end to make the simple stuff simple. The intent is to call + this from procmail and its ilk like so: + + :0 fw + | hammiefilter.py + + Then, you can set up your MUA to pipe ham and spam to it, one at a time, by + calling it with either the -g or -s options, respectively. + Where [OPTION] is one of: -h *************** *** 104,110 **** h = HammieFilter() action = h.filter ! opts, args = getopt.getopt(sys.argv[1:], 'hngsGS') for opt, arg in opts: ! if opt == '-h': usage(0) elif opt == '-g': --- 113,119 ---- h = HammieFilter() action = h.filter ! opts, args = getopt.getopt(sys.argv[1:], 'hngsGS', ['help']) for opt, arg in opts: ! 
if opt in ('-h', '--help'): usage(0) elif opt == '-g': From montanaro at users.sourceforge.net Wed Jan 15 14:45:13 2003 From: montanaro at users.sourceforge.net (Skip Montanaro) Date: Wed Jan 15 17:45:17 2003 Subject: [Spambayes-checkins] spambayes/pspam pop.py,1.3,1.4 Message-ID: Update of /cvsroot/spambayes/spambayes/pspam In directory sc8-pr-cvs1:/tmp/cvs-serv28377/pspam Modified Files: pop.py Log Message: Unify StringIO imports to always try cStringIO and fallback to StringIO Index: pop.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/pspam/pop.py,v retrieving revision 1.3 retrieving revision 1.4 diff -C2 -d -r1.3 -r1.4 *** pop.py 7 Nov 2002 22:30:10 -0000 1.3 --- pop.py 15 Jan 2003 22:45:11 -0000 1.4 *************** *** 29,33 **** import SocketServer import asyncore ! import cStringIO import email import re --- 29,37 ---- import SocketServer import asyncore ! try: ! import cStringIO as StringIO ! except ImportError: ! import StringIO ! import email import re *************** *** 190,194 **** if multiline: # Collect the entire response as one string ! resp = cStringIO.StringIO() while 1: line = self.pop_rfile.readline() --- 194,198 ---- if multiline: # Collect the entire response as one string ! 
resp = StringIO.StringIO() while 1: line = self.pop_rfile.readline() From montanaro at users.sourceforge.net Wed Jan 15 14:45:13 2003 From: montanaro at users.sourceforge.net (Skip Montanaro) Date: Wed Jan 15 17:45:20 2003 Subject: [Spambayes-checkins] spambayes SmarterHTTPServer.py,1.1,1.2 pop3proxy.py,1.33,1.34 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv28377 Modified Files: SmarterHTTPServer.py pop3proxy.py Log Message: Unify StringIO imports to always try cStringIO and fallback to StringIO Index: SmarterHTTPServer.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/SmarterHTTPServer.py,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** SmarterHTTPServer.py 1 Dec 2002 04:11:39 -0000 1.1 --- SmarterHTTPServer.py 15 Jan 2003 22:45:11 -0000 1.2 *************** *** 24,28 **** import mimetypes import re ! from StringIO import StringIO --- 24,31 ---- import mimetypes import re ! try: ! import cStringIO as StringIO ! except ImportError: ! import StringIO *************** *** 96,100 **** self.send_response(200) retstr = getattr(self, methname)(pdict) ! f = StringIO(retstr) self.send_header("Content-type", 'text/html') self.end_headers() --- 99,103 ---- self.send_response(200) retstr = getattr(self, methname)(pdict) ! f = StringIO.StringIO(retstr) self.send_header("Content-type", 'text/html') self.end_headers() Index: pop3proxy.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/pop3proxy.py,v retrieving revision 1.33 retrieving revision 1.34 diff -C2 -d -r1.33 -r1.34 *** pop3proxy.py 14 Jan 2003 05:38:19 -0000 1.33 --- pop3proxy.py 15 Jan 2003 22:45:11 -0000 1.34 *************** *** 139,143 **** """ ! 
import os, sys, re, operator, errno, getopt, string, cStringIO, time, bisect import socket, asyncore, asynchat, cgi, urlparse, webbrowser import mailbox, email.Header --- 139,148 ---- """ ! try: ! import cStringIO as StringIO ! except ImportError: ! import StringIO ! ! import os, sys, re, operator, errno, getopt, string, time, bisect import socket, asyncore, asynchat, cgi, urlparse, webbrowser import mailbox, email.Header *************** *** 794,798 **** if contentType == 'multipart/form-data': # multipart/form-data - probably a file upload. ! bodyFile = cStringIO.StringIO(body) params.update(cgi.parse_multipart(bodyFile, pdict)) else: --- 799,803 ---- if contentType == 'multipart/form-data': # multipart/form-data - probably a file upload. ! bodyFile = StringIO.StringIO(body) params.update(cgi.parse_multipart(bodyFile, pdict)) else: *************** *** 934,938 **** def __init__(self, fp): self.guts = fp.read() ! contentFile = cStringIO.StringIO(content) mbox = mailbox.PortableUnixMailbox(contentFile, SimpleMessage) messages = map(lambda m: m.guts, mbox) --- 939,943 ---- def __init__(self, fp): self.guts = fp.read() ! 
contentFile = StringIO.StringIO(content) mbox = mailbox.PortableUnixMailbox(contentFile, SimpleMessage) messages = map(lambda m: m.guts, mbox) From montanaro at users.sourceforge.net Wed Jan 15 14:47:14 2003 From: montanaro at users.sourceforge.net (Skip Montanaro) Date: Wed Jan 15 17:47:16 2003 Subject: [Spambayes-checkins] spambayes/spambayes Options.py,1.3,1.4 Message-ID: Update of /cvsroot/spambayes/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv28734/spambayes Modified Files: Options.py Log Message: Unify import of StringIO - always try cStringIO and fall back to StringIO Index: Options.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/spambayes/Options.py,v retrieving revision 1.3 retrieving revision 1.4 diff -C2 -d -r1.3 -r1.4 *** Options.py 14 Jan 2003 05:48:12 -0000 1.3 --- Options.py 15 Jan 2003 22:47:11 -0000 1.4 *************** *** 5,9 **** import sys, os ! import StringIO import ConfigParser try: --- 5,12 ---- import sys, os ! try: ! import cStringIO as StringIO ! except ImportError: ! import StringIO import ConfigParser try: From anthonybaxter at users.sourceforge.net Wed Jan 15 17:54:51 2003 From: anthonybaxter at users.sourceforge.net (Anthony Baxter) Date: Wed Jan 15 20:54:56 2003 Subject: [Spambayes-checkins] website docs.ht,1.5,1.6 index.ht,1.5,1.6 Message-ID: Update of /cvsroot/spambayes/website In directory sc8-pr-cvs1:/tmp/cvs-serv16164 Modified Files: docs.ht index.ht Log Message: Cleaned up my (ab)use of the english language in the index.html page. Index: docs.ht =================================================================== RCS file: /cvsroot/spambayes/website/docs.ht,v retrieving revision 1.5 retrieving revision 1.6 diff -C2 -d -r1.5 -r1.6 *** docs.ht 30 Dec 2002 07:37:15 -0000 1.5 --- docs.ht 16 Jan 2003 01:54:49 -0000 1.6 *************** *** 20,24 ****

    !

    Glossary

    A useful(?) glossary of terminology

    --- 20,24 ----

    !

    Glossary

    A useful(?) glossary of terminology

    *************** *** 40,43 **** --- 40,46 ----
    hapax, hapax legomenon
    a word or form occurring only once in a document or corpus. (plural is hapax legomena) +
    training
    The process of feeding spambayes some sample spam and ham messages, to teach it what to look for. +
    bayesian
    A form of statistical analysis used (in a form) in Paul Graham's + initial "Plan for Spam" approach. Now used as a kind of catch-all term for this class of filters, no doubt horrifying statisticians everywhere.
    Index: index.ht =================================================================== RCS file: /cvsroot/spambayes/website/index.ht,v retrieving revision 1.5 retrieving revision 1.6 diff -C2 -d -r1.5 -r1.6 *** index.ht 15 Jan 2003 03:37:02 -0000 1.5 --- index.ht 16 Jan 2003 01:54:49 -0000 1.6 *************** *** 1,29 **** Title: SpamBayes: Bayesian anti-spam classifier written in Python. ! Author-Email: spambayes@python.org ! Author: spambayes

    What is SpamBayes?

    ! This project is developing a Bayesian anti-spam classifier, initially ! based on the work of Paul Graham. A major difference between this project and many others that are doing similar work is the emphasis on testing and trialling newer and different approaches to scoring messages. While most projects are still working with the initial Graham approach, we found that a number of different approaches yielded a much more useful response. An attempt at documenting this is on the background page.

    !

    That's great, but what is SpamBayes?

    (the non-technical hand-waving answer)

    !

    Spambayes attempts to classify incoming email messages as 'spam', 'ham' (good, non-spam email) or 'unsure'. It does this by first being trained on sample spam and ham messages. Think of this process as showing spambayes "this pile is email that I like, and this pile is email that I don't like". These messages are analyzed for words, clues from the mailer headers, and other esoteric hints, so that the system can figure out what makes the piles different. The system then uses these clues to examine new messages.

    !

    For instance, the word "Nigeria" is a common one in spam - so you could simply try a spam filter where 'anything with the word "Nigeria" in it is spam'. This approach has a few pitfalls - what if your business involves writing a guidebook for Nigeria, for example? Over time, spammers will adapt, and will no longer use words like 'Nigeria' (or 'Lose weight fast', or any of the other hucksterish lines). Ideally, we want something that adapts as the spammers adapt.

    !

    So, this is what spambayes does. It looks at the spam, and looks at the ham, and calculates probabilities. For instance, for me, the word "weight" almost never occurs in legitimate email, but it occurs all the time in 'lose weight fast spam'. So for my setup, the word "weight" is a good indication of spamminess. Spambayes looks at incoming email messages, extracts the most significant clues (ones that occur mostly in spam, or mostly in ham), and combines the probabilities to produce an overall rating of 'spamminess'.

    How is SpamBayes different?

    !

    There are a number of similar projects to spambayes - most are just using the original Paul Graham algorithm. Examining the Graham technique with careful testing showed that it did a remarkably good job, but there was considerable room for improvement - in particular, when it got something wrong, it got it completely wrong. The spambayes team (primarily Tim Peters with a supporting cast providing testing, heckling, and different ideas) tinkered with new algorithms, tweaking existing algorithms, and, most importantly, did enormous test runs, slamming tens of thousands of messages against tens of thousands of messages, in an attempt to quantify whether or not a change to the system was beneficial.

    !

    The new algorithm is a combination of work from Gary Robinson and Tim Peters, and provides not just a 'spam' and 'ham' rating, but also an 'unsure' rating, for those messages where it just can't work out how to rate the message.

    !

    For more on this, see the background page.

    !

    Code is currently available via CVS - note that it's not quite yet suitable for non-technical end-users, but for people interested in experimenting.

    There are now a couple of end-user applications available for those --- 1,96 ---- Title: SpamBayes: Bayesian anti-spam classifier written in Python. ! Author-Email: SpamBayes@python.org ! Author: SpamBayes

    What is SpamBayes?

    ! The SpamBayes ! project is working on developing a Bayesian ! anti-spam filter, initially based on the work of ! Paul Graham. ! ! The major difference between this and other, similar projects ! is the emphasis on testing newer approaches to scoring messages. ! ! While most anti-spam projects are still working with the ! original Graham algorithm, we found that a number of alternate methods ! yielded a more useful response. ! ! This is documented on the background page.

    !

    That's great, but what's SpamBayes?

    (the non-technical hand-waving answer)

    ! !

    SpamBayes will attempt to classify incoming email messages as 'spam', ! 'ham' (good, non-spam email) or 'unsure'. This means you can have spam ! or unsure messages automatically filed away in a different mail folder, ! where they won't interrupt your email reading. First SpamBayes must be ! trained by each user to identify spam ! and ham. ! ! Essentially, you show SpamBayes a pile of email that you like (ham) ! and a pile you don't like (spam). ! ! SpamBayes will then analyze the piles for clues as to what makes the ! spam and ham different. For example: different words, differences in the ! mailer headers and content style. The system then uses these clues to ! examine new messages.

    ! !

    For instance, the word "Nigeria" appears often in spam, so ! you could use a spam filter which identifies anything with that word in ! it as spam. But what if your business involves writing a guidebook on ! Nigerian Wildlife Conservation? ! Clearly a more flexible approach is necessary. ! ! Additionally, spammers ! will adapt their content over time and will no longer use the word ! "Nigeria" (or the words "Lose Weight Fast", or any number of other ! common lines). Ideally the software will be able to adapt as the spam changes. !

    ! !

    So, that ! is what SpamBayes does. It compares the spam and the ham ! and calculates probabilities. For instance, for me, the word "weight" ! almost never occurs in legitimate email, but it occurs all the time in ! 'lose weight fast' spam. ! ! SpamBayes can then look at incoming email, extract the most significant ! clues and combine the probabilities to produce an overall rating of ! "spamminess". It flags the messages so that your mailer can handle ! the different message types. You might set it up so that ham goes ! straight through untouched, spam goes to a folder that you ignore (or ! delete without checking) and the unsure messages go to another folder ! which you can review for errors.

    How is SpamBayes different?

    !

    There are a number of similar projects ! to SpamBayes - most are just using the original Paul Graham algorithm. ! Examining the Graham technique with careful testing showed that it did a ! remarkably good job, but there was considerable room for improvement. ! (See the background page for more.)

    !

    The SpamBayes team tinkered with new ! algorithms, tweaking existing algorithms, and, most importantly, did ! enormous test runs, slamming tens of thousands of messages against ! each other, in an attempt to quantify whether or ! not a change to the system was beneficial.

    ! !

    The new algorithm is ! a combination of work from Gary Robinson and Tim Peters, and provides ! not just a 'spam' and 'ham' rating, but also an 'unsure' rating, for ! those messages where it can't work out how to rate the message.

    ! !

    See the background page for more, well, ! background.

    ! ! !

    The code is currently available via CVS - note that it's not quite yet suitable for non-technical end-users, but for people interested in experimenting.

    +

    There are now a couple of end-user applications available for those *************** *** 31,36 **** Applications page.

    !

    We're currently working on packaging these up for end-user use - stay ! tuned!

    Mailing list

    --- 98,105 ---- Applications page.

    ! !

    At the moment, these packages require CVS access to install, ! but we're currently working on packaging these up for easy end-user ! use - stay tuned!

    Mailing list

    *************** *** 44,45 **** --- 113,122 ---- +

    Credits

    +

    Most of the heavy lifting on this project was done by Tim Peters, with + the cast of spambayes obsessive-compulsives providing ideas, heckling, and + testing. Gary Robinson and Rob Hooft contributed valuable help on the maths + behind it all. Mark Hammond amazed the world with the Outlook2000 plug-in, + and Rich Hindle, Neale Pickett, Tim Stone worked on the end-user applications.

    +

    Thanks also to Rachel Holkner for turning Anthony's gibberish into something + closer to actual English, although all mistakes are Anthony's.

    From montanaro at users.sourceforge.net Thu Jan 16 08:22:52 2003 From: montanaro at users.sourceforge.net (Skip Montanaro) Date: Thu Jan 16 11:22:56 2003 Subject: [Spambayes-checkins] spambayes pop3proxy.py,1.34,1.35 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv32022 Modified Files: pop3proxy.py Log Message: typo Index: pop3proxy.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/pop3proxy.py,v retrieving revision 1.34 retrieving revision 1.35 diff -C2 -d -r1.34 -r1.35 *** pop3proxy.py 15 Jan 2003 22:45:11 -0000 1.34 --- pop3proxy.py 16 Jan 2003 16:22:49 -0000 1.35 *************** *** 686,690 **** reviewHeader = """

    These are untrained emails, which you can use to ! train the classifier. Check the appropriate buttton for each email, then click 'Train' below. 'Defer' leaves the message here, to be trained on later. Click one of the --- 686,690 ---- reviewHeader = """

    These are untrained emails, which you can use to ! train the classifier. Check the appropriate button for each email, then click 'Train' below. 'Defer' leaves the message here, to be trained on later. Click one of the From montanaro at users.sourceforge.net Thu Jan 16 09:40:15 2003 From: montanaro at users.sourceforge.net (Skip Montanaro) Date: Thu Jan 16 12:40:18 2003 Subject: [Spambayes-checkins] spambayes proxytrainer.py,NONE,1.1 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv9830 Added Files: proxytrainer.py Log Message: new script - proxytrainer is the training/web part of pop3proxy. This allows people not using pop3proxy or Outlook to have a gooey training interface. --- NEW FILE: proxytrainer.py --- #!/usr/bin/env python """ A web interface to training messages. This is essentially the training part of pop3proxy.py. Typical usage is to run proxytrainer.py in the background, then feed it individual messages or mailboxes using the proxytee.py script (or something similar). Normally, proxytee.py would be inserted somewhere in your mail processing pipeline. """ # This module is part of the spambayes project, which is Copyright 2002 # The Python Software Foundation and is covered by the Python Software # Foundation license. __author__ = "Skip Montanaro mostly theft from pop3proxy" __credits__ = "Richie Hindle, Tim Peters, all the Spambayes folk." try: [...981 lines suppressed...] print >>sys.stderr, __doc__ sys.exit() elif opt == '-b': state.launchUI = True elif opt == '-d': state.useDB = True elif opt == '-p': state.databaseFilename = arg elif opt == '-l': state.proxyPorts = [int(arg)] elif opt == '-u': state.uiPort = int(arg) # Do whatever we've been asked to do... 
state.createWorkers() main(state.uiPort, state.launchUI) if __name__ == '__main__': sys.setrecursionlimit(100) run() From montanaro at users.sourceforge.net Thu Jan 16 09:40:52 2003 From: montanaro at users.sourceforge.net (Skip Montanaro) Date: Thu Jan 16 13:05:04 2003 Subject: [Spambayes-checkins] spambayes setup.py,1.11,1.12 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv9912 Modified Files: setup.py Log Message: + proxytrainer.py and proxytee.py Index: setup.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/setup.py,v retrieving revision 1.11 retrieving revision 1.12 diff -C2 -d -r1.11 -r1.12 *** setup.py 14 Jan 2003 05:38:19 -0000 1.11 --- setup.py 16 Jan 2003 17:40:50 -0000 1.12 *************** *** 39,42 **** --- 39,44 ---- 'pop3graph.py', 'pop3proxy.py', + 'proxytrainer.py', + 'proxytee.py', ], packages = [ 'spambayes', ], From montanaro at users.sourceforge.net Thu Jan 16 09:38:30 2003 From: montanaro at users.sourceforge.net (Skip Montanaro) Date: Thu Jan 16 13:09:00 2003 Subject: [Spambayes-checkins] spambayes proxytee.py,NONE,1.1 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv9638 Added Files: proxytee.py Log Message: new script - proxytee.py pumps stdin to stdout and uploads it to proxytrainer. --- NEW FILE: proxytee.py --- #!/usr/bin/env python """ Read a message or a mailbox file on standard input, upload it to a web browser and write it to standard output. 
usage: %(progname)s [-h] [-n] [-s server] [-p port] Options: -h, --help - print help and exit -n, --null - suppress writing to standard output (default %(null)s) -s, --server= - provide alternate web server (default %(server)s) -p, --port= - provide alternate server port (default %(port)s) """ import sys import httplib import mimetypes import getopt from spambayes.Options import options progname = sys.argv[0] __author__ = "Skip Montanaro " __credits__ = "Spambayes gang, Wade Leftwich" try: True, False except NameError: # Maintain compatibility with Python 2.2 True, False = 1, 0 # appropriated verbatim from a recipe by Wade Leftwich in the Python # Cookbook: http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/146306 def post_multipart(host, selector, fields, files): """ Post fields and files to an http host as multipart/form-data. fields is a sequence of (name, value) elements for regular form fields. files is a sequence of (name, filename, value) elements for data to be uploaded as files. Return the server's response page. """ content_type, body = encode_multipart_formdata(fields, files) h = httplib.HTTP(host) h.putrequest('POST', selector) h.putheader('content-type', content_type) h.putheader('content-length', str(len(body))) h.endheaders() h.send(body) errcode, errmsg, headers = h.getreply() return h.file.read() def encode_multipart_formdata(fields, files): """ fields is a sequence of (name, value) elements for regular form fields. files is a sequence of (name, filename, value) elements for data to be uploaded as files. 
Return (content_type, body) ready for httplib.HTTP instance """ BOUNDARY = '----------ThIs_Is_tHe_bouNdaRY_$' CRLF = '\r\n' L = [] for (key, value) in fields: L.append('--' + BOUNDARY) L.append('Content-Disposition: form-data; name="%s"' % key) L.append('') L.append(value) for (key, filename, value) in files: L.append('--' + BOUNDARY) L.append('Content-Disposition: form-data; name="%s"; filename="%s"' % (key, filename)) L.append('Content-Type: %s' % get_content_type(filename)) L.append('') L.append(value) L.append('--' + BOUNDARY + '--') L.append('') body = CRLF.join(L) content_type = 'multipart/form-data; boundary=%s' % BOUNDARY return content_type, body def get_content_type(filename): return mimetypes.guess_type(filename)[0] or 'application/octet-stream' def usage(*args): defaults = {} for d in args: defaults.update(d) print __doc__ % defaults def main(argv): null = False server = "localhost" port = options.html_ui_port try: opts, args = getopt.getopt(argv, "hns:p:", ["help", "null", "server=", "port="]) except getopt.Error: usage(globals(), locals()) sys.exit(1) for opt, arg in opts: if opt in ("-h", "--help"): usage(globals(), locals()) sys.exit(0) elif opt in ("-n", "--null"): null = True elif opt in ("-s", "--server"): server = arg elif opt in ("-p", "--port"): port = int(arg) if args: usage(globals(), locals()) sys.exit(1) data = sys.stdin.read() post_multipart("%s:%d"%(server,port), "/upload", [], [('file', 'message.dat', data)]) sys.stdout.write(data) if __name__ == "__main__": main(sys.argv[1:]) From anthonybaxter at users.sourceforge.net Thu Jan 16 19:36:28 2003 From: anthonybaxter at users.sourceforge.net (Anthony Baxter) Date: Thu Jan 16 22:36:32 2003 Subject: [Spambayes-checkins] website index.ht,1.6,1.7 Message-ID: Update of /cvsroot/spambayes/website In directory sc8-pr-cvs1:/tmp/cvs-serv20400 Modified Files: index.ht Log Message: note rob's feedback. 
Index: index.ht =================================================================== RCS file: /cvsroot/spambayes/website/index.ht,v retrieving revision 1.6 retrieving revision 1.7 diff -C2 -d -r1.6 -r1.7 *** index.ht 16 Jan 2003 01:54:49 -0000 1.6 --- index.ht 17 Jan 2003 03:36:26 -0000 1.7 *************** *** 116,122 ****

    Most of the heavy lifting on this project was done by Tim Peters, with the cast of spambayes obsessive-compulsives providing ideas, heckling, and ! testing. Gary Robinson and Rob Hooft contributed valuable help on the maths ! behind it all. Mark Hammond amazed the world with the Outlook2000 plug-in, and Rich Hindle, Neale Pickett, Tim Stone worked on the end-user applications.

    !

    Thanks also to Rachel Holkner for turning Anthony's gibberish into something ! closer to actual English, although all mistakes are Anthony's.

    --- 116,125 ----

    Most of the heavy lifting on this project was done by Tim Peters, with the cast of spambayes obsessive-compulsives providing ideas, heckling, and ! testing. Gary Robinson provided a lot of the serious maths and theory, as ! well as his essay on "how to do it better" (see the ! background page for a link). Rob Hooft ! also contributed maths/stats clues. ! Mark Hammond amazed the world with the Outlook2000 plug-in, and Rich Hindle, Neale Pickett, Tim Stone worked on the end-user applications.

    !

    (Thanks also to Rachel Holkner for turning Anthony's gibberish into something ! closer to actual English, although all mistakes are Anthony's.)

    From tim_one at users.sourceforge.net Thu Jan 16 20:14:40 2003 From: tim_one at users.sourceforge.net (Tim Peters) Date: Thu Jan 16 23:14:43 2003 Subject: [Spambayes-checkins] website background.ht,1.9,1.10 docs.ht,1.6,1.7 Message-ID: Update of /cvsroot/spambayes/website In directory sc8-pr-cvs1:/tmp/cvs-serv23499/website Modified Files: background.ht docs.ht Log Message: Added some words. Index: background.ht =================================================================== RCS file: /cvsroot/spambayes/website/background.ht,v retrieving revision 1.9 retrieving revision 1.10 diff -C2 -d -r1.9 -r1.10 *** background.ht 15 Jan 2003 03:37:02 -0000 1.9 --- background.ht 17 Jan 2003 04:14:38 -0000 1.10 *************** *** 32,35 **** --- 32,47 ---- to make little or no difference.

    +

    Because the original tests of the system mixed a ham corpus from + a high-volume mailing list with a spam corpus from a different source, + email header lines were ignored completely at first (they contained too + many consistent clues about which source a message came from). As a + result, this project tried much harder than most to find ways to extract + useful information from message bodies. For example, special + tokenizing of embedded URLs was one of the first things tried, and + instantly cut the false negative rate in half. In the end, testing + showed that very good classifiers can be gotten by looking only at + message bodies, or by looking only at message headers. Looking at both + does best, of course.

    + *************** *** 112,115 **** --- 124,139 ---- unsure messages vs possible false positives or negatives. In the chi-squared results, the "unsure" window can be quite large, and still result in very small numbers of "unsure" messages.

    + +

    A remarkable property of chi-combining is that people have generally + been sympathetic to its "Unsure" ratings: people usually agree that + messages classed Unsure really are hard to categorize. For example, + commercial HTML email from a company you do business with is quite likely + to score as Unsure the first time the system sees such a message from + a particular company. Spam and commercial email both use the language + and devices of advertising heavily, so it's hard to tell them apart. + Training quickly teaches the system all sorts of things about the + commercial email you want, though, ranging from which company sent it + and how they addressed you, to the kinds of products and services it's + offering.

    Training

    Index: docs.ht =================================================================== RCS file: /cvsroot/spambayes/website/docs.ht,v retrieving revision 1.6 retrieving revision 1.7 diff -C2 -d -r1.6 -r1.7 *** docs.ht 16 Jan 2003 01:54:49 -0000 1.6 --- docs.ht 17 Jan 2003 04:14:38 -0000 1.7 *************** *** 41,45 **** document or corpus. (plural is hapax legomena)
    training
    The process of feeding spambayes some sample spam and ham messages, to teach it what to look for. !
    bayesian
    A form of statistical analysis used (in a form) in Paul Graham's initial "Plan for Spam" approach. Now used as a kind of catch-all term for this class of filters, no doubt horrorifying statisticians everywhere.
    --- 41,45 ---- document or corpus. (plural is hapax legomena)
    training
    The process of feeding spambayes some sample spam and ham messages, to teach it what to look for. !
    Bayesian
    A form of statistical analysis used (in a form) in Paul Graham's initial "Plan for Spam" approach. Now used as a kind of catch-all term for this class of filters, no doubt horrorifying statisticians everywhere. From anthonybaxter at users.sourceforge.net Thu Jan 16 22:19:24 2003 From: anthonybaxter at users.sourceforge.net (Anthony Baxter) Date: Fri Jan 17 01:19:28 2003 Subject: [Spambayes-checkins] website/download - New directory Message-ID: Update of /cvsroot/spambayes/website/download In directory sc8-pr-cvs1:/tmp/cvs-serv31045/download Log Message: Directory /cvsroot/spambayes/website/download added to the repository From anthonybaxter at users.sourceforge.net Thu Jan 16 22:26:52 2003 From: anthonybaxter at users.sourceforge.net (Anthony Baxter) Date: Fri Jan 17 01:26:55 2003 Subject: [Spambayes-checkins] website docs.ht,1.7,1.8 Message-ID: Update of /cvsroot/spambayes/website In directory sc8-pr-cvs1:/tmp/cvs-serv31553 Modified Files: docs.ht Log Message: stupid typo. Index: docs.ht =================================================================== RCS file: /cvsroot/spambayes/website/docs.ht,v retrieving revision 1.7 retrieving revision 1.8 diff -C2 -d -r1.7 -r1.8 *** docs.ht 17 Jan 2003 04:14:38 -0000 1.7 --- docs.ht 17 Jan 2003 06:26:50 -0000 1.8 *************** *** 42,46 ****
    training
    The process of feeding spambayes some sample spam and ham messages, to teach it what to look for.
    Bayesian
    A form of statistical analysis used (in a form) in Paul Graham's ! initial "Plan for Spam" approach. Now used as a kind of catch-all term for this class of filters, no doubt horrorifying statisticians everywhere. --- 42,47 ----
    training
    The process of feeding spambayes some sample spam and ham messages, to teach it what to look for.
    Bayesian
    A form of statistical analysis used (in a form) in Paul Graham's ! initial "Plan for Spam" approach. Now used as a kind of catch-all term for ! this class of filters, no doubt horrifying statisticians everywhere. From anthonybaxter at users.sourceforge.net Thu Jan 16 22:27:36 2003 From: anthonybaxter at users.sourceforge.net (Anthony Baxter) Date: Fri Jan 17 01:27:38 2003 Subject: [Spambayes-checkins] website index.ht,1.7,1.8 Message-ID: Update of /cvsroot/spambayes/website In directory sc8-pr-cvs1:/tmp/cvs-serv31590 Modified Files: index.ht Log Message: snapshot. Index: index.ht =================================================================== RCS file: /cvsroot/spambayes/website/index.ht,v retrieving revision 1.7 retrieving revision 1.8 diff -C2 -d -r1.7 -r1.8 *** index.ht 17 Jan 2003 03:36:26 -0000 1.7 --- index.ht 17 Jan 2003 06:27:34 -0000 1.8 *************** *** 90,94 **** via CVS - note that it's not quite yet suitable for non-technical end-users, but ! for people interested in experimenting.

    --- 90,94 ---- via CVS - note that it's not quite yet suitable for non-technical end-users, but ! for people interested in experimenting. There's also a nightly snapshot available for download.

    From anthonybaxter at users.sourceforge.net Thu Jan 16 22:42:56 2003 From: anthonybaxter at users.sourceforge.net (Anthony Baxter) Date: Fri Jan 17 01:42:59 2003 Subject: [Spambayes-checkins] spambayes/utilities HistToGNU.py,1.1,1.2 loosecksum.py,1.1,1.2 mboxcount.py,1.1,1.2 rebal.py,1.1,1.2 split.py,1.1,1.2 splitn.py,1.1,1.2 splitndirs.py,1.1,1.2 Message-ID: Update of /cvsroot/spambayes/spambayes/utilities In directory sc8-pr-cvs1:/tmp/cvs-serv32698/utilities Added Files: HistToGNU.py loosecksum.py mboxcount.py rebal.py split.py splitn.py splitndirs.py Log Message: CVS ate my brain, and all I got was this lousy T-Shirt. The merge didn't like the subdirectories that were created on the branch. Fixed now. From anthonybaxter at users.sourceforge.net Thu Jan 16 22:42:56 2003 From: anthonybaxter at users.sourceforge.net (Anthony Baxter) Date: Fri Jan 17 01:43:01 2003 Subject: [Spambayes-checkins] spambayes/testtools cmp.py,1.1,1.2 fpfn.py,1.1,1.2 mboxtest.py,1.1,1.2 rates.py,1.1,1.2 simplexloop.py,1.1,1.2 timtest.py,1.1,1.2weaktest.py,1.1,1.2 Message-ID: Update of /cvsroot/spambayes/spambayes/testtools In directory sc8-pr-cvs1:/tmp/cvs-serv32698/testtools Added Files: cmp.py fpfn.py mboxtest.py rates.py simplexloop.py table.py timcv.py timtest.py weaktest.py Log Message: CVS ate my brain, and all I got was this lousy T-Shirt. The merge didn't like the subdirectories that were created on the branch. Fixed now. 
From anthonybaxter at users.sourceforge.net Thu Jan 16 22:45:39 2003 From: anthonybaxter at users.sourceforge.net (Anthony Baxter) Date: Fri Jan 17 01:45:42 2003 Subject: [Spambayes-checkins] spambayes setup.py,1.12,1.13 pop3graph.py,1.3,NONE Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv492 Modified Files: setup.py Removed Files: pop3graph.py Log Message: off to the utilities directory for pop3graph.py Index: setup.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/setup.py,v retrieving revision 1.12 retrieving revision 1.13 diff -C2 -d -r1.12 -r1.13 *** setup.py 16 Jan 2003 17:40:50 -0000 1.12 --- setup.py 17 Jan 2003 06:45:36 -0000 1.13 *************** *** 37,41 **** 'hammiesrv.py', 'hammiefilter.py', - 'pop3graph.py', 'pop3proxy.py', 'proxytrainer.py', --- 37,40 ---- --- pop3graph.py DELETED --- From anthonybaxter at users.sourceforge.net Thu Jan 16 22:45:39 2003 From: anthonybaxter at users.sourceforge.net (Anthony Baxter) Date: Fri Jan 17 01:45:44 2003 Subject: [Spambayes-checkins] spambayes/utilities pop3graph.py,NONE,1.1 Message-ID: Update of /cvsroot/spambayes/spambayes/utilities In directory sc8-pr-cvs1:/tmp/cvs-serv492/utilities Added Files: pop3graph.py Log Message: off to the utilities directory for pop3graph.py --- NEW FILE: pop3graph.py --- #!/usr/bin/env python """Analyse the pop3proxy's caches and produce a graph of how accurate classifier has been over time. Only really meaningful if you started with an empty database.""" from __future__ import division import sys import getopt from spambayes import mboxutils from spambayes.FileCorpus import FileCorpus, FileMessageFactory, GzipFileMessageFactory from spambayes.Options import options def usage(): print __doc__ def main(argv): opts, args = getopt.getopt(argv, "h", ["help"]) for opt, arg in opts: if opt in ("-h", "--help"): usage() return # Create the corpuses and the factory that reads the messages. 
if options.pop3proxy_cache_use_gzip: messageFactory = GzipFileMessageFactory() else: messageFactory = FileMessageFactory() spamCorpus = FileCorpus(messageFactory, options.pop3proxy_spam_cache) hamCorpus = FileCorpus(messageFactory, options.pop3proxy_ham_cache) # Read in all the trained messages. allTrained = {} for corpus, disposition in [(spamCorpus, 'Yes'), (hamCorpus, 'No')]: for m in corpus: message = mboxutils.get_message(m.getSubstance()) message._pop3CacheDisposition = disposition allTrained[m.key()] = message # Sort the messages into the order they arrived, then work out a scaling # factor for the graph - 'limit' is the widest it can be in characters. keys = allTrained.keys() keys.sort() limit = 70 if len(keys) < limit: scale = 1 else: scale = len(keys) // (limit//2) # Build the data - an array of cumulative success indexed by count. count = successful = 0 successByCount = [] for key in keys: message = allTrained[key] disposition = message[options.hammie_header_name] if (message._pop3CacheDisposition == disposition): successful += 1 count += 1 if count % scale == (scale-1): successByCount.append(successful // scale) # Build the graph, as a list of rows of characters. size = count // scale graph = [[" " for i in range(size+3)] for j in range(size)] for c in range(size): graph[c][1] = "|" graph[c][c+3] = "." graph[successByCount[c]][c+3] = "*" graph.reverse() # Print the graph. print "\n Success of the classifier over time:\n" print " . 
- Number of messages over time" print " * - Number of correctly classified messages over time\n\n" for row in range(size): line = ''.join(graph[row]) if row == 0: print line + " %d" % count elif row == (count - successful) // scale: print line + " %d" % successful else: print line print " " + "_" * (size+2) if __name__ == '__main__': main(sys.argv[1:]) From anthonybaxter at users.sourceforge.net Thu Jan 16 22:46:58 2003 From: anthonybaxter at users.sourceforge.net (Anthony Baxter) Date: Fri Jan 17 01:47:01 2003 Subject: [Spambayes-checkins] spambayes MANIFEST.in,NONE,1.1 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv586 Added Files: MANIFEST.in Log Message: distutils manifest file --- NEW FILE: MANIFEST.in --- recursive-include spambayes *.py recursive-include pspam *.py *.txt *.ini *.sh recursive-include hammie *.py *.sh *.txt procmailrc recursive-include Outlook2000 *.py *.txt *.ini *.html *.bmp recursive-include utilities *.py *.txt recursive-include testtools *.py *.txt include *.txt *.py *.gif From anthonybaxter at users.sourceforge.net Thu Jan 16 22:47:37 2003 From: anthonybaxter at users.sourceforge.net (Anthony Baxter) Date: Fri Jan 17 01:47:39 2003 Subject: [Spambayes-checkins] spambayes/spambayes __init__.py,1.2,1.3 Message-ID: Update of /cvsroot/spambayes/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv633/spambayes Modified Files: __init__.py Log Message: prerelease number 1 Index: __init__.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/spambayes/__init__.py,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** __init__.py 14 Jan 2003 05:38:20 -0000 1.2 --- __init__.py 17 Jan 2003 06:47:35 -0000 1.3 *************** *** 1,3 **** # package marker. ! __version__ = '1.0a0' --- 1,3 ---- # package marker. ! 
__version__ = '1.0a1' From anthonybaxter at users.sourceforge.net Thu Jan 16 23:34:46 2003 From: anthonybaxter at users.sourceforge.net (Anthony Baxter) Date: Fri Jan 17 02:34:50 2003 Subject: [Spambayes-checkins] website download.ht,NONE,1.1 developer.ht,1.6,1.7 index.ht,1.8,1.9 links.h,1.4,1.5 style.css,1.3,1.4 Message-ID: Update of /cvsroot/spambayes/website In directory sc8-pr-cvs1:/tmp/cvs-serv4478 Modified Files: developer.ht index.ht links.h style.css Added Files: download.ht Log Message: update for first pre-release. --- NEW FILE: download.ht --- Title: SpamBayes: Download Author-Email: spambayes@python.org Author: spambayes

    Source Releases

    The first pre-release of version 1.0 of the spambayes project is available. Download version 1.0a1 from the sourceforge Files page as either a gzipped tarball or a zip file of the source files.

    The primary goal of this pre-release is to shake out any packaging, installation, or integration issues that might be lurking. Feedback to spambayes@python.org.

    Prerequisites:

      Either:
    • Python 2.2.2, Python 2.3a1, or a CVS build of python, or
    • Python 2.2, 2.2.1, plus the latest email package.

    Once you've downloaded and unpacked the source archive, do the regular setup.py build; setup.py install dance, then:

    • if you're running Outlook2000, change to the Outlook2000 directory and run 'addin.py'. See the 'about.html' file in the Outlook2000 directory for more.
    • otherwise, consult the INTEGRATION.txt file in the package and choose the method that suits your setup best.

    Binary Releases

    None as yet.

    CVS Access

    The code is currently available from sourceforge's CVS server - see here for more details.

    Update:

    (2003-01-14 14:04:19 - Project CVS Services) As of 2003-01-14, pserver-based CVS repository access and ViewCVS (web-based) CVS repository access have been taken offline as to stabilize CVS server performance for developers. These services will be re-enabled as soon as the underlying scalability issues have been analyzed and resolved (as soon as 2003-01-15, if possible). Additional updates will be posted to the Site Status page as they become available. Your patience is appreciated.

    Nightly snapshots

    A nightly snapshot is available: spambayes-nightly.tar.gz.

    Note that due to some Sourceforge issues, this is currently being built with a "manual cron" (i.e. when I remember). Once Sourceforge's CVS issues are resolved, this will be available automatically.

    Index: developer.ht =================================================================== RCS file: /cvsroot/spambayes/website/developer.ht,v retrieving revision 1.6 retrieving revision 1.7 diff -C2 -d -r1.6 -r1.7 *** developer.ht 14 Nov 2002 19:56:44 -0000 1.6 --- developer.ht 17 Jan 2003 07:34:44 -0000 1.7 *************** *** 6,28 ****

    So you want to get involved?

    Running the code

    !

    First off, note that as yet, this project hasn't really produced ! code that's suitable for end-user deployment - it's more of an research ! exercise. That's not to say you can't use this stuff, but if it ! eats your email and signs you up to news.admin.net-abuse.email, don't ! come crying <wink>. !

    !

    This project works with either the absolute bleeding edge of python ! code, available from CVS on ! sourceforge, or with Python 2.2 (not 2.1.x or earlier). Note that ! you really want to be running Python 2.2.2 or Python 2.3cvs to get the ! latest email package. If you ! really plan on using an older version of Python, you'll need to ! download ! and install the email package (unpack the tarball and read the README ! file for more details).

    !

    The spambayes code itself is also available via CVS

    --- 6,20 ----

    So you want to get involved?

    Running the code

    !

    This project works with either Python 2.2.2, Python 2.3 (currently in alpha), ! or on the bleeding edge of python code, ! available from CVS on ! sourceforge. It will not work on python 2.1.x or earlier, nor is it ever ! likely to do so.

    !

    If you're running Python 2.2 or 2.2.1, you'll need to separately fetch ! the latest email package. You can get ! this from ! sourceforge (you'll need version 2.4.3 or later).

    !

    The spambayes code itself is also available via CVS, or from the download page.

    Index: index.ht =================================================================== RCS file: /cvsroot/spambayes/website/index.ht,v retrieving revision 1.8 retrieving revision 1.9 diff -C2 -d -r1.8 -r1.9 *** index.ht 17 Jan 2003 06:27:34 -0000 1.8 --- index.ht 17 Jan 2003 07:34:44 -0000 1.9 *************** *** 3,6 **** --- 3,9 ---- Author: SpamBayes + +

    News

    +

    First pre-release available. See the download page for more.

    What is SpamBayes?

    *************** *** 87,94 **** !

    The code is currently available ! via CVS - ! note that it's not quite yet suitable for non-technical end-users, but ! for people interested in experimenting. There's also a nightly snapshot available for download.

    --- 90,96 ---- !

    The code is currently available from a variety of methods from the ! downloads page. The current release is ! 1.0 prerelease 1.

    *************** *** 96,105 **** There are now a couple of end-user applications available for those excited by the bleeding edge - these are detailed on the ! Applications page.

    - -

    At the moment, these packages require CVS access to install, - but we're currently working on packaging these up for easy end-user - use - stay tuned!

    Mailing list

    --- 98,104 ---- There are now a couple of end-user applications available for those excited by the bleeding edge - these are detailed on the ! Applications page, and available as ! part of the source download.

    Mailing list

    Index: links.h =================================================================== RCS file: /cvsroot/spambayes/website/links.h,v retrieving revision 1.4 retrieving revision 1.5 diff -C2 -d -r1.4 -r1.5 *** links.h 13 Jan 2003 03:57:58 -0000 1.4 --- links.h 17 Jan 2003 07:34:44 -0000 1.5 *************** *** 6,7 **** --- 6,11 ----
  • Developers
  • Related +

    Getting the code

    +
  • Releases +
  • Snapshots +
  • CVS access Index: style.css =================================================================== RCS file: /cvsroot/spambayes/website/style.css,v retrieving revision 1.3 retrieving revision 1.4 diff -C2 -d -r1.3 -r1.4 *** style.css 13 Jan 2003 04:44:55 -0000 1.3 --- style.css 17 Jan 2003 07:34:44 -0000 1.4 *************** *** 6,9 **** --- 6,10 ---- margin-right: 15%; font-family: geneva, verdana, arial, "ms sans serif", sans-serif; + font-size: 12pt; } From anthonybaxter at users.sourceforge.net Thu Jan 16 23:44:34 2003 From: anthonybaxter at users.sourceforge.net (Anthony Baxter) Date: Fri Jan 17 02:44:37 2003 Subject: [Spambayes-checkins] website style.css,1.4,1.5 Message-ID: Update of /cvsroot/spambayes/website In directory sc8-pr-cvs1:/tmp/cvs-serv5269 Modified Files: style.css Log Message: make bg go whole length of page Index: style.css =================================================================== RCS file: /cvsroot/spambayes/website/style.css,v retrieving revision 1.4 retrieving revision 1.5 diff -C2 -d -r1.4 -r1.5 *** style.css 17 Jan 2003 07:34:44 -0000 1.4 --- style.css 17 Jan 2003 07:44:32 -0000 1.5 *************** *** 29,33 **** IMG { border: 0; } ! TABLE.sidebar { background-image: url("./images/gutter.png"); } TD.normalSidebar a:link { color: #222222; } --- 29,33 ---- IMG { border: 0; } ! TABLE.sidebar,TD.sidebar { background-image: url("./images/gutter.png"); } TD.normalSidebar a:link { color: #222222; } From anthonybaxter at users.sourceforge.net Fri Jan 17 03:53:41 2003 From: anthonybaxter at users.sourceforge.net (Anthony Baxter) Date: Fri Jan 17 06:53:45 2003 Subject: [Spambayes-checkins] website index.ht,1.9,1.10 Message-ID: Update of /cvsroot/spambayes/website In directory sc8-pr-cvs1:/tmp/cvs-serv26293 Modified Files: index.ht Log Message: damn. 
one lousy s extra Index: index.ht =================================================================== RCS file: /cvsroot/spambayes/website/index.ht,v retrieving revision 1.9 retrieving revision 1.10 diff -C2 -d -r1.9 -r1.10 *** index.ht 17 Jan 2003 07:34:44 -0000 1.9 --- index.ht 17 Jan 2003 11:53:39 -0000 1.10 *************** *** 91,95 ****

    The code is currently available from a variety of methods from the ! downloads page. The current release is 1.0 prerelease 1.

    --- 91,95 ----

    The code is currently available from a variety of methods from the ! downloads page. The current release is 1.0 prerelease 1.

    From anthonybaxter at users.sourceforge.net Fri Jan 17 08:20:19 2003 From: anthonybaxter at users.sourceforge.net (Anthony Baxter) Date: Fri Jan 17 11:20:22 2003 Subject: [Spambayes-checkins] website/images logo.png,1.1,1.2 Message-ID: Update of /cvsroot/spambayes/website/images In directory sc8-pr-cvs1:/tmp/cvs-serv19663 Modified Files: logo.png Log Message: fix the capitalisation. *sigh* Index: logo.png =================================================================== RCS file: /cvsroot/spambayes/website/images/logo.png,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 Binary files /tmp/cvs1yXvIp and /tmp/cvsOFEWaF differ From anthonybaxter at users.sourceforge.net Fri Jan 17 08:57:42 2003 From: anthonybaxter at users.sourceforge.net (Anthony Baxter) Date: Fri Jan 17 11:57:48 2003 Subject: [Spambayes-checkins] website background.ht,1.10,1.11 Message-ID: Update of /cvsroot/spambayes/website In directory sc8-pr-cvs1:/tmp/cvs-serv27646 Modified Files: background.ht Log Message: more on tokenizing. added some stuff on testing and training. Index: background.ht =================================================================== RCS file: /cvsroot/spambayes/website/background.ht,v retrieving revision 1.10 retrieving revision 1.11 diff -C2 -d -r1.10 -r1.11 *** background.ht 17 Jan 2003 04:14:38 -0000 1.10 --- background.ht 17 Jan 2003 16:57:36 -0000 1.11 *************** *** 44,48 **** does best, of course.

    !

    Combining and Scoring

    --- 44,69 ---- does best, of course.

    !

    Tokenizing of headers is an area of much experimentation. At the ! moment, the only source of historical information is in the source ! code, as comments. At the moment, a bunch of email headers are broken ! apart, and tagged with the name of the header and the token. Recipient ! (to/cc) headers are broken apart in a variety of ways - for instance, ! we count the number of recipients, and generate a token for the nearest ! power of two to the number. Optionally, "Received" lines are parsed for ! various tokens.

    ! !

    It's likely that remaining "easy wins" in the tokenizing are buried ! in some new way of tokenizing the headers. If you have ideas, try them ! out, and test them. For instance, no-one's tried ! generating a token that measures the difference in the time in the Date: ! header, and the time that the message arrived at the user's mailbox. !

    ! !

    Another issue that cropped up quite quickly was that certain types ! of clues were highly correlated. For instance, if the message was a ! HTML spam, you'd get things like "font", "table", "#FF0000" and the ! like. This caused any other HTML spam to be marked solidly as spam, ! but also unnecessarily penalised the non-spam HTML messages. In the ! end, the best results were found by stripping out most HTML clues.

    Combining and Scoring

    *************** *** 138,145 ****

    Training

    !

    TBD

    !

    Testing

    !

    TBD, describe test setup

    One big difference between spambayes and many other open source projects is that there's a large amount of testing done. Before any change to the tokenizer --- 159,190 ----

    Training

    !

    Training is the process of feeding known ham and spam into the tokenizer ! to generate the probabilities that the classifier then uses to generate its ! decisions. A variety of approaches to training have been tried - things that ! we found: !

      !
    • the system generates remarkably good results on very few trained messages, ! although obviously, the more the better. !
    • training on an extremely large number of messages actually starts to produce ! a slightly poorer result (once you're training on thousands and thousands of ! messages). !
    • severe imbalance between the number of ham and spam being trained causes ! some problems - there's a flag 'experimental_ham_spam_imbalance_adjustment' ! that you can set in your bayescustomize.ini file to attempt to compensate for ! this. !
    • It's really important that you're careful about selecting your ! training sets to be from the same sources, wherever possible. Several ! people got caught out by training on current spam, and old ham. The system ! very quickly picked up that anything with a 'Date:' field with a year of ! 2001 was ham. If you can't get ham and spam from the same source, make sure ! the tokenizer isn't going to pick up on the non-clues. !
    • "mistake-based training" - where you only train on messages that the ! system got wrong - results in the system taking a lot longer to get really ! good at what it does. You're better off feeding it the things it gets ! right, as well as the things it gets wrong. !
    !

    !

    Testing

    One big difference between spambayes and many other open source projects is that there's a large amount of testing done. Before any change to the tokenizer *************** *** 152,156 **** results than something that just took a brute force "just grab everything" approach.

    !

    Mailing list archives

    --- 197,210 ---- results than something that just took a brute force "just grab everything" approach.

    ! !

    The testing architecture has been in place since the start of the project. ! The most thorough test script is Tim's cross validation script, ! timcv.py. This script takes a number of sets of spam and ham. It ! trains on all but the first set of spam and ham, then fires the first set ! of spam and ham through, and checks how it goes at rating them. It then ! repeats this for each set in turn - train on all but the set being tested, ! then test the set. !

    !

    Mailing list archives

    From anthonybaxter at users.sourceforge.net Fri Jan 17 09:00:56 2003 From: anthonybaxter at users.sourceforge.net (Anthony Baxter) Date: Fri Jan 17 12:00:58 2003 Subject: [Spambayes-checkins] website background.ht,1.11,1.12 download.ht,1.1,1.2 Message-ID: Update of /cvsroot/spambayes/website In directory sc8-pr-cvs1:/tmp/cvs-serv28626 Modified Files: background.ht download.ht Log Message: more miscellaneous cleanup Index: background.ht =================================================================== RCS file: /cvsroot/spambayes/website/background.ht,v retrieving revision 1.11 retrieving revision 1.12 diff -C2 -d -r1.11 -r1.12 *** background.ht 17 Jan 2003 16:57:36 -0000 1.11 --- background.ht 17 Jan 2003 17:00:51 -0000 1.12 *************** *** 44,54 **** does best, of course.

    !

    Tokenizing of headers is an area of much experimentation. At the ! moment, the only source of historical information is in the source ! code, as comments. At the moment, a bunch of email headers are broken ! apart, and tagged with the name of the header and the token. Recipient ! (to/cc) headers are broken apart in a variety of ways - for instance, ! we count the number of recipients, and generate a token for the nearest ! power of two to the number. Optionally, "Received" lines are parsed for various tokens.

    --- 44,55 ---- does best, of course.

    !

    Tokenizing of headers is an area of much experimentation. Unfortunately ! the only current source of historical information about previous experiments ! is in the source code, as comments, or in the mailing list archives. ! At the moment, a bunch of email headers are broken apart, and tagged ! with the name of the header and the token. Recipient (to/cc) headers ! are broken apart in a variety of ways - for instance, we count the ! number of recipients, and generate a token for the nearest power of ! two to the number. Optionally, "Received" lines are parsed for various tokens.

    Index: download.ht =================================================================== RCS file: /cvsroot/spambayes/website/download.ht,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** download.ht 17 Jan 2003 07:34:44 -0000 1.1 --- download.ht 17 Jan 2003 17:00:52 -0000 1.2 *************** *** 1,8 **** Title: SpamBayes: Download ! Author-Email: spambayes@python.org ! Author: spambayes

    Source Releases

    !

    The first pre-release of version 1.0 of the spambayes project is available. Download version 1.0a1 from the sourceforge Files page as either a gzipped tarball or a zip file of the source files.

    --- 1,8 ---- Title: SpamBayes: Download ! Author-Email: SpamBayes@python.org ! Author: SpamBayes

    Source Releases

    !

    The first pre-release of version 1.0 of the SpamBayes project is available. Download version 1.0a1 from the sourceforge Files page as either a gzipped tarball or a zip file of the source files.

    From richiehindle at users.sourceforge.net Fri Jan 17 10:25:42 2003 From: richiehindle at users.sourceforge.net (Richie Hindle) Date: Fri Jan 17 13:25:45 2003 Subject: [Spambayes-checkins] spambayes INTEGRATION.txt,1.4,1.5 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv2771 Modified Files: INTEGRATION.txt Log Message: Fixed typo "buttton" - thanks to papaDoc. Index: INTEGRATION.txt =================================================================== RCS file: /cvsroot/spambayes/spambayes/INTEGRATION.txt,v retrieving revision 1.4 retrieving revision 1.5 diff -C2 -d -r1.4 -r1.5 *** INTEGRATION.txt 10 Jan 2003 09:06:06 -0000 1.4 --- INTEGRATION.txt 17 Jan 2003 18:25:39 -0000 1.5 *************** *** 51,55 **** You also need version 2.4.3 or above of the Python "email" package. If you're running Python 2.3 (which at the time of writing is available ! from SourceForge CVS, or as the alpha version 2.3a1, available from python.org) then you already have this. If not, you can download it from http://mimelib.sf.net and install it - unpack the archive, cd to --- 51,55 ---- You also need version 2.4.3 or above of the Python "email" package. If you're running Python 2.3 (which at the time of writing is available ! from SourceForge CVS, or as the alpha version 2.3a1, available from python.org) then you already have this. If not, you can download it from http://mimelib.sf.net and install it - unpack the archive, cd to *************** *** 181,185 **** -------------------------------------------------------- ! The minimum you need too do to get started is create a bayescustomize.ini containing the following: --- 181,185 ---- -------------------------------------------------------- ! 
The minimum you need to do to get started is create a bayescustomize.ini containing the following: From richiehindle at users.sourceforge.net Fri Jan 17 10:29:47 2003 From: richiehindle at users.sourceforge.net (Richie Hindle) Date: Fri Jan 17 13:29:49 2003 Subject: [Spambayes-checkins] spambayes/spambayes/resources - New directory Message-ID: Update of /cvsroot/spambayes/spambayes/spambayes/resources In directory sc8-pr-cvs1:/tmp/cvs-serv5446/resources Log Message: Directory /cvsroot/spambayes/spambayes/spambayes/resources added to the repository From montanaro at users.sourceforge.net Fri Jan 17 12:02:40 2003 From: montanaro at users.sourceforge.net (Skip Montanaro) Date: Fri Jan 17 15:02:43 2003 Subject: [Spambayes-checkins] spambayes/spambayes Corpus.py,1.2,1.3 Message-ID: Update of /cvsroot/spambayes/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv21613 Modified Files: Corpus.py Log Message: * prevent infinite loop in __getattr__ * add a get() method * punt on malformed messages - put all in payload & let header be empty Index: Corpus.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/spambayes/Corpus.py,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** Corpus.py 14 Jan 2003 05:38:20 -0000 1.2 --- Corpus.py 17 Jan 2003 20:02:37 -0000 1.3 *************** *** 201,210 **** self.addMessage(msg) def __getitem__(self, key): '''Corpus is a dictionary''' ! amsg = self.msgs[key] ! if not amsg: amsg = self.makeMessage(key) # lazy init, saves memory self.cacheMessage(amsg) --- 201,216 ---- self.addMessage(msg) + def get(self, key, default=None): + try: + return self[key] + except KeyError: + return default + def __getitem__(self, key): '''Corpus is a dictionary''' ! amsg = self.msgs.get(key) ! if amsg is None: amsg = self.makeMessage(key) # lazy init, saves memory self.cacheMessage(amsg) *************** *** 280,284 **** if attributeName in ('hdrtxt', 'payload'): self.load() ! 
return getattr(self, attributeName) def load(self): --- 286,293 ---- if attributeName in ('hdrtxt', 'payload'): self.load() ! try: ! return self.__dict__[attributeName] ! except KeyError: ! raise AttributeError, attributeName def load(self): *************** *** 325,328 **** --- 334,341 ---- self.payload = bmatch.group(2) self.hdrtxt = sub[:bmatch.start(2)] + else: + # malformed message - punt + self.payload = sub + self.hdrtxt = "" def getSubstance(self): From richiehindle at users.sourceforge.net Fri Jan 17 12:21:41 2003 From: richiehindle at users.sourceforge.net (Richie Hindle) Date: Fri Jan 17 15:21:46 2003 Subject: [Spambayes-checkins] spambayes OptionConfig.py,1.3,1.4 pop3proxy.py,1.35,1.36 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv29441 Modified Files: OptionConfig.py pop3proxy.py Log Message: You can now run pop3proxy.py with no POP3 servers, and just get the web interface. I'll split it into different source files at some point so that the naming is more sensible. This should let Skip use it instead of his proxytrainer.py. Tim Stone's web-based configurator is now a part of the main web interface. The fact that you can run the thing without any POP3 proxies set up, and that the config page is now a part of it, means that you don't need to touch bayescustomize.ini, even when starting from scratch. Run pop3proxy.py, hit the Configuration link, enter your POP3 details, and you're away. There's a new architecture for pop3proxy and the web interface. The HTML is now all in resources/ui.html, with the pieces being pulled out and stitched together at runtime. All the socket/async code has been pulled out into a library module, so there's only application code left in pop3proxy.py (it's still a combination of web UI and POP3 proxy, which I'll address RSN). I've added a new directory 'resources' for the HTML and GIFs.
These are packaged using Mike Fletcher's excellent ResourcePackage tool, but you don't need to know about that, or have ResourcePackage installed, unless you want to change the resources. I've added a new option html_ui_allow_remote_connections, which can be set to False to provide some measure of privacy (I'm loath to say 'security' for fear of bugs 8-) Index: OptionConfig.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/OptionConfig.py,v retrieving revision 1.3 retrieving revision 1.4 diff -C2 -d -r1.3 -r1.4 *** OptionConfig.py 15 Jan 2003 22:39:42 -0000 1.3 --- OptionConfig.py 17 Jan 2003 20:21:02 -0000 1.4 *************** *** 7,19 **** This module implements a browser based Spambayes option file configuration utility. Users may use the pages in this application to customize the ! settings in the bayescustomize.ini file. This does not support the BAYESCUSTOMIZE environment variable. Is this even used anywhere? ! To execute this module, just invoke OptionConfig.py ! The port number is the port the http server will listen on, and defaults to ! 8000. Then point your browser at http://locahost:8000 (or whatever port you [...1005 lines suppressed...] ! bcini.remove_option(section, option) ! except ConfigParser.NoSectionError: ! pass # Already missing. ! o = open(inipath, 'wt') bcini.write(o) o.close() ! def run(port): ! httpServer = Dibbler.HTTPServer(port) ! httpServer.register(OptionsConfigurator()) ! Dibbler.run(launchBrowser=True) if __name__ == '__main__': ! if len(sys.argv) > 1: ! port = int(sys.argv[1]) ! else: ! port = 8000 ! 
run(port) Index: pop3proxy.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/pop3proxy.py,v retrieving revision 1.35 retrieving revision 1.36 diff -C2 -d -r1.35 -r1.36 *** pop3proxy.py 16 Jan 2003 16:22:49 -0000 1.35 --- pop3proxy.py 17 Jan 2003 20:21:04 -0000 1.36 *************** *** 63,67 **** (this would mean using the extra X-Hammie header by default). o Add Today and Refresh buttons on the Review page. - o "There are no untrained messages to display. Return Home." --- 63,66 ---- *************** *** 91,95 **** o Online manual. o Links to project homepage, mailing list, etc. [...1512 lines suppressed...] *** 1635,1646 **** state.servers = [(args[0], int(args[1]))] ! if not state.servers or not state.servers[0][0]: ! print >>sys.stderr, \ ! ("Error: You must give a POP3 server name, either in\n" ! "bayescustomize.ini as pop3proxy_servers or on the\n" ! "command line. pop3server.py -h prints a usage message.") ! else: ! state.buildServerStrings() ! main(state.servers, state.proxyPorts, state.uiPort, state.launchUI) else: --- 1458,1463 ---- state.servers = [(args[0], int(args[1]))] ! state.buildServerStrings() ! main(state.servers, state.proxyPorts, state.uiPort, state.launchUI) else: From richiehindle at users.sourceforge.net Fri Jan 17 12:21:44 2003 From: richiehindle at users.sourceforge.net (Richie Hindle) Date: Fri Jan 17 15:21:50 2003 Subject: [Spambayes-checkins] spambayes/spambayes Dibbler.py,NONE,1.1 PyMeldLite.py,NONE,1.1 Options.py,1.4,1.5 Message-ID: Update of /cvsroot/spambayes/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv29441/spambayes Modified Files: Options.py Added Files: Dibbler.py PyMeldLite.py Log Message: You can now run pop3proxy.py with no POP3 servers, and just get the web interface. I'll split it into different source files at some point so that the naming is more sensible. This should let Skip use it instead of his proxytrainer.py. 
Time Stone's web-based configurator is now a part of the main web interface. The fact that you can run the thing without any POP3 proxies set up, and that the config page is now a part of it, means that you don't need to touch bayescustomize.ini, even when starting from scratch. Run pop3proxy.py, hit the Configuration link, enter your POP3 details, and you're away. There's a new architecture for pop3proxy and the web interface. The HTML is now all in resources/ui.html, with the pieces being pulled out and stitched together at runtime. All the socket/async code has been pulled out into a library module, so there's only application code left in pop3proxy.py (it's still a combination of web UI and POP3 proxy, which I'll address RSN). I've added a new directory 'resources' for the HTML and GIFs. These are packaged using Mike Fletcher's excellent ResourcePackage tool, but you don't need to know about that, or have ResourcePackage installed, unless you want to change the resources. I've added a new option html_ui_allow_remote_connections, which can be set to False to provide some measure of privacy (I'm loath to say 'security' for fear of bugs 8-) --- NEW FILE: Dibbler.py --- """ *Introduction* Dibbler is a Python web application framework. It lets you create web-based applications by writing independent plug-in modules that don't require any networking code. Dibbler takes care of the HTTP side of things, leaving you to write the application code. *Plugins and Methlets* Dibbler uses a system of plugins to implement the application logic. Each page maps to a 'methlet', which is a method of a plugin object that serves that page, and is named after the page it serves. The address `http://server/spam` calls the methlet `onSpam`. `onHome` is a reserved methlet name for the home page, `http://server/`. For resources that need a file extension (eg. images) you can use a URL such as `http://server/eggs.gif` to map to the `onEggsGif` methlet. 
All the registered plugins are searched for the appropriate methlet, so you can combine multiple plugins to build your application. A methlet needs to call `self.writeOKHeaders('text/html')` followed by `self.write(content)`. You can pass whatever content-type you like to `writeOKHeaders`, so serving images, PDFs, etc. is no problem. If a methlet wants to return an HTTP error code, it should call (for example) `self.writeError(403, "Forbidden")` instead of `writeOKHeaders` and `write`. If it wants to write its own headers (for instance to return a redirect) it can simply call `write` with the full HTTP response. If a methlet raises an exception, it is automatically turned into a "500 Server Error" page with a full traceback in it. *Parameters* Methlets can take parameters, the values of which are taken from form parameters submitted by the browser. So if your form says `
    ...` then your methlet should look like `def onSubscribe(self, email=None)`. It's good practice to give all the parameters default values, in case the user navigates to that URL without submitting a form, or submits the form without filling in any parameters. If you have lots of parameters, or their names are determined at runtime, you can define your methlet like this: `def onComplex(self, **params)` to get a dictionary of parameters. *Example* Here's a web application server that serves a calendar for a given year: >>> import Dibbler, calendar >>> class Calendar(Dibbler.HTTPPlugin): ... _form = '''

    Calendar Server

    ... ... Year: ... ...
    %s
    ''' ... ... def onHome(self, year=None): ... if year: ... result = calendar.calendar(int(year)) ... else: ... result = "" ... self.writeOKHeaders('text/html') ... self.write(self._form % result) ... >>> httpServer = Dibbler.HTTPServer(8888) >>> httpServer.register(Calendar()) >>> Dibbler.run(launchBrowser=True) Your browser will start, and you can ask for a calendar for the year of your choice. If you don't want to start the browser automatically, just call `run()` with no arguments - the application is available at http://localhost:8888/ . You'll have to kill the server manually because it provides no way to stop it; a real application would have some kind of 'shutdown' methlet that called `sys.exit()`. By combining Dibbler with an HTML manipulation library like PyMeld (shameless plug - see http://entrian.com/PyMeld for details) you can keep the HTML and Python code separate. *Building applications* You can run several plugins together like this: >>> httpServer = Dibbler.HTTPServer() >>> httpServer.register(plugin1, plugin2, plugin3) >>> Dibbler.run() ...so many plugin objects, each implementing a different set of pages, can cooperate to implement a web application. See also the `HTTPServer` documentation for details of how to run multiple `Dibbler` environments simultaneously in different threads. *Controlling connections* There are times when your code needs to be informed the moment an incoming connection is received, before any HTTP conversation begins. For instance, you might want to only accept connections from `localhost` for security reasons. If this is the case, your plugin should implement the `onIncomingConnection` method. This will be passed the incoming socket before any reads or writes have taken place, and should return True to allow the connection through or False to reject it. 
Here's an implementation of the `localhost`-only idea: >>> def onIncomingConnection(self, clientSocket): >>> return clientSocket.getpeername()[0] == clientSocket.getsockname()[0] *Advanced usage: Dibbler Contexts* If you want to run several independent Dibbler environments (in different threads for example) then each should use its own `Context`. Normally you'd say something like: >>> httpServer = Dibbler.HTTPServer() >>> httpServer.register(MyPlugin()) >>> Dibbler.run() but that's only safe to do from one thread. Instead, you can say: >>> myContext = Dibbler.Context() >>> httpServer = Dibbler.HTTPServer(context=myContext) >>> httpServer.register(MyPlugin()) >>> Dibbler.run(myContext) in as many threads as you like. *Dibbler and asyncore* If this section means nothing to you, you can safely ignore it. Dibbler is built on top of Python's asyncore library, which means that it integrates into other asyncore-based applications, and you can write other asyncore-based components and run them as part of the same application. By default, Dibbler uses the default asyncore socket map. This means that `Dibbler.run()` also runs your asyncore-based components, provided they're using the default socket map. If you want to tell Dibbler to use a different socket map, either to co-exist with other asyncore-based components using that map or to insulate Dibbler from such components by using a different map, you need to use a `Dibbler.Context`. If you're using your own socket map, give it to the context: `context = Dibbler.Context(myMap)`. If you want Dibbler to use its own map: `context = Dibbler.Context({})`. You can either call `Dibbler.run(context)` to run the async loop, or call `asyncore.loop()` directly - the only difference is that the former has a few more options, like launching the web browser automatically. *Self-test* Running `Dibbler.py` directly as a script runs the example calendar server plus a self-test. 
""" # Dibbler is released under the Python Software Foundation license; see # http://www.python.org/ __author__ = "Richie Hindle " __credits__ = "Tim Stone" try: import cStringIO as StringIO except ImportError: import StringIO import os, sys, re, time, traceback import socket, asyncore, asynchat, cgi, urlparse, webbrowser try: True, False except NameError: # Maintain compatibility with Python 2.2 True, False = 1, 0 class BrighterAsyncChat(asynchat.async_chat): """An asynchat.async_chat that doesn't give spurious warnings on receiving an incoming connection, lets SystemExit cause an exit, can flush its output, and will correctly remove itself from a non-default socket map on `close()`.""" def __init__(self, conn=None, map=None): """See `asynchat.async_chat`.""" asynchat.async_chat.__init__(self, conn) self._map = map def handle_connect(self): """Suppresses the asyncore "unhandled connect event" warning.""" pass def handle_error(self): """Let SystemExit cause an exit.""" type, v, t = sys.exc_info() if type == socket.error and v[0] == 9: # Why? Who knows... pass elif type == SystemExit: raise else: asynchat.async_chat.handle_error(self) def flush(self): """Flush everything in the output buffer.""" while self.producer_fifo or self.ac_out_buffer: self.initiate_send() def close(self): """Remove this object from the correct socket map.""" self.del_channel(self._map) self.socket.close() class Context: """See the main documentation for details of `Dibbler.Context`.""" def __init__(self, asyncMap=asyncore.socket_map): self._HTTPPort = None # Stores the port for `run(launchBrowser=True)` self._map = asyncMap _defaultContext = Context() class Listener(asyncore.dispatcher): """Generic listener class used by all the different types of server. 
Listens for incoming socket connections and calls a factory function to create handlers for them.""" def __init__(self, port, factory, factoryArgs, socketMap=_defaultContext._map): """Creates a listener object, which will listen for incoming connections when Dibbler.run is called: o port: The TCP/IP port to listen on o factory: The function to call to create a handler (can be a class name). o factoryArgs: The arguments to pass to the handler factory. For proper context support, this should include a `context` argument (or a `socketMap` argument for pure asyncore listeners). The incoming socket will be prepended to this list, and passed as the first argument. See `HTTPServer` for an example. o socketMap: Optional. The asyncore socket map to use. If you're using a `Dibbler.Context`, pass context._map. See `HTTPServer` for an example `Listener` - it's a good deal smaller than this description!""" asyncore.dispatcher.__init__(self, map=socketMap) self.socketMap = socketMap self.factory = factory self.factoryArgs = factoryArgs s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) s.setblocking(False) self.set_socket(s, self.socketMap) self.set_reuse_addr() self.bind(('', port)) self.listen(5) def handle_accept(self): """Asyncore override.""" # If an incoming connection is instantly reset, eg. by following a # link in the web interface then instantly following another one or # hitting stop, handle_accept() will be triggered but accept() will # return None. result = self.accept() if result: clientSocket, clientAddress = result args = [clientSocket] + list(self.factoryArgs) self.factory(*args) class HTTPServer(Listener): """A web server with which you can register `HTTPPlugin`s to serve up your content - see `HTTPPlugin` for detailed documentation and examples. `port` specifies the TCP/IP port on which to run, defaulting to port 80. `context` optionally specifies a `Dibbler.Context` for the server. 
""" def __init__(self, port=80, context=_defaultContext): """Create an `HTTPServer` for the given port.""" Listener.__init__(self, port, _HTTPHandler, (self, context), context._map) self._plugins = [] context._HTTPPort = port def register(self, *plugins): """Registers one or more `HTTPPlugin`-derived objects with the server.""" for plugin in plugins: self._plugins.append(plugin) class _HTTPHandler(BrighterAsyncChat): """This is a helper for the HTTP server class - one of these is created for each incoming request, and does the job of decoding the HTTP traffic and driving the plugins.""" def __init__(self, clientSocket, server, context): # Grumble: asynchat.__init__ doesn't take a 'map' argument, # hence the two-stage construction. BrighterAsyncChat.__init__(self, map=context._map) BrighterAsyncChat.set_socket(self, clientSocket, context._map) self._server = server self._request = '' self.set_terminator('\r\n\r\n') # Because a methlet is likely to call `writeOKHeaders` before doing # anything else, an unexpected exception won't send back a 500, which # is poor. So we buffer any sent headers until either a plain `write` # happens or the methlet returns. self._bufferedHeaders = [] self._headersWritten = False # Tell the plugins about the connection, letting them veto it. for plugin in self._server._plugins: if not plugin.onIncomingConnection(clientSocket): self.close() def collect_incoming_data(self, data): """Asynchat override.""" self._request = self._request + data def found_terminator(self): """Asynchat override.""" # Parse the HTTP request. requestLine, headers = (self._request+'\r\n').split('\r\n', 1) try: method, url, version = requestLine.strip().split() except ValueError: self.pushError(400, "Malformed request: '%s'" % requestLine) self.close_when_done() return # Parse the URL, and deal with POST vs. GET requests. 
method = method.upper() unused, unused, path, unused, query, unused = urlparse.urlparse(url) cgiParams = cgi.parse_qs(query, keep_blank_values=True) if self.get_terminator() == '\r\n\r\n' and method == 'POST': # We need to read the body - set a numeric async_chat terminator # equal to the Content-Length. match = re.search(r'(?i)content-length:\s*(\d+)', headers) contentLength = int(match.group(1)) if contentLength > 0: self.set_terminator(contentLength) self._request = self._request + '\r\n\r\n' return # Have we just read the body of a POSTed request? Decode the body, # which will contain parameters and possibly uploaded files. if type(self.get_terminator()) is type(1): self.set_terminator('\r\n\r\n') body = self._request.split('\r\n\r\n', 1)[1] match = re.search(r'(?i)content-type:\s*([^\r\n]+)', headers) contentTypeHeader = match.group(1) contentType, pdict = cgi.parse_header(contentTypeHeader) if contentType == 'multipart/form-data': # multipart/form-data - probably a file upload. bodyFile = StringIO.StringIO(body) cgiParams.update(cgi.parse_multipart(bodyFile, pdict)) else: # A normal x-www-form-urlencoded. cgiParams.update(cgi.parse_qs(body, keep_blank_values=True)) # Convert the cgi params into a simple dictionary. params = {} for name, value in cgiParams.iteritems(): params[name] = value[0] # Find and call the methlet. '/eggs.gif' becomes 'onEggsGif'. if path == '/': path = '/Home' pieces = path[1:].split('.') name = 'on' + ''.join([piece.capitalize() for piece in pieces]) for plugin in self._server._plugins: if hasattr(plugin, name): # The plugin's APIs (`write`, etc) reflect back to us via # `plugin._handler`. plugin._handler = self try: # Call the methlet. getattr(plugin, name)(**params) if self._bufferedHeaders: # The methlet returned without writing anything other # than headers. This isn't unreasonable - it might # have written a 302 or something. 
Flush the buffered # headers self.write(None) except: # The methlet raised an exception - send the traceback to # the browser, unless it's SystemExit in which case we let # it go. eType, eValue, eTrace = sys.exc_info() if eType == SystemExit: ##self.shutdown(2) raise message = """

    500 Server error

    %s
    """ details = traceback.format_exception(eType, eValue, eTrace) details = '\n'.join(details) self.writeError(500, message % cgi.escape(details)) plugin._handler = None break else: self.onUnknown(path, params) # `close_when_done` and `Connection: close` ensure that we don't # support keep-alives or pipelining. There are problems with some # browsers, for instance with extra characters being appended after # the body of a POSTed request. self.close_when_done() def onUnknown(self, path, params): """Handler for unknown URLs. Returns a 404 page.""" self.writeError(404, "Not found: '%s'" % path) def writeOKHeaders(self, contentType, extraHeaders={}): """Reflected from `HTTPPlugin`s.""" # Buffer the headers until there's a `write`, in case an error occurs. timeNow = time.gmtime(time.time()) httpNow = time.strftime('%a, %d %b %Y %H:%M:%S GMT', timeNow) headers = [] headers.append("HTTP/1.1 200 OK") headers.append("Connection: close") headers.append("Content-Type: %s" % contentType) headers.append("Date: %s" % httpNow) for name, value in extraHeaders.items(): headers.append("%s: %s" % (name, value)) headers.append("") headers.append("") self._bufferedHeaders = headers def writeError(self, code, message): """Reflected from `HTTPPlugin`s.""" # Writing an error overrides any buffered headers, but obviously # doesn't want to write any headers if some have already gone. headers = [] if not self._headersWritten: headers.append("HTTP/1.0 %d Error" % code) headers.append("Connection: close") headers.append("Content-Type: text/html") headers.append("") headers.append("") self.push("%s%s" % \ ('\r\n'.join(headers), message)) def write(self, content): """Reflected from `HTTPPlugin`s.""" # The methlet is writing, so write any buffered headers first. headers = [] if self._bufferedHeaders: headers = self._bufferedHeaders self._bufferedHeaders = None self._headersWritten = True # `write(None)` just flushes buffered headers. 
if content is None: content = '' self.push('\r\n'.join(headers) + str(content)) class HTTPPlugin: """Base class for HTTP server plugins. See the main documentation for details.""" def __init__(self): # self._handler is filled in by `HTTPHandler.found_terminator()`. pass def onIncomingConnection(self, clientSocket): """Implement this and return False to veto incoming connections.""" return True def writeOKHeaders(self, contentType, extraHeaders={}): """A methlet should call this with the Content-Type and optionally a dictionary of extra headers (eg. Expires) before calling `write()`.""" return self._handler.writeOKHeaders(contentType, extraHeaders) def writeError(self, code, message): """A methlet should call this instead of `writeOKHeaders()` / `write()` to report an HTTP error (eg. 403 Forbidden).""" return self._handler.writeError(code, message) def write(self, content): """A methlet should call this after `writeOKHeaders` to write the page's content.""" return self._handler.write(content) def flush(self): """A methlet can call this after calling `write`, to ensure that the content is written immediately to the browser. This isn't necessary most of the time, but if you're writing "Please wait..." before performing a long operation, calling `flush()` is a good idea.""" return self._handler.flush() def close(self, flush=True): """Closes the connection to the browser. You should call `close()` before calling `sys.exit()` in any 'shutdown' methlets you write.""" if flush: self.flush() return self._handler.close() def run(launchBrowser=False, context=_defaultContext): """Runs a `Dibbler` application. 
Servers listen for incoming connections and route requests through to plugins until a plugin calls `sys.exit()` or raises a `SystemExit` exception.""" if launchBrowser: webbrowser.open_new("http://localhost:%d/" % context._HTTPPort) asyncore.loop(map=context._map) def runTestServer(readyEvent=None): """Runs the calendar server example, with an added `/shutdown` URL.""" import Dibbler, calendar class Calendar(Dibbler.HTTPPlugin): _form = '''

    Calendar Server

    Year:
    %s
    ''' def onHome(self, year=None): if year: result = calendar.calendar(int(year)) else: result = "" self.writeOKHeaders('text/html') self.write(self._form % result) def onShutdown(self): self.writeOKHeaders('text/html') self.write("

    OK.

    ") self.close() sys.exit() httpServer = Dibbler.HTTPServer(8888) httpServer.register(Calendar()) if readyEvent: # Tell the self-test code that the test server is up and running. readyEvent.set() Dibbler.run(launchBrowser=True) def test(): """Run a self-test.""" # Run the calendar server in a separate thread. import re, threading, urllib testServerReady = threading.Event() threading.Thread(target=runTestServer, args=(testServerReady,)).start() testServerReady.wait() # Connect to the server and ask for a calendar. page = urllib.urlopen("http://localhost:8888/?year=2003").read() if page.find('January') != -1: print "Self test passed." else: print "Self-test failed!" # Wait for a key while the user plays with his browser. raw_input("Press any key to shut down the application server...") # Ask the server to shut down. page = urllib.urlopen("http://localhost:8888/shutdown").read() if page.find('OK') != -1: print "Shutdown OK." else: print "Shutdown failed!" if __name__ == '__main__': test() --- NEW FILE: PyMeldLite.py --- r"""Lets you manipulate XML/XHTML using a Pythonic object model. `PyMeldLite` is a single Python module, _PyMeldLite.py_. It works with all versions of Python from 2.2 upwards. It is a restricted version of PyMeld (see http://www.entrian.com/PyMeld) - PyMeldLite supports only well-formed XML with no namespaces, whereas PyMeld supports virtually all XML or HTML documents. PyMeldLite is released under the PSF license whereas PyMeld is released under the Sleepycat License. PyMeld and PyMeldLite support the same API. *Features:* o Allows program logic and HTML to be completely separated - a graphical designer can design the HTML in a visual XHTML editor, without needing to deal with any non-standard syntax or non-standard attribute names. The program code knows nothing about XML or HTML - it just deals with objects and attributes like any other piece of Python code. [...1064 lines suppressed...] 
import doctest try: from Entrian import Coverage Coverage.start('PyMeldLite') except ImportError: Coverage = False import PyMeldLite result = doctest.testmod(PyMeldLite) if Coverage: analysis = Coverage.getAnalysis() analysis.printAnalysis() return result if __name__ == '__main__': failed, total = test() if failed == 0: # Else `doctest.testmod` prints the failures. print "All %d tests passed." % total Index: Options.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/spambayes/Options.py,v retrieving revision 1.4 retrieving revision 1.5 diff -C2 -d -r1.4 -r1.5 *** Options.py 15 Jan 2003 22:47:11 -0000 1.4 --- Options.py 17 Jan 2003 20:21:09 -0000 1.5 *************** *** 384,387 **** --- 384,388 ---- html_ui_port: 8880 html_ui_launch_browser: False + html_ui_allow_remote_connections: True [globals] *************** *** 479,482 **** --- 480,484 ---- 'html_ui': {'html_ui_port': int_cracker, 'html_ui_launch_browser': boolean_cracker, + 'html_ui_allow_remote_connections': boolean_cracker, }, 'globals': {'verbose': boolean_cracker, From montanaro at users.sourceforge.net Fri Jan 17 12:40:32 2003 From: montanaro at users.sourceforge.net (Skip Montanaro) Date: Fri Jan 17 15:40:37 2003 Subject: [Spambayes-checkins] spambayes proxytrainer.py,1.1,1.2 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv5960 Modified Files: proxytrainer.py Log Message: Whole buncha changes... * slight tweak to css and table layout to make sure discard/defer/ham/spam labels and radio buttons line up * darken the stripe a bit so the alternating lines are a bit more distinct (this will probably quickly deteriorate into a matter of personal taste and display properties, but I could barely tell the difference between the "light" and "dark" lines on my Powerbook) * First cut at restricted review (no more than 20 per section) - see below for why. Can't page "next", "prev" yet. 
* pre-classify messages being displayed if they are currently "unsure" - this is fairly costly, hence the above view restriction - classifying them when they arrive is painful as well, because those messages may be coming from proxytee running from a local delivery agent like procmail, which you generally want to run quickly, especially when lots of mail arrives at the same time (think fetchmail, POP, etc). * allow user to view raw message contents (/view url, onView() method) * now that the __getattr__ bug has been fixed, dump the sys.setrecursionlimit() call. * delete a number of leftover bits from pop3proxy's testing mode. Index: proxytrainer.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/proxytrainer.py,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** proxytrainer.py 16 Jan 2003 17:40:13 -0000 1.1 --- proxytrainer.py 17 Jan 2003 20:40:27 -0000 1.2 *************** *** 230,235 **** font-weight: bold } .sectionbody { padding: 1em } ! .reviewheaders a { color: #000000 } ! .stripe_on td { background: #f4f4f4 } \n""" --- 230,235 ---- font-weight: bold } .sectionbody { padding: 1em } ! .reviewheaders a { color: #000000; font-weight: bold } ! .stripe_on td { background: #dddddd } \n""" *************** *** 284,287 **** --- 284,289 ---- + +
  •  ' % ( ! self.get_lightshade()) else: done_one = 1 ! print '
    ' % ( ! self.get_darkshade(), self.get_bgcolor()) print item print '
     ' % (self.getSidebarNormalAttrs()) else: done_one = 1 ! print '
    '% ( self.getSidebarHeaderAttrs()) print item print '
    ' % self.get_lightshade() print '%s%s' % (s, extra) print '
    ' % (self.getSidebarNormalAttrs()) print '%s%s' % (s, extra) print '
    ! """ upload = """ ! ! ! ! ! """ upload = """  !   !   ! """ stripe = 0 for key, message in keyedMessages: # Parse the message and get the relevant headers and the first # part of the body if we can. --- 658,677 ---- return keys, date, prior, start, end ! def appendMessages(self, lines, keyedMessages, label, startAt, howMany): """Appends the lines of a table of messages to 'lines'.""" buttons = \ ! """ ! ! ! """ stripe = 0 + i = -1 for key, message in keyedMessages: + i += 1 + if i < startAt: + continue + if i >= startAt+howMany: + break + # Parse the message and get the relevant headers and the first # part of the body if we can. *************** *** 687,706 **** text = self.trimAndQuote(text.strip(), 200, True) # Output the table row for this message. defer = ham = spam = "" ! if label == 'Spam': spam='checked' ! elif label == 'Ham': ham='checked' ! elif label == 'Unsure': defer='checked' ! subject = "%s" % (text, subject) ! radioGroup = buttons % (label, key, ! label, key, defer, ! label, key, ham, ! label, key, spam) stripeClass = ['stripe_on', 'stripe_off'][stripe] lines.append(""" ! """ % \ (stripeClass, subject, from_, radioGroup)) stripe = stripe ^ 1 --- 695,728 ---- text = self.trimAndQuote(text.strip(), 200, True) + buttonLabel = label + # classify unsure messages + if buttonLabel == 'Unsure': + tokens = tokenizer.tokenize(message) + prob, clues = state.bayes.spamprob(tokens, evidence=True) + if prob < options.ham_cutoff: + buttonLabel = 'Ham' + elif prob >= options.spam_cutoff: + buttonLabel = 'Spam' + # Output the table row for this message. defer = ham = spam = "" ! if buttonLabel == 'Spam': spam='checked' ! elif buttonLabel == 'Ham': ham='checked' ! elif buttonLabel == 'Unsure': defer='checked' ! subject = ('' ! '' ! '%s' ! '' ! '') % (text, key, label, subject) ! radioGroup = buttons % (buttonLabel, key, ! buttonLabel, key, defer, ! buttonLabel, key, ham, ! 
buttonLabel, key, spam) stripeClass = ['stripe_on', 'stripe_off'][stripe] lines.append(""" ! %s""" % \ (stripeClass, subject, from_, radioGroup)) stripe = stripe ^ 1 *************** *** 712,717 **** numTrained = 0 numDeferred = 0 for key, value in params.items(): ! if key.startswith('classify:'): id = key.split(':')[2] if value == 'spam': --- 734,745 ---- numTrained = 0 numDeferred = 0 + startAt = 0 + howMany = 20 for key, value in params.items(): ! if key == 'startAt': ! startAt = int(value) ! elif key == 'howMany': ! howMany = int(value) ! elif key.startswith('classify:'): id = key.split(':')[2] if value == 'spam': *************** *** 797,811 **** nextState = 'disabled' lines = [self.onReviewHeader, ! self.reviewHeader % (prior, next, priorState, nextState)] for header, label in ((options.header_spam_string, 'Spam'), (options.header_ham_string, 'Ham'), (options.header_unsure_string, 'Unsure')): if keyedMessages[header]: ! lines.append("") lines.append(self.reviewSubheader % (label, label, label, label, label)) ! self.appendMessages(lines, keyedMessages[header], label) ! lines.append("""""") lines.append("
    Messages classified as %s: From: ! Discard / ! Defer / ! Ham / ! Spam !
    Messages classified as %s: From:DiscardDeferHamSpam
    %s%s%s
    %s%s
     
     
    ") --- 825,842 ---- nextState = 'disabled' lines = [self.onReviewHeader, ! self.reviewHeader % (prior, next, ! startAt+howMany, howMany, ! priorState, nextState)] for header, label in ((options.header_spam_string, 'Spam'), (options.header_ham_string, 'Ham'), (options.header_unsure_string, 'Unsure')): if keyedMessages[header]: ! lines.append(" ") lines.append(self.reviewSubheader % (label, label, label, label, label)) ! self.appendMessages(lines, keyedMessages[header], label, ! startAt, howMany) ! lines.append(""" 
    """) lines.append("") *************** *** 853,856 **** --- 884,906 ---- self.push(body) + def onView(self, params): + msgkey = corpus = None + for key, value in params.items(): + if key == 'key': + msgkey = value + elif key == 'corpus': + corpus = value + if msgkey is not None and corpus is not None: + message = state.unknownCorpus.get(msgkey) + if message is None: + self.push("

    Can't find message %s.\n" % msgkey) + self.push("Maybe it expired.

    \n") + else: + self.push("
    ")
    +                     self.push(message.hdrtxt.replace("<", "<"))
    +                     self.push("\n")
    +                     self.push(message.payload.replace("<", "<"))
    +                     self.push("
    ") + msgkey = corpus = None # This keeps the global state of the module - the command-line options, *************** *** 878,882 **** self.unknownCache = options.pop3proxy_unknown_cache self.runTestServer = False - self.isTest = False if self.gzipCache: factory = GzipFileMessageFactory() --- 928,931 ---- *************** *** 904,937 **** print "Done." ! # Don't set up the caches and training objects when running the ! # self-test, so as not to clutter the filesystem. ! if not self.isTest: ! def ensureDir(dirname): ! try: ! os.mkdir(dirname) ! except OSError, e: ! if e.errno != errno.EEXIST: ! raise ! ! # Create/open the Corpuses. ! map(ensureDir, [self.spamCache, self.hamCache, self.unknownCache]) ! if self.gzipCache: ! factory = GzipFileMessageFactory() ! else: ! factory = FileMessageFactory() ! age = options.pop3proxy_cache_expiry_days*24*60*60 ! self.spamCorpus = ExpiryFileCorpus(age, factory, self.spamCache) ! self.hamCorpus = ExpiryFileCorpus(age, factory, self.hamCache) ! self.unknownCorpus = FileCorpus(factory, self.unknownCache) ! # Expire old messages from the trained corpuses. ! self.spamCorpus.removeExpiredMessages() ! self.hamCorpus.removeExpiredMessages() ! # Create the Trainers. ! self.spamTrainer = storage.SpamTrainer(self.bayes) ! self.hamTrainer = storage.HamTrainer(self.bayes) ! self.spamCorpus.addObserver(self.spamTrainer) ! self.hamCorpus.addObserver(self.hamTrainer) state = State() --- 953,977 ---- print "Done." ! def ensureDir(dirname): ! try: ! os.mkdir(dirname) ! except OSError, e: ! if e.errno != errno.EEXIST: ! raise ! # Create/open the Corpuses. ! map(ensureDir, [self.spamCache, self.hamCache, self.unknownCache]) ! if self.gzipCache: ! factory = GzipFileMessageFactory() ! else: ! factory = FileMessageFactory() ! age = options.pop3proxy_cache_expiry_days*24*60*60 ! self.spamCorpus = ExpiryFileCorpus(age, factory, self.spamCache) ! self.hamCorpus = ExpiryFileCorpus(age, factory, self.hamCache) ! 
self.unknownCorpus = FileCorpus(factory, self.unknownCache) ! # Expire old messages from the trained corpuses. ! self.spamCorpus.removeExpiredMessages() ! self.hamCorpus.removeExpiredMessages() state = State() *************** *** 949,989 **** # =================================================================== - # Test code. - # =================================================================== - - # One example of spam and one of ham - both are used to train, and are - # then classified. Not a good test of the classifier, but a perfectly - # good test of the POP3 proxy. The bodies of these came from the - # spambayes project, and I added the headers myself because the - # originals had no headers. - - spam1 = """From: friend@public.com - Subject: Make money fast - - Hello tim_chandler , Want to save money ? - Now is a good time to consider refinancing. Rates are low so you can cut - your current payments and save money. - - http://64.251.22.101/interest/index%38%30%300%2E%68t%6D - - Take off list on site [s5] - """ - - good1 = """From: chris@example.com - Subject: ZPT and DTML - - Jean Jordaan wrote: - > 'Fraid so ;> It contains a vintage dtml-calendar tag. - > http://www.zope.org/Members/teyc/CalendarTag - > - > Hmm I think I see what you mean: one needn't manually pass on the - > namespace to a ZPT? - - Yeah, Page Templates are a bit more clever, sadly, DTML methods aren't :-( - - Chris - """ - - # =================================================================== # __main__ driver. 
# =================================================================== --- 989,992 ---- *************** *** 1017,1020 **** if __name__ == '__main__': - sys.setrecursionlimit(100) run() --- 1020,1022 ---- From montanaro at users.sourceforge.net Fri Jan 17 13:45:26 2003 From: montanaro at users.sourceforge.net (Skip Montanaro) Date: Fri Jan 17 16:45:29 2003 Subject: [Spambayes-checkins] spambayes/spambayes tokenizer.py,1.2,1.3 Message-ID: Update of /cvsroot/spambayes/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv674 Modified Files: tokenizer.py Log Message: Add email address suffix length calculator. Instead of common prefixes this picks up clues from messages sent to many addresses in the same domain: To: "skip" , , , , , , Index: tokenizer.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/spambayes/tokenizer.py,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** tokenizer.py 14 Jan 2003 05:38:20 -0000 1.2 --- tokenizer.py 17 Jan 2003 21:45:18 -0000 1.3 *************** *** 1173,1177 **** if pfx: score = (len(pfx) * len(all_addrs)) // 10 ! # After staring at pflen:* values generated from a large # number of ham & spam I saw that any scores greater # than 3 were always associated with spam. Collapsing --- 1173,1177 ---- if pfx: score = (len(pfx) * len(all_addrs)) // 10 ! # After staring at pfxlen:* values generated from a large # number of ham & spam I saw that any scores greater # than 3 were always associated with spam. 
Collapsing *************** *** 1182,1185 **** --- 1182,1216 ---- else: yield "pfxlen:%d" % score + + # same idea as above, but works for addresses in the same domain + # like + # To: "skip" , , + # , , + # , , + if options.summarize_email_suffixes: + all_addrs = [] + addresses = msg.get_all('to', []) + msg.get_all('cc', []) + for name, addr in email.Utils.getaddresses(addresses): + # flip address code so following logic is the same as + # that for prefixes + addr = list(addr) + addr.reverse() + addr = "".join(addr) + all_addrs.append(addr.lower()) + + if len(all_addrs) > 1: + # don't be fooled by "os.path." - commonprefix + # operates char-by-char! + sfx = os.path.commonprefix(all_addrs) + if sfx: + score = (len(sfx) * len(all_addrs)) // 10 + # Similar analysis as above regarding suffix length + # I suspect the best cutoff is probably dependent on + # how long the recipient domain is (e.g. "mojam.com" vs. + # "montanaro.dyndns.org") + if score > 5: + yield "sfxlen:big" + else: + yield "sfxlen:%d" % score # To: From montanaro at users.sourceforge.net Fri Jan 17 14:24:04 2003 From: montanaro at users.sourceforge.net (Skip Montanaro) Date: Fri Jan 17 17:24:07 2003 Subject: [Spambayes-checkins] spambayes/spambayes Options.py,1.5,1.6 Message-ID: Update of /cvsroot/spambayes/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv16620 Modified Files: Options.py Log Message: oops, forgot to check in the suffix option setup Index: Options.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/spambayes/Options.py,v retrieving revision 1.5 retrieving revision 1.6 diff -C2 -d -r1.5 -r1.6 *** Options.py 17 Jan 2003 20:21:09 -0000 1.5 --- Options.py 17 Jan 2003 22:23:59 -0000 1.6 *************** *** 114,117 **** --- 114,118 ---- # Try to capitalize on mail sent to multiple similar addresses. 
summarize_email_prefixes: False + summarize_email_suffixes: False # *************** *** 406,409 **** --- 407,411 ---- 'generate_long_skips': boolean_cracker, 'summarize_email_prefixes': boolean_cracker, + 'summarize_email_suffixes': boolean_cracker, 'skip_max_word_size': int_cracker, 'extract_dow': boolean_cracker, From richiehindle at users.sourceforge.net Fri Jan 17 12:21:29 2003 From: richiehindle at users.sourceforge.net (Richie Hindle) Date: Sat Jan 18 09:31:39 2003 Subject: [Spambayes-checkins] spambayes/spambayes/resources .cvsignore,NONE,1.1 __init__.py,NONE,1.1 classify.gif,NONE,1.1 classify_gif.py,NONE,1.1 config.gif,NONE,1.1 config_gif.py,NONE,1.1 helmet.gif,NONE,1.1 helmet_gif.py,NONE,1.1 message.gif,NONE,1.1 message_gif.py,NONE,1.1 query.gif,NONE,1.1 query_gif.py,NONE,1.1 scanning__init__.py,NONE,1.1 status.gif,NONE,1.1 status_gif.py,NONE,1.1 train.gif,NONE,1.1 train_gif.py,NONE,1.1 ui_html.py,NONE,1.1ui_psp.py,NONE,1.1 Message-ID: Update of /cvsroot/spambayes/spambayes/spambayes/resources In directory sc8-pr-cvs1:/tmp/cvs-serv29441/spambayes/resources Added Files: .cvsignore __init__.py classify.gif classify_gif.py config.gif config_gif.py helmet.gif helmet_gif.py message.gif message_gif.py query.gif query_gif.py scanning__init__.py status.gif status_gif.py train.gif train_gif.py ui.html ui.psp ui_html.py ui_psp.py Log Message: You can now run pop3proxy.py with no POP3 servers, and just get the web interface. I'll split it into different source files at some point so that the naming is more sensible. This should let Skip use it instead of his proxytrainer.py. Tim Stone's web-based configurator is now a part of the main web interface. The fact that you can run the thing without any POP3 proxies set up, and that the config page is now a part of it, means that you don't need to touch bayescustomize.ini, even when starting from scratch. Run pop3proxy.py, hit the Configuration link, enter your POP3 details, and you're away.
There's a new architecture for pop3proxy and the web interface. The HTML is now all in resources/ui.html, with the pieces being pulled out and stitched together at runtime. All the socket/async code has been pulled out into a library module, so there's only application code left in pop3proxy.py (it's still a combination of web UI and POP3 proxy, which I'll address RSN). I've added a new directory 'resources' for the HTML and GIFs. These are packaged using Mike Fletcher's excellent ResourcePackage tool, but you don't need to know about that, or have ResourcePackage installed, unless you want to change the resources. I've added a new option html_ui_allow_remote_connections, which can be set to False to provide some measure of privacy (I'm loath to say 'security' for fear of bugs 8-) --- NEW FILE: .cvsignore --- *.pyc --- NEW FILE: __init__.py --- # See scanning__init__.py for how to change these resources. --- NEW FILE: classify.gif --- (This appears to be a binary file; contents omitted.) 
--- NEW FILE: classify_gif.py --- """Resource classify_gif (from file classify.gif)""" # written by resourcepackage: (1, 0, 0) source = 'classify.gif' package = 'spambayes.resources' data = 'GIF89a(\x00(\x00\xf7\x00\x00\x00\x7f\xf6\x04\x81\xf6\x08\x83\xf6\x0c\x85\ \xf6\x10\x87\xf7\x14\x89\xf7\x18\x8b\xf7\x1c\x8d\xf7 \x8f\xf7$\x91\xf7(\x93\ \xf7,\x95\xf80\x97\xf84\x99\xf88\x9b\xf8<\x9d\xf8@\x9f\xf8D\xa1\xf8H\xa3\xf9\ L\xa5\xf9P\xa7\xf9T\xa9\xf9X\xab\xf9\\\xad\xf9`\xaf\xf9d\xb1\xfah\xb3\xfal\ \xb5\xfap\xb7\xfat\xb9\xfax\xbb\xfa|\xbd\xfa\x80\xbf\xfb\x84\xc1\xfb\x88\xc3\ \xfb\x8c\xc5\xfb\x90\xc7\xfb\x94\xc9\xfb\x98\xcb\xfb\x9c\xcd\xfc\xa0\xcf\xfc\ \xa4\xd1\xfc\xa8\xd3\xfc\xac\xd5\xfc\xb0\xd7\xfc\xb4\xd9\xfc\xb8\xdb\xfc\xbc\ \xdd\xfd\xc0\xdf\xfd\xc4\xe1\xfd\xc8\xe3\xfd\xcc\xe5\xfd\xd0\xe7\xfd\xd4\xe9\ \xfd\xd8\xeb\xfe\xdc\xed\xfe\xe0\xef\xfe\xe4\xf1\xfe\xe8\xf3\xfe\xec\xf5\xfe\ \xf0\xf7\xfe\xf4\xf9\xff\xf8\xfb\xff\xfc\xfd\xff\xff\xff\xff\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ 
\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00,\ \x00\x00\x00\x00(\x00(\x00\x00\x08\xfe\x00\x81\x08\x1cH\xb0\xa0\xc1\x83\x08\ \x13*\\\xc8\xb0\xa1\xc3\x87\x10#J\x9cH\xb1\xa2\xc5\x8b\x12o\x98\xb8\xd0\xc0\ \x00\x80\x8f\x06\x18\\(q\x03\xe3\x0f\x14\r>\xaa\\\xb9\xb2\x01\x8a\x1f\x14]$\ \xf8\xd8@\x04\x8d\x13+\x03<\x18\x11!\xc0\xc7\x04-"\xfa\xd0\xf0\xf1\x01\x8c\ \x81>X\xf0\x00\x92\xc3\x04\x02\x05:n`\xf0\t \x83\x0f\x87=\x1e\x00\x08@\xe2`J\ \x03\x1ct<\x88 \x90\x06\x83\xa2;\x18f\xdd\xbab \xcc\x81\x1cTBE\xc0B\xa0\x8f\ \x0b4{,\xac\xf01\x05A\x072\x06\xde\xa0\n\xa0\xc3\t\t\x04\xe3\x02\xa0\xa0\x10\ \xc5G\x0c\x05\x05\xecx\x10B\x07\x10\n*\x0f\xec @\xf0\x87\x84\x8f(\x12"\xf8X\ \x92 \x00 \x1f\x05\x8c\\\x89\xba\xa0\x8d\x8f\x08\x12:\x06 
\xe1\xad\xc0\x029F\ \xb3\x04\xa0\x993A\xcc\x00B\'\xe4\x0b\x80\x03\xc1\n\'>\xec\x06\xe0!\x05\xe2\ \x81\xca\x17/\\\x0b\x00\xc3U 0\x10\x88e\xf9@\x87\x82\xba\x02\xa3p?\xd0;]+\ \x00\x064\x04Z\xe8\xee\xe1\x00o\x0f:*\x90\x05\xa2\xe33\x80\xf1\x0e\x7f\x10\ \xdd\xba\x01GV\x04\'\xa4\xb5\x03\n\t@\xc5\x03\x08\x04|\xa4\xc1u\x0f\xc9\xf4Q\ \x00\x12\x94 \x01a\x01@@\xc2\x05\x02\xfc\x14\xd4D\'\xa5\xb4\x1cK.\xd9V\x91F\ \x1cy\x04\x92H$a\xa4\xe2\x8a,\xb6\xe8\xe2\x8b0\xc6(\xe3\x8c4\xd6h\xe3\x8d8\ \xe6\xa8\xe3\x8e+\x06\x04\x00;' ### end --- NEW FILE: config.gif --- (This appears to be a binary file; contents omitted.) --- NEW FILE: config_gif.py --- """Resource config_gif (from file config.gif)""" # written by resourcepackage: (1, 0, 0) source = 'config.gif' package = 'spambayes.resources' data = 'GIF89a(\x00(\x00\xf7\x00\x00\x00\x7f\xf6\x04\x81\xf6\x08\x83\xf6\x0c\x85\ \xf6\x10\x87\xf7\x14\x89\xf7\x18\x8b\xf7\x1c\x8d\xf7 \x8f\xf7$\x91\xf7(\x93\ \xf7,\x95\xf80\x97\xf84\x99\xf88\x9b\xf8<\x9d\xf8@\x9f\xf8D\xa1\xf8H\xa3\xf9\ L\xa5\xf9P\xa7\xf9T\xa9\xf9X\xab\xf9\\\xad\xf9`\xaf\xf9d\xb1\xfah\xb3\xfal\ \xb5\xfap\xb7\xfat\xb9\xfax\xbb\xfa|\xbd\xfa\x80\xbf\xfb\x84\xc1\xfb\x88\xc3\ \xfb\x8c\xc5\xfb\x90\xc7\xfb\x94\xc9\xfb\x98\xcb\xfb\x9c\xcd\xfc\xa0\xcf\xfc\ \xa4\xd1\xfc\xa8\xd3\xfc\xac\xd5\xfc\xb0\xd7\xfc\xb4\xd9\xfc\xb8\xdb\xfc\xbc\ \xdd\xfd\xc0\xdf\xfd\xc4\xe1\xfd\xc8\xe3\xfd\xcc\xe5\xfd\xd0\xe7\xfd\xd4\xe9\ \xfd\xd8\xeb\xfe\xdc\xed\xfe\xe0\xef\xfe\xe4\xf1\xfe\xe8\xf3\xfe\xec\xf5\xfe\ \xf0\xf7\xfe\xf4\xf9\xff\xf8\xfb\xff\xfc\xfd\xff\xff\xff\xff\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ 
\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00,\ 
\x00\x00\x00\x00(\x00(\x00\x00\x08\x9b\x00\x81\x08\x1cH\xb0\xa0\xc1\x83\x08\ \x13*\\\xc8\xb0\xa1\xc3\x87\x10#J\x9c\x08\x84\x03E\x89\x1c\x00\\\x84\x98Q\ \xe3\xc6\x86\x1d=~T\x18R\xe4\xc8\x83%M\x9e$\x98R%C\x8b \x01\xc8\x9c\xe9r!\ \x00\x98$i\xd2\x8c(\x13\'J\x9d;!\xce\xf4\xc9\x12\xe8P\x9eG\x0b\xb6L*\x94\xa6\ \xcf\xa5=\'\x02\x85\t\xf5&E\xa3\x1c\xaa\x12mj\xb4\xebV\xae]u~\xe5\x18V\xec\ \xc9\xaaLG\xa2\x1d{\x11*\xdb\x8d-\xdf~,)Wm\xd4\x95$\xeb\xe2\xdd\xcb\xb7\xaf\ \xdf\xbf\x80\x03\x0b\x1eL\xb8\xb0\xe1\x8b\x01\x01\x00;' ### end --- NEW FILE: helmet.gif --- (This appears to be a binary file; contents omitted.) --- NEW FILE: helmet_gif.py --- """Resource helmet_gif (from file helmet.gif)""" # written by resourcepackage: (1, 0, 0) source = 'helmet.gif' package = 'spambayes.resources' data = 'GIF89a"\x00\x18\x00\xf7\x00\x00BBFUSTcZR^^^kZVoe\\kkgskcwog\x87tf{wt\x89\ \x87\x87\x94~k\xa1\x83p\x9c\x94\x8e\xb4\x9d\x8d\xa5\xa5\xad\xad\xad\xad\xad\ \xad\xb5\xaf\xb7\xba\xbd\xad\xad\xc0\xbd\xb7\xb5\xbd\xca\xc0\xc3\xc8\xc6\xc6\ \xce\xc6\xd2\xde\xbd\xd6\xef\xc6\xd6\xef\xc6\xde\xef\xc6\xde\xf7\xbd\xde\xff\ \xc6\xe7\xff\xce\xa9\x8c\xd6\xaf\x91\xce\xbd\xad\xef\xcb\xad\xce\xcb\xcb\xd6\ \xda\xe2\xf7\xd6\xbd\xe4\xe1\xe4\xce\xde\xef\xd6\xde\xef\xd6\xe7\xef\xde\xe7\ \xef\xe7\xeb\xeb\xef\xef\xe7\xef\xef\xef\xf7\xf7\xef\xce\xde\xf7\xce\xe7\xf7\ \xce\xe7\xff\xd6\xe7\xf7\xd6\xe7\xff\xd6\xef\xff\xde\xe7\xf7\xde\xef\xfb\xe7\ \xef\xf7\xe7\xf7\xff\xef\xef\xf7\xef\xf7\xff\xf7\xf7\xf7\xf7\xff\xff\xff\xff\ \xf7\xff\xff\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ 
\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00!\xf9\x04\x01\x00\x00\x1e\x00,\x00\ 
\x00\x00\x00"\x00\x18\x00\x00\x08\xfe\x00=\x08\x1cH\xb0\xa0\xc1\x83\x03e\xc8\ \xf0\xf0a\xc6\r\x84\x10#~`\x91\xe3\xc3\x07\x1c;>D\xdcXP\x06\x0b\x1e2b\xb8\ \xe0\xa1\x91\xa3I\x1a,|\xd0\x10\xd9c\xa1I\x8e4N\xf8\xb8a\xc3\xc5\x0f\x1a/a\ \xa6\xccQ\xf3f\xce\x8d4Z\xfc\xc8\xa1\xc2\xe6\xc3\x9f\x10E\x0eU\xc1\xe2G\xc6\ \x8d\x16K"\xfc\xb0\x82\xc7R\x99=p"\x94\xb1\xc1\x02\x86\x0c1 z\xf41\x14\x86\ \xcc\xa1R\t~\xc0\xe0@\xc1\x02\x07\x16\xc2\x1a\xa4jsh\x87\x13V{\xdcH\xeb\x01\ \x86\x05\x05\x06\x06\x140\xa0 \x02\n\x8d\x1fbth\xf8\xf1G\xd6\x0f%\xea\xea\ \x95\x81"\xc3\x86\x18\x13\x14\x0c\xd8\\\xa0\xc0\x81\xc2)P`\x88\x10\xa1\x04\ \x8b\x17?\x9c.\xdcp\xf6\x07\x0f\x0c\x0bbC\xd0,\xa0\x00\x81\xce\x9e\x158p\xb0\ \xc0\xed\x05\xab\xa9\xf7z\x88\x119u\x0b\x07\x06\x02o\x1e \xe06\x82\x03\xd0\ \x0f always re-loads from external files, otherwise ## only reloads if the file is newer than the generated .py file. # force = 1, ) # ResourcePackage license added by Richie Hindle , # since this is "Redistribution and use in source form". Note that binary # Spambayes packages don't redistribute this file or rely on ResourcePackage; # it's only used at development time (and even developers don't need it # unless they want to change the resources). Kudos to Mike Fletcher for # ResourcePackage - excellent tool! __license__ = """ ResourcePackage License Copyright (c) 2003, Michael C. Fletcher, All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. The name of Michael C. 
Fletcher, or the name of any Contributor, may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS NOT FAULT TOLERANT AND SHOULD NOT BE USED IN ANY SITUATION ENDANGERING HUMAN LIFE OR PROPERTY. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS AND CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. """ --- NEW FILE: status.gif --- (This appears to be a binary file; contents omitted.) 
--- NEW FILE: status_gif.py --- """Resource status_gif (from file status.gif)""" # written by resourcepackage: (1, 0, 0) source = 'status.gif' package = 'spambayes.resources' data = 'GIF89a(\x00(\x00\xf7\x00\x00\x00\x7f\xf6\x04\x81\xf6\x08\x83\xf6\x0c\x85\ \xf6\x10\x87\xf7\x14\x89\xf7\x18\x8b\xf7\x1c\x8d\xf7 \x8f\xf7$\x91\xf7(\x93\ \xf7,\x95\xf80\x97\xf84\x99\xf88\x9b\xf8<\x9d\xf8@\x9f\xf8D\xa1\xf8H\xa3\xf9\ L\xa5\xf9P\xa7\xf9T\xa9\xf9X\xab\xf9\\\xad\xf9`\xaf\xf9d\xb1\xfah\xb3\xfal\ \xb5\xfap\xb7\xfat\xb9\xfax\xbb\xfa|\xbd\xfa\x80\xbf\xfb\x84\xc1\xfb\x88\xc3\ \xfb\x8c\xc5\xfb\x90\xc7\xfb\x94\xc9\xfb\x98\xcb\xfb\x9c\xcd\xfc\xa0\xcf\xfc\ \xa4\xd1\xfc\xa8\xd3\xfc\xac\xd5\xfc\xb0\xd7\xfc\xb4\xd9\xfc\xb8\xdb\xfc\xbc\ \xdd\xfd\xc0\xdf\xfd\xc4\xe1\xfd\xc8\xe3\xfd\xcc\xe5\xfd\xd0\xe7\xfd\xd4\xe9\ \xfd\xd8\xeb\xfe\xdc\xed\xfe\xe0\xef\xfe\xe4\xf1\xfe\xe8\xf3\xfe\xec\xf5\xfe\ \xf0\xf7\xfe\xf4\xf9\xff\xf8\xfb\xff\xfc\xfd\xff\xff\xff\xff\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ 
\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00,\ \x00\x00\x00\x00(\x00(\x00\x00\x08\xfe\x00\x81\x08\x1cH\xb0\xa0\xc1\x83\x08\ \x13*\\\xc8\xb0\xa1\xc3\x87\x10#J$xC\xc4\x04\x04\x01\x00\x04@0A\xc4\x8d\x89\ \x08c@\x00@\xb2\xa4I\x00\x10b\x80\x1c\xd8#\x03\xc9\x01\x19T\xd8\xf8\x01\xe4\ \x87\r\x15\x19\x06\x90\xcc\xd0\x03\xa4\x8e\x05\x00\x04\x80\xe0\x91\x90\x07\ \x08\x01\x00\x16\xe8\x90\xa8C\x01\x00\x055\x08\x8e@\x00\x00\xc1\x08\x826\x80\ *X\xfa\xb0\x07\xd0\x07=\x07\x828\t\x82`\x8f\x07I\xc36t\xa9@\xad@\x9d&\x07\ \x14\xec\xe14\x83\xc3\x18A\xa3\x16\x84[RnA\x1bHU2\x8c\x00\xa0\xac\xc1\xb1&\r\ \x17\x1c\x1b\x81\xe1\r\x00\x03\x88\x1e\x9cZ\xf5\xeaA\x1e:?*\x14\x01\xc0\xeeJ\ .E,\x9c\x00@\xc5g 
*\x00LX\x98\x00\x80\x8d\x84\'\x13\xda\x00\x90`aF\x9a\x08c\ #\xfc\xa1q!I\x85\xbas\x03\xf0\r\x00\xf7\xc1\xe0\x06y\x0fWH\xf5\xb5\xf0\x92\ \xb2\xab\x8e.\r\xdbd\xc2\xd4\xab7w\xae\x0e\x1da\xe8\x85\x8f#s?\xff}93\xc3\ \x91\x8a\x0b"\'8\x16BC\x18A\x9d\xab\xb7n\x100\x00\x18\x0e1\xa45\xb8\x1e\x88W\ \x00\x18t\xf5\x95[@\xacw\xd6~\x0f\xe9`\xc0S\xf2\x15H\x9f@Y\x01`@\x0eL\x01%\ \x94d\xba\x19\x85\x94R \xf5\xa0\x1fd1\xcdT\xd3M9\x91\x84\x01\x81\x12\x89t\ \xd2\x8a)\x9dF\x91E\x18i\xc4\x91G.\xd6h\xe3\x8d8\xe6\xa8\xe3\x8e<\xf6\x98P@\ \x00\x00;' ### end --- NEW FILE: train.gif --- (This appears to be a binary file; contents omitted.) --- NEW FILE: train_gif.py --- """Resource train_gif (from file train.gif)""" # written by resourcepackage: (1, 0, 0) source = 'train.gif' package = 'spambayes.resources' data = 'GIF89a(\x00(\x00\xf7\x00\x00\x00\x7f\xf6\x04\x81\xf6\x08\x83\xf6\x0c\x85\ \xf6\x10\x87\xf7\x14\x89\xf7\x18\x8b\xf7\x1c\x8d\xf7 \x8f\xf7$\x91\xf7(\x93\ \xf7,\x95\xf80\x97\xf84\x99\xf88\x9b\xf8<\x9d\xf8@\x9f\xf8D\xa1\xf8H\xa3\xf9\ L\xa5\xf9P\xa7\xf9T\xa9\xf9X\xab\xf9\\\xad\xf9`\xaf\xf9d\xb1\xfah\xb3\xfal\ \xb5\xfap\xb7\xfat\xb9\xfax\xbb\xfa|\xbd\xfa\x80\xbf\xfb\x84\xc1\xfb\x88\xc3\ \xfb\x8c\xc5\xfb\x90\xc7\xfb\x94\xc9\xfb\x98\xcb\xfb\x9c\xcd\xfc\xa0\xcf\xfc\ \xa4\xd1\xfc\xa8\xd3\xfc\xac\xd5\xfc\xb0\xd7\xfc\xb4\xd9\xfc\xb8\xdb\xfc\xbc\ \xdd\xfd\xc0\xdf\xfd\xc4\xe1\xfd\xc8\xe3\xfd\xcc\xe5\xfd\xd0\xe7\xfd\xd4\xe9\ \xfd\xd8\xeb\xfe\xdc\xed\xfe\xe0\xef\xfe\xe4\xf1\xfe\xe8\xf3\xfe\xec\xf5\xfe\ \xf0\xf7\xfe\xf4\xf9\xff\xf8\xfb\xff\xfc\xfd\xff\xff\xff\xff\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ 
\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00,\ 
\x00\x00\x00\x00(\x00(\x00\x00\x08\xfe\x00\x81\x08\x1cH\xb0\xa0\xc1\x83\x08\ \x13*\\\xc8\xb0\xa1\xc3\x87\x10#J\x9cH\xd1a\r\n\x01\x028\x80\x00\xc0@\x89\ \x8a\x08\x7f\x80(\x00\xc2\x83\x84\x0f4\x80\xcc\x10\x00\xd2\xe0\x8f\n\x10r\ \xf8X0\x82\xc7\x88\x03\x00\x000\x88\xd1RFN\x00\x15~\x08\xbc1`\x00\x02\x18@~\ \xb0X\xd0\xb2\xc3\x08\x83\x12\x02t\xe8A\x10@C\x13\x0c\\\x1cd\x90\xb2`\n\t?n`\ \x08\xf0\xf3\xc4\xc2\x11\n:x88\xc0\x87\xcb\x08\n(\xa8p\x0b\xc4\xc5\x80\x85\ \x08f\xdc\x80p\xd0\xea\xc1\x0b\x19~\xd8\x98\x00@#K\x85~\r\x08-8\x80\xc7\xc1\ \x1f\x10\x10LX!\xf4\xc5\x01\xad\t\x15\xd8\x00Ram\xc1\x062\x10\x86x \xf3E\x08\ \x1b/\x18(\x14\xc1\x01H\x0f\x06$\n~\x08\x112\x84\x01\x01\x1b\xe8\xfaE\xc8\ \xc3\xc0\x0e :\x14\xac X#\'\x81\x08:\x16\xbah\xb0\xd0Cl 7\x0e\xbcpy\x82\x01\ \xdd\x82:`x\xc8\xbb\xd0\x06_\x815\x0c\xfe\xdc8\xc8\xa1uU\x00\x04\x1c\x84p\ \xcc\x10\x01A\x16\r\x16\x13\xfc\xd1\x80\x05A\xf7\x0cy\x84p@ g\xc1\xf2\x07\ \xdd`@\r\x03A\xb0\x99B3 \xe0\x01\x0c\xc9\x19\xe4\x03\x03\'\xc87\x90e\xe3\x01\ A\x82g\t5\x80YB:D\xd0\x1f\x00\x04\x0e\xb4\x82\x02\xc9\xed`\x00{\x08\xf9\xe5\ \xc3\x06\x02\x18\x10\x82\x84\x04\x85\xf0AA$0@\x15\x07"(\xc4\xc0\x0b6\x84\xf0\ \x82\x0f9<@\xdbA20W\x90\x07\x15\x00a\x83\x02\n\xb9 ]R+L\x80\x00\x040\n\xc4\ \xc3]\x05\xfd`\x80@\xbb!$\x80\x03dM`\xc3\x0f\x19\\\x90\xe2A\x10\xdc\x90\xe0B\ \x03`\xe6\x83\n\x14(\x10A\x95>`yd\x07\n<\xa5\xd0\t?\x05\x80\xc1\r?H\x90\x82A\ 4\xa8f\x90\x0b\x0c\x98\xd0P\x97=t\x10\x80\x04\x06\x8d\xd0AK\x0b\xb0 \x14\x0c\ \x08\x14U\xe1K?\x85\x06R\x0c\x0c\xe4t@M#,\x00$\x04A\xb5T\x90\x003\x00A\xc3\ \x07 \x12x0\x12\x08U\xb6T\x82\x01\x00@\x00f\x00\x14\x84\xa8\xea\xaf\xc0\x06+\ \xec\xb0\xc4\x16k\xacB\x01\x01\x00;' ### end --- NEW FILE: ui.html --- Spambayes User Interface

    Introduction

    This file, ui.html, defines the look-and-feel of the user interface of the Spambayes Server. The various pieces of HTML defined here are extracted and manipulated at runtime to dynamically produce the HTML that the Spambayes Server serves up - this file acts as a palette of HTML components. PyMeldLite is the module that provides the HTML-to-object mapping layer that makes all this possible. Each piece of HTML that needs to be manipulated has an id tag, and becomes a Python object at runtime.

    As an example of how this works, here is an editbox with an id of examplebox: PyMeldLite lets you manipulate the HTML programmatically:

        >>> import PyMeldLite
        >>> html = open("ui.html", "rt").read()
        >>> doc = PyMeldLite.Container(html)
        >>> print doc.examplebox
        <input id="examplebox" size="10" type="text" value="example"/>
        >>> doc.examplebox.value = "Changed"
        >>> print doc.examplebox
        <input id="examplebox" size="10" type="text" value="Changed"/>
        

    So the Python code to build the HTML user interface pages doesn't need to faff about pasting strings together, or building HTML components from scratch in code. And the look-and-feel is controlled solely by this one HTML file - changing the stylesheet, translating into other languages, adding a new piece of user interface - all of these things are very easy.

    Below are all the user interface components along with their ids.


    headedBox

    Headed box
      This is a "headedBox". Most of the user interface pieces are presented in one of these. The pieces aren't presented in these boxes here in ui.html to avoid duplication of HTML.
     

    status

    POP3 proxy running on 1110, proxying to example.com.
    Active POP3 conversations: 0.
    POP3 conversations this session: 0.
    Emails classified this session: 0 spam, 0 ham, 0 unsure.
    Total emails trained: Spam: 0 Ham: 0
              You can configure your Spambayes
          system using the Configuration page.

    reviewText

    The Spambayes proxy stores all the messages it sees. You can train the classifier based on those messages using the Review messages page.


    reviewTable

    These are untrained emails, which you can use to train the classifier. Check the appropriate button for each email, then click 'Train' below. 'Defer' leaves the message here, to be trained on later. Click one of the Discard / Defer / Ham / Spam headers to check all of the buttons in that section in one go.

            
     
    Messages classified as TYPE: From: Discard / Defer / Ham / Spam
    Re: Spambayes and PyMeld rock! 8-) Richie Hindle <richie@entrian.com>      
       
        

    upload

    Either upload a message or mbox file:
    Or paste one whole message (including headers) here:

    (The upload form gets used for both training and classifying - the inappropriate pieces are removed at runtime.)


    wordQuery


    wordStats

    Number of spam messages: 123.
    Number of ham messages: 456.
    Probability that a message containing this word is spam: 0.789.

    classifyResults

    Spam probability: 0.123

    (The table of clues goes here, like this but in a headedBox):
    Example word 0.123

    Return Home or classify another:

    (The Classify form goes here)

    configForm

    This page allows you to change certain customizable options that control the way in which Spambayes processes your email.

    Label      

    (Help text goes here.)

    Current Value:  (value)
     

    shutdownMessage

    Shutdown. Goodbye.


    --- NEW FILE: ui.psp --- (This appears to be a binary file; contents omitted.) --- NEW FILE: ui_html.py --- """Resource ui_html (from file ui.html)""" # written by resourcepackage: (1, 0, 0) source = 'ui.html' package = 'spambayes.resources' import zlib data = zlib.decompress('x\xda\xc5[{o\xe38\x92\xff\x7f\x80\xfe\x0e\x1c-\xba\x95`\x13\xdbyt\xdf\xacc\ \x0b\xdb\x9dd6\x83\xebG\xae\xe3\xb9\xc3`0h\xd0\x12ms#\x89:\x91\x8a\xe3m\xecw\ \xbf*>$\xea\x91\x8c\xfb\xe6\x16\x97\x1e$z\xb0\x8a\xc5b=~U\xd4\xcc\xbe\xbf\ \xfat\xb9\xf8\xe5\xf6\x9a\xdc,>\xbc\'\xb7?\xbf{\xff\xd3%\t\x8e\xc7\xe3\xff:\ \xbb\x1c\x8f\xaf\x16W\xe6\xc5\xf9hrB\x16%\xcd%W\\\xe44\x1d\x8f\xaf?\x06\xd1\ \x8b\xeff\x1b\x95\xa5\xfa/\xa3\t\xfeU\\\xa5\x8c\xf0d\x1e\xe8\xab \xba+h\xb6\ \xa4;&\xc9\xcf\x92\x95\xe4\xa7\\\xb1rEc6\x1b\xeb\x01H#\xd5N_,E\xb2#_\xc9J\ \xe4jJ\xfe2yIh\xc9izD\xe4\x96KyD6,}`\x8a\xc7\xf4\x82d\xb4\\\xf3|J&\xe4\x9f/\ \xbeSt\tS\xeeA\x87\x83W\xa2\xcc`l\x8b\xc1\x8b\xef\xe84\xe5\xf9=<\x8fE*\xca)\ \xf9\xd3\x04~b\xfd\x92N\x1f8\xac\x9a%O\xbc\xdd\x88\x07XV\xf3\xee\xcd\xe4|\ \xb2Z\xe1\xbbQ\x0c\x02\xb1\\\x11j\x85;\xde2\xbe\xde\x80\x8cK\x91&f\xe2\xd1\ \x92\xe6\xb9\xa6_\xd2\xf8~]\x8a*O\x80I\x9d\xbc\xbc\xa8y\xff\xa0y_\x0c\xafs6vn;\x1b[\xe7\x87K\xbdj\ \xf4~\xbc\x08\xf43\xd4\xd4,\xe1\x0f$N\xa9\x94\xf3\xd0X|\x18\xd97\xb2\xa0\xb9\ \xa6\x007\xcd\x98\n\xa2\x19\xcf\xd6D\x96\xf1<4OFk\xbe\n\tM\xf9:\x9f\x87t)3\ \x9e$)\x0b\xc7\xd1\xab|)\x8b\x0b\x90\x038\xb4\xb8\xd9\x89\x8c\xda\xdcD\xed\ \xc9\x8aR$U\xac>\xd2\xac\x15\xa2\xe0\xf1\xe3\xcer\x9czt\xd4H(2\x86{\x15\x90M\ \xc9V0\x01\xdc\x87\xd1\r\xfc\x9e\x8d\xe9\xf04\xe0\x13\xb9\x9e\xe3\xd5Z]\x90\ \x8a\x8f0^\xb6En\xdf\x80\xa2\x8c&Qc\xc8"\xa3<\xbf4n\x17\xd8\x95\x05\xd6\r\ \x8d~g\x9b\xf3\x08\xa2\xaaY\x10\x18\x1el\xc7y\xa3\xf8"Zl\xb8$+\x9e\xb2#2\x8b\ E\xc2\xa2Z\n}wD\x12\xb6\xe29,^m\x18I\x85\xb8?\xa6yr\xbcb,5,\xc4J\xbf\xa90zs\ \x17\xbd\xdd\xd3Fuw\xac\x84@8"d\x01\x8f\x1f \x02\x8bJ\x1a\x06\x05g1\x0c\x00\ \n\x9dK\xcct\t\x84\xe5\x92A\xa4f\x84=\xaa\x92\xc6h\xd601\x04\x96\x9c\x17i\ 
\x95R\xfd@\x19\x16e\x95+\x9eAd\x11$\xd9\x81B!\x98\xa7\xe9\x8e\x98]dZ\x12\xcd\ [m\xa8j\xcbe\xe8\x8dpD\xe2\x1fI\xaa\x82\x1c\xc3(\xab\x16\x02s\x83o\xc1\x7f\ \xe0\xac)S\x8a9Q\rm,\xb2B\xe4\xa0n\t\x8b3\x1a\xbc\xdd}`i\xf2\x1e\\\xd1*\x91p\ \xa3\xbe\x0c\xe4\xc1\x00\x88b\x80p\x0f\x99\x13Q\xb0\xfc \xb0\xb1 8"A\xa9\x82CH8498\xdc\ \x93S"b`\xd4\x080\xc28\x05\xe1\x8a\x95\x07\xc8t_6E\ta\x05\x99\x8d\x9a\xcdu\ \xfb\x1e\xbdJ\xd5\xc5\xf0\xee\xd7;\xfc\x9c\x1d\x8c1\xee\xce\xc6<\xda\x7fI\ \x9e\x14#\xcd\r\xd6\x18\\nh\xbefI\xf0\xff\xb8$\'BwIc0\xb3V\xfah<\xf5Nh\xbb\ \xb6\x8e\x8d\x1e\xa5\xa3D\xc5!\x85\xd7\x06\xdf\t\xec\x98\xb2$\x88\xced\x1e\ \xda\xe8\x8b\xf1\x05\tW\x14\x80-]\n\x90\xbc\xa0Ra\xecB\xcc\x91\xaf1\xf8\x00\ \xf8\x03\xbf?"\xa24\x13\xe0[\xcd\xbf\x89\x9f\x86\xdb\xaa\x14\x19\x91qI\x15D3\ n\xc4\x82\xc8\xfa6O\xfa\x19\x08\xa3\x08&\xbbR\xa4)\xc8\x00n\xc6 \xec/w.\xa6\ \xc2k`m\xe6\xd1\xb1\xfc\x98\xc4\xa8&\x9c\x1c\x99i\xef\x93\x1b\xc6\xd4\x11QX\ \xbe\x80\xaf\xe3;X\xae\xb0\xd9\r\xa5\x86\x10\x9c\xaf+\\8\xc4D\xed\x88\x10\ \x0es\xb6m\x82mGI\xc7:F\x9b\x1c(Y-\rj\x023\x1a\xe4\x9a\x1daT\xee\xba\xa1\xf3\ \x1dK!V\xe2\x10\x13\xe3{i\xb5Q\x16\x8c\x10 \x07\x06N\x1c\xc8\xcb\xc18.\xeb\ \x19\x10\x08\x94\x18\xecf\x85\x83\t>\xce\x0b"\r\x87\x92w\x18[=\x99<\xf8e_\ \x07\xce\x98\x0c \xb7`\xca\x07\xe9!\x89Y\x9a\x02e\x8c\xb5I\xd8\x04\xb0e\xdc<\x80\x08\xea\x9b\xa8\xbbw\xe54\xdc\xb7\rl\x10\xbc\ \x1b\xce\xc1S\xf5\xc3\xe3\xee\n\xea]\x9e\xb6F\xdc~\xba=3U\x04\xa2\xa7\x1c#\t\ \xe8h\xb6\x8c:\xa4\xb7\x90\xba\xe5\x9d\x0e\x9eAtrr2\xb1\xfa\x98\x8d\x97\xd1\ \x91\xbf\xbd0V\x073\xd1f\xa2\x91lYs\xb0I\x04\xea\xf3\xccc4\xf2\r\xd60#48\x04r\xd4\x9b*\xa7\x1d\xb7\xf1\xa7\xa6\x9a\xe8\x0e\n~\x1c\ \x19D\xbe\xe8\xa3\xce\x04}\xce&hKC\xfd\xdc4J(\x9a\xee7\xcbu\x86\xdb`l\x87\ \xaf8&\xab=\'\xc9\xab\x0c+\x85\x0e\x7f\x02W\xd9\xd1\xf3d7}\xaa\xcd\xef\x12\ \xfd\x9c\xcb\xaad\x1d\xb2J?\xec\xaei\x81\xcb\'\xcc\xac\x0cR\x17\x96L\x9d\x95\ \xa0\xe4\xd3\xce\x1c(y\x9b\x7f\x9b\xe6\xa6O\xb2\xe9Rt$\xe9\x14\xa8\xb5\xb3`\ 
\xd0\xb3\xe8\xd6\xf6e\xde\xb0\xc7\xd0\x86J\xffw\x87\xc6\x8f\xde\xfe4u\xd9\ \x0f\xf6\xb2\xe2\xeb\xe7\x02\xb2\xeb\x06`O\'\xf4e\xedO\xde\xff\xfd\x0b\x00\ \xff\x18\xbb\x04z\x1aP=\x96\x02eS1v\x96\xbf\x0fK\xb9\x93\x8ae\x10\xb0-\xee\ \xe8X\x01\xb5\xfd\x023c\x18]\xda\x99M\x00E\xc0\x85\xfd\x83\x91\xaf\'/\x05\ \xd6q\xf4w\xe3\xa4\xe9\x16-\x004\xb6a\x87\xde\xe7\xe6\xa5\x8bW\x8bV\xfdn\x82\ \x96T\xa2\xac+P\xa8&M[\x0fR\x95\x02wb\xd2F\x14\xa7Bm\x97z`\xed{\x80\xfe\xa8\ \x04\x0f\xc4.\xd4F\xc8\x86\x85\xa1\xacU\xd4h\xc5\x08\x16F\x9f\xf5\xdfz<\xaaD\ \xeb\xa6\xce,{\xe2\x1c\xbbPT\xda0\xd21\x03\xb4V\xeb4\x80m\x12H\x87\x1a\x9aA\ \xd5k\xfc\xcd\xba\xdf\x11\xd9n8`\xd5\x9d]5\xe4e\xe2\xe0\xe3\x90\x06 \xf5^nX|\ \xaf\x1fC\x8d_\n(\x07\xb0\xc6\\VJ\x81^V\xc2b9\x86\xf5\xbc\x9e\xe3\x08\xc7b\ \xe3\x8a\x03Y\xb8@\xa6!T\xe6\x00\x16\x81Yx\xc5V\xac\x0c\xa1l\xa5\x0f\xa6Mc\ \xc8\xad\xa6t\xce>\xb2\xe5\xbf\x93\x1c\xa6\xc1\xb2V\xcb\xa2\x996\xd8\xc0\x10\ _q\x19\xd32!c\xa2\xb9\xc3_\x08\r\xf0\x1b\r\x82\xb8\x86#\xf0\x8c\xf5J\x1a\xb4\ k\xa8\xcdJ\xa4\x81\x11\x14mC\xa3\x1c\x07B\xd6\xa2\r|u\xd3\x9d\xea!\xf5~\x83\ \xf8` \x00\xc5\xfev\xbd\xf0\xe2\x80-\xf5u\xfd\x13nx\x92\x00p!\xd8B\x9b\x87\ \xa0E\x01j@\xf8f/Me\x14N\xfc\x18\xf0\x0c\x83\x1c\x8c\xdf\xd0\x9b\xabAr\x83,,\ \x90\x80w-\xe0\xe0\xee\xfb\xd8\xd7\xc3\x12-\td\xb5\xcc\xb8r\x12\xac\x85\'\ \xff;\xad\xc4p\xa8\xb3\x8c?V\xba[\xd4\x97\xa8\xa0$\xa3\x80l\x13.Q>`q\xe2u@[\ \x00\xc6\x84\xd9}\xa4@-\xec\'\xc4G\x18\xf9\xad\x02<\x15,\xbb\xd8\xb9\x0f\x16\ \xb1\x8a.\xb3\x16rl\xbc8.y\xe1\x96\x85\xf5\xf1\xf8\xef\xf4\x81\x9a\xa7a4\x1e\ \xcf\xbe\xff\xf5\xf2\xea\xed\xe2\xed\xaf\xb6\xd8\xacrc\x99"7\xf1\xe1\x00)\ \xf5\xd1\x10\x94\x9f\x0ba{\x14_\x1by\xf8\x8a\x1c@\xe1^e\x00\x94G(\x87$\xaf^\ \x91\xf6\x93Q\xca\xf25\xd4e\xd1\x9c\x9c\x1e6\xa4_\xdbZ\xd4V?\xef\x90\xfez\ \xf2\xdbEo\x189\xe00prA8\x99i2;\x01<\xf8\xf3\x9f\x0f\xdb\xc3\xbf\xf6\xb7J\ \x02\x1a\xd7\xadl`\x82\xd4\xbf\xf2\xdfF\xb8\xd3#\xfd\xe2 \x9c\x86\x87\x03\ \xe7\x17\xb8\xd0\x9a\xd2\xadh>\'g\xb8\xde\xfa\x05\xc8\x8b\x0fQk\xf0|\xd8L\ 
\xdc\x9c\xb6[2\xaf\xb5;R\xe2\xbd\xd8\xb2\xf2\x12R\xc2\xc1\xe1a\x9f\xfa\xeb\ \xf3\x0cu\xf0\x81h\x06\xf3\x97\x15\x1bX\xc3?\xdb\x8f\xbc[{i\xff\x8c\xc7\xbf\ \xfd\xd6\x14\x1e\xdaX\xf6\x8cN\xb7\x9f\xee\x16a\xd4\xad:BW\x07\x9b"\xcf?\xfd\ \xda7`@\xb4\xf0\xb2\xd1]\xb5t\xf9\xabUV\x02\x18\x8b>\xb8$\xec\x01\\*\t\x1e\ \xdbN5^\xeb\x15\xa3H\xf5c)\xb2\'^;\xb1[\xa7K\x10\x1c\xc4\xb6\xa4\x05:6\xd2\ \x0fc\x98\xa0\xf1\xb6i\xe3S!\xca\x12\x1eA\xa62Y\x05\xec-\x88\xec\xb5\xce\xe3\ \xe3\xff\x1d7\x9d\xf74/\xbc\xfa\x03\x9c \xb7i>\xf0\xf7\x0fp\xc1\xdc\xa8\xd9\ \xe0E\xfb\xc8\xc7X\xd62\x1al\x11\xb4\x1b\x1a\xde\xae\x7f\x16\xdb\xa6Q\xe0N\ \xef\xc2\x81\xc6\x02\xc4ol\xca\x07\x00\x92\xa6\x1ef\xc3\xfe\xbdi\xbf\x92R\ \xc4\xf7\xdf\x13\xf2\xc3\xf1\xe1\x13\xbd\tl\xbb}\x01\x06\x80d8#7,/\xed\ \xd8\x86\x9d\xb3\xc5\xbaZ=\xc0\xe2\xddn\xb9\xb6\xae5\x9eiVXy#\x9e\\\n<5@\xc9\ \xf4YF\x9e8\xec\xa0\x9b\x89\xc7^\x8f\x82\xe7~\x85\xda\xb4\x8bI\xc92\xf1\xa0\ \x8f\xe0\xebs\xe0\xc3\xbdk\xf0-\x94O\xffQ\xb1r7\xec\x06\xf5\xeb\xc6\x13\xf0\ \xd1\x7f\xe3\xa3\xb0\xafB|\x178M\xc1uc\x8f^\x1d\x10\x9a\xd3\xb3\xf0l\xd2\xdd\ \x85\xe1\xa0\x00\xa0\x0c\xcc\xd5\x1eg\xb9\xd3\xebdH\xe5{\xad\xf6NQ%\x87\xfb\ \r\xf5k\x87\xed>V\xd9\x12<\x16\nj\xb4\xd9\xba\xe3\xd1i\xcc\xe1\xbbXT\xf8\x99\ \xc7\xc9\xe9\xd9\x13\xed\xcf\x86\xd5\xe6iN\x9b\x9a\xd1\xf9\xeb7O0\xba-\xc5\ \x92.9\xa0\xfe\x9d)\xe9\x9b@\x12\x9bS]\xd3\xb8\xb1z\xc2\xb3\x0b\x14\xd0\x85\ \x8f\xae\xe0`R\xcb \x9a\x8c\xfe\xed\x87\xbf<1\xe1\xb7\x84Tg\xbc\x9f\x99\x848\ \xf8\x84\x9a;\x83\xbc\xd6\x8e\xeei\x14\xcd\x02Q=\xae\x15\xef\x9e\xa1\xb0Z\ \xcd\x08\x98\x8b\xa8\xcf\xbcb\xd2;\x1c\xc3\x1f\xed\x83\xa6$\x00\xfd\xeb\x11d\ -\xec!\xca\x11I\xf9=3\n[\xa2\x1dC\x8c \xf5Q\xcf\xa1\xffqQs\x96\xa1y,\xfc\xc6\ T3\xa8lF\xe8,\xda\xcf\xf8\x0e\xe9\xe1\x98\xc0\xb5f\xbb\xdf\xdcA@\xbd\xb6_n\ \xe0>\xf6K\xf7\x16\xab!\x05\xf5\x08\xbc\x14:\x94F\xfd\xeeq\x01\x00VUe\xdet\ \xff\xda\xdfP\xe1\xe9\xb1\xdbG\x08[\xfapv:\xbc\x1ff\xd0[3&0\x01\xd1\x05N\x1b\ \x12\xdd^\x1c\x0e\x7fp\xb5\x8f\xe1\xe9N\xed\x8f\xc0\xed\t\x9b\xab\xdf\xd7\ 
\xe6\xf6\xe3\xa7\xcf\x1f\\T\x0b\xf4a4\x13\x05\x98\xa3\xcb\xec\x01f\xf6\xa0\ \xf7\x81\x16v:\xb1\xcd\x06\xf9O7\x1au\xeb\r\x89\xa1|,\xd1\xfdH\\I%2\xfe\x0fc\ p\x85;M\xa1\xca\x9d\x907\x1f\x1am\xe9\x0e\r\xce\xb4-[-^\x88\xee\x92I\xd3\xf1\ \xd6\x9d\xc7\xd1\xef-\xec\xb2\xf9\xd6L\x0fY\xbc}\xf7\xfe\xda\x1b3|\xf26p\xd2\ \xb6\xf8\xecQ\x81\t\x9f\xb4J\xdd\xc5\x95\xc5z\x81\xb6\xd4\xc0\xafF;\xe6V\x0b\ \x99\xd2%K\x83\xe8=\xfeq\x1f\xed\xf5Oj\x17W\xedy\xba\xec~\xfax\xfb\xf3\xa2\ \xf5\xc1\x85\xf9\x0c\xe3\xf4M\xa0g\xd1I\xc4\xa6\x1fSE\x01\x0e\xcb\xbe@@\xfc\ \x92\xd1"h\xe3\xca\xeed\xbaY\xb0ai\x01\xbb\x10c\x13\x17\x01\x8e>\xf7>\x99L\ \xc2\xe8\xf9c\x85g\xd8\x99\x83\xf0\x16\xb3\xa7\x0eV\x1cb\xb8\x012\x82\x0bl\ \x1c\xc3e\xf4\x01\xf9\xe1\xf2\xf3S;w\xda\xdd\xb9f\xaf\x1c`\xd7\x9b\x18F\x97U\ Y\xe2\xf7\xc2\xff\x89Iw:\xb0\x1eo\x97\xed\x9a\x1b\x134\xb4\x9a\x14\xfc[\xe7\ \xed\xc3:\x99<\'\xa8W\xb4\xc00O\x97\xfe\xd8\xb16\xe4~L0\x1fY\xe8z\xa3\x96\ \xd5\x07\x12\x16\xd09\x83\xd8`\to\xbf\xd0\xb9\xa3\x0f,h\xf2[\xcd\xc4=\xc0\ \xd0\xd0\xdc\xb6\x02E\xc9\xf4\xc9\x0bT\x9fT\'\xaf^\xb4\xf8\x03b}6\xbc\xb1\ \xd1o\x98\x0f\x88\xd8\x12\xf0\x9b\xcf\xe77\x95J\xc46\xb7\x8d\xaa\x81\xc3\xa7\ \xce\x88\x00\x1bLw\xf6\x99\x06\x06\xe4oB$\xcb]\xf3\xad\xa1\x9d\x12\xbf.v\x9f\ \xc4\xc2u\x8d WB(\xac\xdb\x1d|\x94\xa0\xfa~\xd3\xce\xa6V{\x88\x08N\xf2r\xa8\ \x1b7\xf3j\xd5d\xf8+\xe5\xe6#\xe0\xd0\xcc\x8c\t+l\xa5\xb0&\xcc\xde\x9a\x8f\ \x88\xa9\x7f\x90\xef\x9dk\x03\x92\x96\n2p\x10}\x109lJL\xce&\xe4\xe4|:9\x9f\ \x9e\x9d\x92\xd3\xc9\xe4\xd4\xea~\xd4\x9a\xdb\xce\xa5T1\x1d\x8f\xb7\xdb\xedH\ \xba\tG\xa2\\\x8f=\x01\xf0\x1e\xa7\xef\x9c\xbb\xb6<\xb3\xb3\xcc\xd6&\xe9\x88\ \x8e\x01\xa6\xe5\xe6CX\x1a\xf5\xe1\xa8\xecA\x82\x85\xe9`\x7f\xfd\x9e\x84\x85\ \xde\xe8&\xe4\x15h\xe0\x828b,,\x8d\xb4\x16L4 \xc2\x03\xe2`)"\xd9\x99/\xce\ \xcd\xffv\xf2?\x98y\rX' ) ### end --- NEW FILE: ui_psp.py --- """Resource ui_psp (from file ui.psp)""" # written by resourcepackage: (1, 0, 0) source = 'ui.psp' package = 'spambayes.resources' data = 'Paint Shop Pro Image 
File\n\x1a\x00\x00\x00\x00\x00\x05\x00\x00\x00~BK\x00\ \x00\x00.\x00\x00\x00.\x00\x00\x00\x90\x01\x00\x00F\x00\x00\x00\x7fj\xbct\ \x93X<@\x02\x02\x00\x18\x00\x01\x00\x00\x00\x00\x01\x00@\x90\x02\x00\x00\x00\ \x00\x00\x02\x00\x03\x00\x00\x05~BK\x00\n\x00\x18\x00\x00\x00~FL\x00\x01\x00\ \x0e\x00\x00\x00\xc0\xc0\xc0\x00\n\x00\x00\x00\n\x00\x00\x00\x00\x00~BK\x00\ \x01\x008\x00\x00\x00~FL\x00\x01\x00\x04\x00\x00\x00\xd4\xc3\'>~FL\x00\x02\ \x00\x04\x00\x00\x00u\xe0\'>~FL\x00\x06\x00\x04\x00\x00\x00\x01\x00\x00\x00~\ FL\x00\x07\x00\x04\x00\x00\x00\x04\x04\x00\x07~BK\x00\x10\x00\xcb\x1a\x00\ \x00\x08\x00\x00\x00\x02\x00\x00\x00~BK\x00\x11\x00\x18\x00\x00\x00\x18\x00\ \x00\x00\xc8\x00\x00\x00#\x00\x00\x00\x18\x00\x03\x00\x01\x00\x00\x00\x00\ \x01\x01\x00~BK\x00\x11\x00\x18\x00\x00\x00\x18\x00\x00\x00\x90\x01\x00\x00F\ \x00\x00\x00\x18\x00\x03\x00\x01\x00\x00\x00\x00\x01\x00\x00~BK\x00\x12\x00\ \x8d\n\x00\x00\x0e\x00\x00\x00\x7f\n\x00\x000R\x00\x00\x05\x00\xff\xd8\xff\ \xe0\x00\x10JFIF\x00\x01\x01\x00\x01,\x01,\x00\x00\xff\xdb\x00C\x00\x02\x01\ \x01\x01\x01\x01\x02\x01\x01\x01\x02\x02\x02\x02\x02\x04\x03\x02\x02\x02\x02\ \x05\x04\x04\x03\x04\x06\x05\x06\x06\x06\x05\x06\x06\x06\x07\t\x08\x06\x07\t\ \x07\x06\x06\x08\x0b\x08\t\n\n\n\n\n\x06\x08\x0b\x0c\x0b\n\x0c\t\n\n\n\xff\ \xdb\x00C\x01\x02\x02\x02\x02\x02\x02\x05\x03\x03\x05\n\x07\x06\x07\n\n\n\n\ \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\ \n\n\n\n\n\n\n\n\xff\xc0\x00\x11\x08\x00#\x00\xc8\x03\x01"\x00\x02\x11\x01\ \x03\x11\x01\xff\xc4\x00\x1c\x00\x01\x00\x02\x02\x03\x01\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x07\x08\x05\x06\x02\x04\t\x03\xff\xc4\x003\x10\x00\ \x01\x03\x03\x03\x03\x03\x02\x05\x02\x07\x00\x00\x00\x00\x00\x01\x02\x03\x04\ \x00\x05\x06\x07\x11\x12\x08\x13!\x14"1\x15A\x16#23B\t\x17QRWr\x81\x95\xd2\ \xff\xc4\x00\x1a\x01\x01\x00\x03\x01\x01\x01\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x02\x04\x05\x01\x06\x03\xff\xc4\x00*\x11\x00\x02\x01\x04\x02\ 
\x01\x03\x03\x04\x03\x00\x00\x00\x00\x00\x00\x01\x02\x11\x00\x03\x04!\x05\ \x121\x06AQ"Ba\x132q\x81\x14\x15b\xff\xda\x00\x0c\x03\x01\x00\x02\x11\x03\ \x11\x00?\x00\xf7\xf2\x94\xaf::\xe3\xea\xc3\xad>\x84\xfa\xb2\xcd.\xf8\x8e3\ \xf5|7Sb\xc3\x18jf\xb6\xb9\x0cD\xb9\xb7\x024e-\xa0\x9f\x87\x82\xda$\xb0|8\n\ \x15\xb7\xcdz_Kzc3\xd5\xbc\x83\xe0\xe2\\E\xba\x10\xb2\x87=C\x90\xca\n\x83\ \xf2\x14\x97\xfe\x14\xd7\x9d\xf5/\xa8\xf1}-\x82\xb9\x99H\xcdl\xb0V*\'\xa8!\ \x88b>\x0b\x00\xbf\xcb\n\xbb\x97n\xa3\xb4\xb6\xd7\xafv\x9e\x9a\x9b\xbe\tYm\ \xd2\xda\xfd\xc1v\xf8\xdb+\xd1Fm<\xbb\x8f\x1d\xfd\x85d\x80\x94\xfc\x91\xb9\ \xd8\x0f\'{\xaae\xfd3z\x00\xcet\x9e\xfb3\xab\x9e\xa8/s\xae\x1a\x9d\x942\xe2\ \xccYrJ\x95mi\xed\x8a\xfb\xc7\x7f{\xeb\x00\x02?Ki\xf6\x81\xbe\xfb\\\xda\x8f\ \xaa08^+\x93\x18|m\xff\x00\xd6\x16\xd4\x07\xb9\xf6\xb5\xc9=\xba\x7f\xc0\xd0\ \x07\xde\t\xd83R\xf4\xd6w/\xc9\xf1\xc7+\x90\xb3\xfa%\xd8\x94O\xb9m\xc0\xeb\ \xdf\xfe\xce\xc9\x1e\xd3\x1a\x88\xa5)Us\xab\x1c\xd2\xe7\x8d\xeau\xd9Z\xee\ \xd2\x93\x83\xc7\xb3Es\x06\x87!\xe7Z\xb1\xdd\xa6s&kW7\x1a#y\x1cF\xd1\xe3\xba\ {\x0b\x1e@q\xcfjip\xdcS\xf3\x19\x9f\xa0\x8d\x06\'\xc4\xb1\xd8\x10\xab"N\xe4\ \xecB\x86b`U\xde[\x93N\'\x17\xf5\x99dLy\x804L\xb1\x83\x03P4e\x88P$\xd5\xa3\ \xa5yE\xd2\x1eC\x84O\xce3\xfc\x9a\xdb\x118L\xf9HS\xfayi\xd3+\xd4y\xf7t\\\n\ \x90[\x88\x96\xa2\x1e\xdc\xc6RK\x89)}\x1cKg\x94\x82\x9d\x9br\xbd?\xd2\xc9\ \x1a\x87/Ml2uj\x04\x18\xb9C\x96\x88\xea\xc8#\xdb\x1c+\x8e\xdc\xc2\xd8\xee\ \xa5\xb2\x7f\x88V\xff\x00r?\xc0\x91\xe4\xeez\xbb\xd2\x0f\xe9[\xe2\xd9\xbd\ \xdf\xc0\xd8\nd\xa8mC8`\'\xa9!\x8c6\xbd\xc18\xbe\x96\xf5R\xfa\x9a\xc9qk\xa7\ \x9f\x04\xb0\x80\xc5w*\xa5I\x89\x00\xae\xd7~\xc63\x8e\xba\xd4v\x94\xfb\xee\ \xa5\x08BJ\x96\xb5\xab`\x90>I?aQ\xdc\x1e\xa84\xde\xe3s\xb71\x19\x8b\x81\xb7]\ \xe4\xa9\x8bm\xf0\xa1\x91\x19\xe5%a\xb2\xae=\xce\xf2Q\xdc!\x1c\x94\xd8\x1f\ \xcb~\x1e\xfa\xde\xb2\x1b\x1c,\x9a\xc1;\x1b\xb9s\xf4\xd7\x08n\xc6\x91\xdbV\ \xca\xe0\xe2\n\x15\xb1\xfb\x1d\x89\xf3T\xf3Dt\xe7(\xb2\xea\xbd\xcb\xa0\xbdGM\ 
\xa1\xe9V\xebKY:\xb3\x08\xd2^\xf5\x17\x9bK\x92\x18am\xfau#\x8b.)\xc8q\xf9\ \xa9.\x04\xa1IN\xc1\xc1\xcbz|\x0f\x17\xc7\xf2\x18\xb9\x17/\xb1\x9b`1\x03\xc8\ M\x86q\xf2T\xf5\xd6\xf4|\x1d\x95\xb5\xcd\xf2Y\xf896-\xd9Q\x17\t\x00\x9fw\xd1\ T?\x01\x87m\xfc\x8f#A\xae})Q\xe7V7\xbb\xce9\xd3fkz\xb0\xdc\x9f\x84\xfb\x16\ \x07\xf9\xcf\x8c\xad\x9c\x86\xc9\x1c^\x90\x83\xfcT\xdbEn\x03\xf6(\xdf\xedXXx\ \xcd\x99\x99o\x1dL\x17`\xb3\xf1&+o/!q1n_"B)h\xfe\x04\xd7C \xea\xf7I\xed\x17\ \x95\xd9\xac\xb0r\x1c\x87\xb3=P^\x99\x8e\xe3\xef\xca\x8a%$\x90\xb8\xe8x\x00\ \x87\xddI\x04)\xb6K\x8bI\x04\x10\x08\xda\xb7]<\xd4\xac\'Uq\xff\x00\xc4\xd8%\ \xf13c!\xf5\xc7\x90\x85\xb2\xb6^\x8a\xfa6\xe6\xcb\xcc\xba\x94\xb8\xc3\xa9\ \xdcn\xdb\x89J\x86\xe3q\xe4U[\xeb\xfb\x15\xd1,S.\xe9\xaf\r\x99\xa4q.0\x13\ \xa90l\xd8\xea#]\x1f\x88\xdd\xad\x97\x1c\x8c\x8eHK$\x07@\xe0\xd9\xd9_\xe4\ \x1e|\xaby\xbb\xd3\xc4\xb1u\x9c\xcbx\xe2R\xdf\xe2\x1d:\x97+*m\xaf\x01OD\x9b\ \r\xab{\xcb\x03\xf9\xa9\xb9\x13\x90\x15\xf2\xa4\xb0\x06\xe46\x00\xf5Y\xbc/\ \x15\xfe\xa2\xc6F8\xb8\xadqn8,W\xac[b\x081\xbd\x80H\xd2\xc3BC\xcfq\xe6p\xf9~\ K\xfd\xa5\xfb\x17\xca2\xdbkjB\x86\x99\xb8\xa0\x83\xbdh\x90\x0e\xdaVZV:\x99N\ \xb5\x9f\xefN\x8e\x7f\xab8\xcf\xfd\xf4\x7f\xfd\xd6\xcdPGM\xda\xa4\xee\x9e\ \xf4\x8b\xa3\xb1N\x99\xe5w\xb4I\xd2\xeb\x12\xd3\'\x1e\xb5\xa6R\x10D\x06\x01B\ \xc0p)\x07\xc8>F\xc4\x1f\x04\x90\xa00pp\x17+\x19\xeeAb\x19T\x00@\x9e\xc1\xc9\ 2A\xf1\xd3\xc7\xe7\xf1[y\xb9\xc7\x1b!-\xc8\x00\xab1$\x13\xfbJ\x00 \x11\xe7\ \xb7\x9f\xc5Jlk\x1e\x91Iy\x11\xa3j\xa66\xe3\x8e(%\xb6\xd1|\x8eT\xa5\x13\xb0\ \x00\x05\xf95\xb2UC\xea[A\xb1=RE\xdb\xacWo\x1a\x9da\xba\xdaS\x1e\x03x\x95\ \xeaCqm\xaf\xa7v\xd9\xee\xaa/\x05)C\x8b\xe4\x85w\x07\xe66\x0e\xde\xdd\x8d\ \xbc\xab\x1c\xbf\x1b\x83\x85\x8bb\xee=\xc2\xc5\xbb\x07\x04~\xc7P\xa4\xa8?p\ \x86\x1b\xd0\xaa\xfcW!\x9b\x99\x93z\xdd\xf4\n\x17\xa9B\x0f\xeeV,\x03\x11\xf6\ \xfe\xd3\xaa\xd5\xf5\x1bR\x9b\xd3\xb96\xa36\xd3\xdc\x87:J\x916r\xdf(DF\xd2\ \x01R\xfc%\\\x88I.\x10x\x80\xd3/,\xab\xf2\xf8\xabgJ\x92\xa4\x85%@\x827\x04\ 
\x1f\x9a\xae}Tj\xbeI\xa2z\xafo\xcc\xee\xc8y\xcb\xbf\xe3\xa0m\x8d\x19\x96\x92A\x0b\xe0H\xd7\x92tI\x00\xc1\xab\x1dJR\ \xb0\xebj\x94\xa5)JWV\xe9c\xb2_=?\xd6\xac\xf1fz9H\x93\x13\xd5GK\x9d\x87\xd1\ \xbf\x17Q\xc8\x1e+\x1b\x9d\x94<\x8d\xce\xc6\xbbT\xae\xab2\x99S\x06\xb8\xca\ \xac \x89\x14\xa5)\\\xae\xd2\xb8H\x8f\x1e[\x0b\x8b)\x84:\xd3\x89)q\xb7\x12\ \x14\x95\x03\xf2\x08?"\xb9\xd2\x80\x90i\xe6\xb1\xb6\x0c3\x0f\xc5\x14\xea\xf1\ |R\xdbm/~\xf1\x81\x05\xb6{\x9f\xee\xe0\x06\xff\x00\xf3Y*R\xa4\xee\xf7\x1b\ \xb3\x99?\x9a\x8a"[^\xaa ~)Q\xb5\xdb\xa7v.\x1dW\xd9\xfa\xa6\x89\x96.<\x8bv\ \x15\'\x1c\x97h\x10\x82\x84\xc6\\\x90\x99\x08Yw\x98\xe1\xc1a^\xde\'}\xc7\x91\ \xb7\x99&\x95c\x173\'\t\x9c\xd9h\xec\xac\xa7@\xca\xb0\x827>G\xbf\x91\xe4A\ \xaf\x86N&>b\xa8\xbc\xb3\xd5\x83\x0f"\x19L\x83\xaf\x83\xfd\x1f\x07T\xaf\x94\ \xd8P\xeeP\xdd\xb7\\b6\xfcy\r)\xb7\xd8y\x01Hq\n\x1b)*\x07\xc1\x04\x12\x08?5\ \xf5\xa5V\x04\x83"\xac\x10\x08\x83U\xb7S\xfa\x04\xbe\xe6\x17<1\xacW[\xd1\x16\ \xcb\xa7\xf9\x137\x9cB\xd5\x92c\xce\\\x97jy\xa5%M\xb2\xd3\xedL\x8c\xa5\xb2\ \x8e\t\tC\xe1\xe2\x90\x02y\x14\x80\x04\xc7\xa5\x9aC\x0bM\xdd\xb9_\xeeY\x1c\ \xec\x83#\xbe-\xb5\xdf2+\xa0@zHl(4\xca\x10\xdaR\x86Xl)|\x1a@\x00sZ\x8f%\xadk\ V\xdfJ\xd9\xcc\xf5\x0f/\x9f\x88\xb8\xd7\xeeJ,\xeb\xaa\x83\xb3\xd8\xc9\x00\ \x13-\xf5\x19&[\xea2wY8\x9c\x0f\x15\x83\x94\xd9\x16m\xc3\x98\xfb\x98\x8d\x0e\ \xa2\x01$\x08_\xa4@\x10\xbfH\xd6\xa9QV\x07\xa0\x19\x96\x1f\xa3\x98\xb6\x8b\ \xde2\xbc>\xfdo\xc5lPm\xb1\\\xbb\xe0\xae;\xdd\xf4\xd1\xd2\xc2^(T\xe2\x94\xac\ \xa5\'\xe3\xe3\x91\x15*\xd2\xa8c\xe7d\xe2\xdbd\xb4@\x04\x82t\x0e\xd4\x10\x0e\ \xc1"\x03\x1f\x1f;\xf6\xab\xb9\x18X\xd97\x03\xdc\x12@#\xc9\x1a0H\xd1\x13=G\ \x9f\x8f\xe6\xa2\xcc\xbf@r\xcc\x9fN\xe7\xe9\x95\xbb\'\xc4\xac\x96\xdb\x8b\ \xed;)6l\x1d\xc6\tR\x1dmd\x80&\xf1\xe4CiO"\x0f\x8d\xbev\xda\xa5:R\x99\x19\ \xd99H\x12\xe9\x90\t>\x00\xdb@\'@L\xf5\x1e~)\x8f\x85\x8d\x8a\xe5\xed\x88$\ \x01\xe4\x9d\t l\x98\x8e\xc7\xc7\xcd)JUJ\xb5J\xc3\xdd\xf0lz\xf9\x96\xd9\xf3k\ 
\x84w\x15>\xc6\x1f\xfaz\xd0\xf1JGy\x1c\x16T\x07\xeb\xf6\xee\x00;\x81\xc8\x9d\ \xb7\xf3Y\x8aT\xed\xdc\xb9i\xa5\x0c\x18#_\x04A\x1f\xd8$\x1a\x83\xdbK\x82\x1c\ H\x90\x7f\xb0d\x1f\xe8\x89\x14\xa5)P\xa9\xd2\x94\xa5)JR\x94\xa5)JR\x94\xa5)J\ R\x94\xa5)JR\x94\xa5)JR\x94\xa5)JR\x94\xa5)JR\x94\xa5)JR\x94\xa5)JR\x94\xa5)\ JR\x94\xa5\x7f\xff\xd9~BK\x00\x12\x00\xde\x0f\x00\x00\x0e\x00\x00\x00\xd0\ \x0f\x00\x00HH\x01\x00\x08\x00\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00\ \x01,\x01,\x00\x00\xff\xdb\x00C\x00\x05\x03\x04\x04\x04\x03\x05\x04\x04\x04\ \x05\x05\x05\x06\x07\x0c\x08\x07\x07\x07\x07\x0f\x0b\x0b\t\x0c\x11\x0f\x12\ \x12\x11\x0f\x11\x11\x13\x16\x1c\x17\x13\x14\x1a\x15\x11\x11\x18!\x18\x1a\ \x1d\x1d\x1f\x1f\x1f\x13\x17"$"\x1e$\x1c\x1e\x1f\x1e\xff\xdb\x00C\x01\x05\ \x05\x05\x07\x06\x07\x0e\x08\x08\x0e\x1e\x14\x11\x14\x1e\x1e\x1e\x1e\x1e\x1e\ \x1e\x1e\x1e\x1e\x1e\x1e\x1e\x1e\x1e\x1e\x1e\x1e\x1e\x1e\x1e\x1e\x1e\x1e\x1e\ \x1e\x1e\x1e\x1e\x1e\x1e\x1e\x1e\x1e\x1e\x1e\x1e\x1e\x1e\x1e\x1e\x1e\x1e\x1e\ \x1e\x1e\x1e\x1e\x1e\x1e\xff\xc0\x00\x11\x08\x00F\x01\x90\x03\x01"\x00\x02\ \x11\x01\x03\x11\x01\xff\xc4\x00\x1c\x00\x01\x00\x03\x00\x03\x01\x01\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x05\x06\x07\x01\x03\x08\x02\x04\xff\xc4\x00\ F\x10\x00\x01\x04\x02\x01\x02\x03\x05\x03\x06\x08\x0f\x01\x00\x00\x00\x01\ \x00\x02\x03\x04\x05\x11\x06\x12!\x07\x131\x14\x15"AQ\x162a#Bq\x81\x91\xc1\ \x083CV\x95\xa1\xb2\xd2\x17$&78FRbfuv\x84\xa5\xb1\xb4\xc2\xff\xc4\x00\x1b\ \x01\x01\x00\x03\x01\x01\x01\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x04\ \x05\x06\x03\x02\x01\x07\xff\xc4\x001\x11\x00\x02\x01\x02\x04\x04\x04\x06\ \x02\x02\x03\x00\x00\x00\x00\x00\x00\x01\x02\x03\x11\x04!1Q\x05\x12Aa\x13"q\ \xa12R\x81\x91\xb1\xf0\x14B\x06\xc1\xd1\xe1\xe2\xff\xda\x00\x0c\x03\x01\x00\ \x02\x11\x03\x11\x00?\x00\xf6Z" \x08\x88\x80"\x80\xe7|\xb3\x15\xc3\xb0o\xc9\ \xe4\xe4\xd9;l\x104\xfcs?\xfd\x96\xfe\xf3\xf2U\xff\x00\x03y>O\x97q\xdc\xa6c(\ \xf6\xf9\x8e\xca\xc8\xc8\xa3o\xdd\x8a1\x14D0~\x03g\xf4\x92J\x9b\x1c\x05ia\ 
\xa5\x8a\xb5\xa0\x9aW\xdd\xf6"K\x1dF8\x85\x86\xbf\x9d\xab\xdbe\xdc\xbf\xa2"\ \x84K\x08\x88\x80""\x00\x88\x88\x02" \x08\x8b\xe2yc\x82\t\'\x99\xed\x8e8\xda\ ^\xf78\xe84\x01\xb2J%p\xf2>\xd1R(\xe7\xf2\x1c\xa4\xcd6.\xf4\x98\x9ct\x10\xb6\ ^\xb8\xe1l\x96g\xea`\x90t\x875\xc1\xa0F\xf8\xdc~\x07\x1d\xc8\x07mw\xe3\x8d\ \xf2[q\xe4\xebR\xc8^fB\xa5\xc0\xd3Z\xdf\xc1\xd5\xf1o\xa0\x971\xadk\xda\xe2\ \xd77}--p\x00\x83\xd4\xd2\xa7\xcb\x87U\x8aw\xb5\xd6\xab;\xfe-\xa7\x7fr\x12\ \xc7\xd2mZ\xf6z>\x9f\xf3\xed\xec^\x11\x11@&\x84D@\x11Rs\x9c\xeagf\xa6\xc0q\ \x1c4\x99\xfc\x9c\x07V\\\xd9\x04u\xaa\x9f\xa3\xe4=\xba\xbf\xdd\x1fB=F\x94T\\\ \x83\xc4\x87d_N98\x05\xabq\xec\xbf\x1d\r\xe9}\xa9\xa0z\x82\t\xd0\xfd\x8a\xc6\ \x1c.\xbc\xa3\xcd&\xa3\x95\xf3i;on\x8b\xbb\xb2+\xe7\xc4\xa8\xc6V\x8d\xe5\xd3\ $\xda\xbe\xd7\xea\xfb+\x9aZ*\x8f\x14\xe6\xd1d\xf2\xae\xc0\xe6\xb1\x960Y\xd6\ \xb7\xaf\xd8\xec89\xb34z\xba\'\x8e\xcf\x1f\xa3\xf1\xf5\xd1V\xe5\x12\xbe\x1e\ \xa5\trTV\x7f\x9e\xe9\xe8\xd7tJ\xa3^\x9dx\xf3Sw_\xb95\xaa}\x98DE\xc4\xec\x11\ @\xfd\xb5\xe1\xbf\xce\xdc\x0f\xf4\x8c_\xdeO\xb6\xbc7\xf9\xdb\x81\xfe\x91\x8b\ \xfb\xcaG\xf11\x1f#\xfb3\x87\xf2\xa8|\xeb\xee\x89\xe4P?mxo\xf3\xb7\x03\xfd#\ \x17\xf7\x97en]\xc5,\xd8\x8e\xb5nO\x84\x9ai^\x19\x1cq\xdf\x89\xce{\x89\xd0\ \x00\x07l\x92{i\x1e\x12\xba\xcd\xc1\xfd\x98X\xaa/%5\xf7D\xd2"(\xe7p\x88\xa8\ \x14%~\x1b\x90\xe43W\x04\xb2yv\xacW\xcaH^O\x95\x13\xde\xd9*\xca\x01\xfeM\x91\ \xfc\'^\x84\xb8\xfc\x9c\xa4P\xc3\xf8\xcaVy\xaf\x7f\xdd=Z]H\xf5\xeb\xf8M]d\ \xfd\xbfu\xf4M\xf4/\xe8\xb8k\x9a\xe6\x874\x874\x8d\x82\x0e\xc1\x0b\x95\x1c\ \x90\x11\x11\x00EX\xces\\n2\xd8\xab\x15<\x8eNO\xec\xa3\xf3\xae5\xc6x\xcf5\ \x8b\x1c\xc7d\xa2e\xa0\xd9\xea_\x92\xe0\x92\xbb\x9a\xe2t\x1c\xd6\xb5\xaf\r\ \x04h\xb8<\x9d\r\xe8\xfa-\x02\x99\xe6\x975\xc3\xd3\xe5\xbd\x1f\x967[\x1d\xc7\x85\xd8\ \x05\x8eP\xdb\xb4"\x90\x98\xa8\xf4I\x1c\xc7g\xbbIp\xf2X\x0e\x86\xdc\x1e\xef\ \xa8\x07\xd1l\xb4\xf9._7\x81o\x17\xe0p\xc9r\xd4\xee\x95\xd9\x0c\xdb\xa3tuk\ 
\xbeW\xbaILe\xc3n=Ov\xbboZ#\x7f-G\x14\xa7QS\xa5\x07\xe7K\xe2sMF\xdb\xe7\x9a~\ \x8f_\x87\xaa3|6t\xddJ\x92^G\xfdTZn\xfbe\x93^\xbd>.\x8c\xd3\xb8\xbes\x1f\xc9\ 0u\xb3\x18\xb9|\xca\xd6\x1b\xb1\xbe\xcei\xf4-p\xf9\x10{)5\x05\xc0\xf8\xbd\ \x1e!\xc6\xab\xe1h\x12\xf0\xcd\xbeY\\4e\x90\xfd\xe7\x1f\xd8\x00\x1f \x00S\ \xab\xf3\xccJ\xa4\xabIQw\x8d\xdd\xaf\xad\xba\x1b\xdc;\xaa\xe9E\xd5^kgm\xc2\ \x8c\xe5\x94\xa6\xc9q\\\xbe:\xb1\xd4\xf6\xa8\xcd\x04gz\xf8\x9c\xc2\xd1\xfdeI\ \xa2\xe7Nn\x12RZ\xa3\xa4\xe0\xa7\x17\x17\xd4\xc3|\x1c\xc9A.W\x16\xeb6\xd9J\ \xcdV:\xac\xf0\xcczIx\x828]\x11\xdf\xa3\xb5Z\xbb\xc0>\xbf\x94\x03\xbb\n\xb4\ \xe5\xf1\x91\xb3\x97bp\xb8\xfe\xe6\x19\x9da\xc0k`>\xecv\xe4q\x1f\x9a\xc6\xf9\ A\x80\x9e\xc4\xca\x00\xde\x8e\xa2\xbf\x84_\x17\xc63\x89[\xe4\xf4j\xb6\xaeI\ \x92\xc6,\xd8\x84\x96\x19cq\xe9\xd3\x80:wr\xde\xe4o\xb2\xb7x6\xdcd\x9e\x1d\ \xe2nc(W\xa8\xd9\xe0\x02A\x14a\xa5\xcea,%\xc4\x0f\x88\xec\x1e\xe7\xb9Z\x9cex\ J\x82\xe2\x14\xaff\xf9yz)Y\xbdz\xac\xdd\xb2\xfb\x19\x9c%\t\xc6\xb3\xc0T\xb5\ \xd2\xe6\xe6\xea\xd5\xd2\xd3\xa6\x8a\xf9\xfd\xcb\x82",\x99\xa8\n\xa5\xe2\xce\ f\xee\x1f\x87H1\x8e\xe8\xc9_\x9e:\x14\xdd\xbdt\xc9)\xd6\xff\x00H\x1dD~ +j\ \xa1x\xde\x1d\x07\x1d\xc4\xe6zK\xa1\xc3\xe6\xea_\x9c\x01\xbf\x81\xae-?\xdb\n\ w\x0c\x84g\x8b\xa7\x19+\xe6\xb2\xdfe\xf5y\x10\xb8\x8c\xe5\x0c-IE\xdb-v\xdd\ \xfd\x11e\xe1\xdcz\x87\x17\xe3\xf5\xf1\x18\xf6\x00\xc8\x9b\xb9$#\xe2\x9aC\ \xf7\x9e\xe3\xf3$\xfe\xe1\xe8\x16)\xc4<5\xafG\xc5J\xf37\x9dal\x9a\x97]+k\xc5\ l:\xec\x8ea.,{>G\xb1\xea\xee{o\xb2\xf4\x0cR2X\x99,Ok\xd8\xf6\x875\xcd;\x04\ \x1fB\x17\x9ax\x1f\xfaPX\xff\x00\x9bd\xbf\xb3:\xbb\xe0\x951\x15!\x8b\x9a\xa9\ g\xc8\xdb\xc9;\xeb\xb9O\xc6)\xd0\xa7<,\x1c.\xb9\x92Y\xb5m66\xff\x00\x13\xb8\ \xe3y\x07\x1a\x95\xf5\xc9\x87+@\x1bX\xeb,\xec\xf8\xa6op\x01\xfa\x1dh\xfe\xdf\ \x90_\xbb\x81\xe6\xfe\xd1\xf0\xec^h\x80\xd7\xda\xae\x1d \x1e\x81\xe3\xe1x\ \x1f\x87P+\xf6\xe7\xf2Pa\xf0wr\xb6\\\x1b\rH\x1f+\x89\xf9\xe8o_\xaf\xd1W<\x14\ \xa167\xc2\xdc\x15i\xda[!\x81\xd3h\xfa\x81#\xdd \xfe\xa7\x05Qw>\x1f\xe6\xfe\ 
\xb2I}Sr\xfcG\xef\xdc\xb5\xb2\x8e;\xcb\xfd\xa2\xef\xf4j\xdf\x97\xf6\xec\\Q\ \x11V\x96\x05\x0b\xf8>\xff\x00\x9a\x1c\'\xfd\xc7\xff\x00D\x8a\xfa\xb3\xbf\ \x03`\x96\xd7\x82\xd8x \xbbb\x8c\x8e\xf3\xf5<\ra{5fC\xd8=\xaeo\x7fN\xe0\xfa\ \xfd{\xae\xff\x00\xf2\xa7\xfe=\xff\x00\xc2\xab\x9c~\x1b\xc6\xc7W|\xf1^yj\xed\ \xd5\x95\x18\x1cG\x85\x82\xa0\xb9\x1b\xf2GE~\x88\xec\xf1\xab\x1d\xcbr|Z\xb4\ \x1c6Kq\xe4\x1by\x8f\x90\xd6\xb6+\xbb\xca\xe8\x90\x1d\xb8\xb9\xbb\x1dE\xbd\ \xb7\xf4\xfa,\xbe\x86\'\x9eb\xfd\xcb\xf6\xda[\xf2y\xbc\xaf\x15\xec\x9e\xd5|Y\ \xd6\xbc\xfe\xbdi\xee\xe9\xf5g\xd3}\xbe\x8b\\\xe3\xfe\xff\x00\xf7\xbc\x1e\ \xdb\xf6\xbb\xd9\xfe.\xbfn\xf7_\x93\xf7N\xba\xbc\x8f\xcaz\xeb]?=o\xb6\xd4\ \x7f\x8c\x7f\xeag\xfdWG\xff\x00\xda\x9f\xc3\xf1\xb3\xa1\xcb\x82\xb4$\x9d\xdf\ 2\xcd\xef\xaf\xd3m\x08X\xec\x1c+\xdf\x19y\xa6\xac\xac\xf2[i\xf5\xdfR\xfa\x88\ \x8b.i\x08^]\x91\xb3\x8f\xadM\xb5\xa5\x8a\xb1\xb7m\xb5\x9djVu\xb6\xb8-q\xea\ \xd6\xc6\xc9-\x0c\x1f.\xa7\xb7\xd7\xd0\xc2\xf1\xbb\xd4\xebr\xeb8\xfb\xf7r\ \xb62W\xe0kc7\xd9]\x8d\x928\xba\xdd\xf02 \x1c\xdf\xe3\x1c\x7f(\xd0H\x1d\xbd\ \x14\xe76\xc4\x1c\xef\x15\xc8b\xe3\xe8\xf3\xa5\x8b\xaa\x02\xf0\x0bD\xad!\xf1\ \x97\x02\x08 =\xad$\x10A\x1d\x96=\x94\xc9Q\xc5Tl\x14\xf7\x8e\x97\x1f}\x92H\ \xcb\xf3U\x85\xd0\xc8\xc9\x00>]j\xa1\xady::|\xbd \x8fC\xdfj\xfb\x86\xe1\xa1\ \x8a\xa0\xe1\x1f\x89\xe4\xf7\xe8\xd3\xbe\xba\xea\xbbnRq\x0cD\xb0\xd5\x94\xde\ \x9a\xf6\xd9\xabi\xa6\x8f\xbe\xc6\x9bi\xd9>!\x8f\xb1%j\xd5.`k~[O\xb2\xe8\xe6\ \xa9\x0f\xab\xda\xd6\x868=\xad\x1b-\x05\xcd\xd0\xf8~@\xabb\xa8\xe5\xadd\xb9f\ &\xe6;\x0f\x8e\x88b\xaeDk\xc9z\xe4\xcf\x84\xbd\x8f\x1a{\xa2\x8f\xcb%\xe04\ \x9d\x17\x16\x02Om\x8e\xea\xdc\xabqI\xa8\xc5\xcd%<\xef\xbfK_g\xad\xf4{\x96\ \x18f\xb9\x9a\x83\xf2\xe5o}7Zn\xb6\n\xb9\xce+\xd7\xb8\xfc\x15\x1b\xd0Gb\x85\ \x8c\xa3Yf)X\x1c\xc7\xb4C+\x98\x1c\x0fb<\xc6\xc7\xeb\xf8+\x1a\xfc\xf9\x1a4\ \xb2U\x1fO#N\xbd\xca\xcf\xd1t3\xc4$c\xb4v6\xd2\x08=\xfb\xae4*xu\x14\xbf}Wu\ \xaa;W\xa7\xe2A\xc7\xf7\xd3\xeaWy\xe7\x1b\xafw\x84I\x8a\xc6\xd0\xae\xd8\xea\ 
\xb9\x93\xc3J6\x16C\'C\xba\x8ce\x8c\xd6\xc1\xef\xf0\xf6\xd9\xd7q\xea\xa9\x10\ d\xb2\x12{\x06n\x95\xa6I\x8d\xc2\xcc\xcb\x12\xd6c\xe0h\xa7\x0fId\xc0\xc3\x14\ G\xa3Q9\xe41\xd3\x97lo\xa4\xebJ\xf4Y\x17\x11\xb7\xd7\x14m\x8b\x8fXp\xebk\x06\ \x99BC\xdb\xa8\x0f\xcd\x89\xdd\xb7\xae\xcdw\x7fG\x12:1\xb8\xe7\xe6\xaed\xeb\ \xe6\xb2\xf9\x0bM\xadvF\xba\x811\xc7\x07\x94O\\C\xe0`s\xdb\xd0\xe6\x83\xd4\ \xe2\t\x0e\x07}\xc2\xb8\xc3b<*O\x9f\xcd\rn\xf5\xcf)e\x9e}\x1ek^\xa9\xa2\xa7\ \x11C\xc4\xa8\xb9<\xb2\xd2\xcbL\xb3Y\xe5\x97U\x93\xd3\xa3L\xb7""\xa1.\xc2" \ \x08\x88\x80""\x00\xa3\xfd\xc9\x89\xf7\xf7\xbf\xbd\xdf_\xde~W\x93\xed=?\x1fG\ \xd3\x7f\xd5\xbf]v\xf4R\x08\xbdFr\x8d\xf9]\xaey\x94#+s+\xd8""\xf2z\x08\x88\ \x80""\x00\xba\xec\xd7\x82\xd4.\x82\xcc\x11O\x13\xbe\xf3$`sO\xe9\x05v"\xfa\ \x9bN\xe8\xf8\xd2y2\x11\xbcC\x89\xb2_5\xbc_\x08\xd97\xbe\xb1B \x7foJ\x99\x8a\ 6E\x1bc\x89\x8dc\x1a4\xd6\xb4h\x01\xf8\x05\xf4\x8b\xdc\xea\xd4\xa9\xf1\xc9\ \xbfVy\x85(C\xe1I\x04D\\\xcfa\x11\x10\x15\x7f\x16q\xef\xc9\xf8o\x9e\xa9\x1cn\ \x92CQ\xd21\x8d\x1b.s>0\x00\xf9\x9d\xb5V\xff\x00\x83U\x89f\xf0\xc2(\xa4c\xda\ +\xdb\x9a6u\rl\x12\x1d\xb1\xf5\x1bq\x1f\xa4\x15\xa6"\x9f\x1csX)a\x1crrR\xbe\ \xd9[B\x0c\xb0W\xc6G\x14\xa5\xa4\\m\xbew\x08\x88\xa0\x13\x82\xe8\xc8\xd3\xad\ \x90\xa1=\x1b\xb0\xb6j\xd6#tr\xc6\xefG4\x8d\x10\xbb\xd1}M\xc5\xdd\x1f\x1aMY\ \x99\x9e>\xcf#\xf0\xe0{\xae\xee:\xee\x7f\x8cF\x7f\xc4\xeeTg\x99f\xa3>Q\xc8\ \xcf\xceh\xf98z\x0f\xd4\x06_\xc6\xaf\xe1q\xde1\xcb\xcb\xdd\x9b\x82\xcc\x0f\ \xbdn\xc0\xa5\x05[&\xd6\xa5\x12\x06\xb7\xa4\xc4\x1b\xd5\xf1\x8d\xfc_^\xe5zq\ \x15\xf6\x1f\x8d\xc6\x9a\xa9\xcfO\xcd5i4\xed{\xf5i\xa7\x9f\xa5\x8aLG\x07\x95\ GO\x92\xa6Pw\x8aj\xf6\xb7{\xac\xbdnf\xb2S\xcf\xf8\x8fr\x0f|cl`\xf8\x9c\x12\t\ M;\x1d\xac\xe4\x1c\x0e\xda$h\xfb\x8c\x07\xbe\xbe\x7f\x8fb\xdd%\xadkZ\x1a\xd0\ \x1a\xd04\x00\x1d\x80\\\xa2\xaa\xc4\xe2\x9dkE.X\xad\x12\xf7\xf5o\xab\xff\x00\ VE\x9e\x1f\x0c\xa8\xdeM\xf3I\xea\xdf\xeeIt_\xec""\x8aI2\xcf\x03yG\x1a\xc7x[\ 
\x87\xa7\x90\xe4X\x8a\x96c\xf3\xfa\xe1\x9e\xecl{w<\x84m\xa4\xecl\x10\x7fZ\ \x9e\xc8]\xf0\xa3#q\xf72\x16\xf8U\xbb2k\xaei\xe4\xaa\xf7\xbb@\x01\xb7\x1e\ \xe7@\x01\xfa\x95\xd5\x15\x95lu:\x98\x89\xd7\x8ce\x17&\xdeR\xdd\xdf\xe5+\xe8\ \xe0\xe7N\x84(\xb7\x16\xa2\x92\xce;+|\xc5+\x1fw\xc2\x8cu\xc6\\\xc7\xdb\xe1U,\ \xc7\xbe\x89\xa0\x92\xab\x1e\xdd\x82\x0e\x9c;\x8d\x82G\xebP\x9e\'r\x1c\x06Z\ \xc7\x0e\xad\x8a\xcec/\xcc\xdeSE\xee\x8e\xb5\xb6J\xe0\xd0^6CI:\xd9\x03\x7f\ \x88Z\x82%,l)\xd5U\\d\xda\xde_\xf9\x150s\x9d\'I4\x93\xda?\xf6\x11\x11V\x96\ \x01\x11\x10\x04D@\x11\x11\x01\xf3,l\x96\'E+\x1a\xf8\xde\x0b\\\xd7\r\x87\x03\ \xea\x08U\xde\x17\x81\xbd\x873I\x90\xb3\x0c\xd2{4\x14\xa1\x11\x03\xa1\x0c\ \x1d}\x05\xc4\xfa\xbc\xf5\x9d\xfe\xa1\xdf[VD]cZp\x84\xa9\xad\x1d\xaf\xf49J\ \x94e8\xcd\xea\xbf\xd8DE\xc8\xea\x11\x11\x00DD\x01\x11\x10\x04D@\x11\x11\x00\ DD\x01\x11\x10\x04D@\x11\x11\x00DD\x01\x11\x10\x04D@\x11\x11\x00DD\x01\x11\ \x10\x04D@\x11\x11\x00DD\x01\x11\x10\x04D@\x11\x11\x00DD\x01\x11\x10\x04D@\ \x11\x11\x00DD\x01\x11\x10\x04D@\x11\x11\x00DD\x01\x11\x10\x04D@\x11\x11\x00\ DD\x01\x11\x10\x04D@\x11\x11\x00DD\x01\x11\x10\x04D@\x11\x11\x00DD\x01\x11\ \x10\x04D@\x11\x11\x00DD\x01\x11\x10\x04D@\x11\x11\x00DD\x01\x11\x10\x04D@\ \x7f\xff\xd9~BK\x00\x03\x00P\x06\x00\x00~BK\x00\x04\x00\x81\x01\x00\x00\x83\ \x00\x00\x00\n\x00Background\x01\x00\x00\x00\x00\x00\x00\x00\x00\x90\x01\x00\ \x00F\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x90\x01\x00\x00F\x00\x00\ \x00\xff\x00\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x01\x00\x00\x00\x00\x00\x00\xff\xff\x00\x00\xff\xff\x00\x00\xff\xff\x00\x00\ \xff\xff\x00\x00\xff\xff\x00\x00\xff\xff\x00\x00\xff\xff\x00\x00\xff\xff\x00\ \x00\xff\xff\x00\x00\xff\xff\x08\x00\x00\x00\x01\x00\x03\x00~BK\x00\x05\x00H\ \x00\x00\x00\x10\x00\x00\x008\x00\x00\x00HH\x01\x00\x00\x00\x01\x00x\x9c\xec\ 
\xc11\x01\x00\x00\x00\xc2\xa0\xfe\xa9g\r\x0f\xa0\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00>\x0c\x00\x00\xff\xff\x03\x00\xa7\x88\xf8\xf5~BK\x00\x05\x00H\x00\x00\ \x00\x10\x00\x00\x008\x00\x00\x00HH\x01\x00\x00\x00\x02\x00x\x9c\xec\xc11\ \x01\x00\x00\x00\xc2\xa0\xfe\xa9g\r\x0f\xa0\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ >\x0c\x00\x00\xff\xff\x03\x00\xa7\x88\xf8\xf5~BK\x00\x05\x00H\x00\x00\x00\ \x10\x00\x00\x008\x00\x00\x00HH\x01\x00\x00\x00\x03\x00x\x9c\xec\xc11\x01\ \x00\x00\x00\xc2\xa0\xfe\xa9g\r\x0f\xa0\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00>\ \x0c\x00\x00\xff\xff\x03\x00\xa7\x88\xf8\xf5~BK\x00\x04\x00\xbb\x04\x00\x00}\ \x00\x00\x00\x04\x00Text\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\xff\x00\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x01\x00\x00\x00\x00\x00\x00\xff\xff\x00\x00\xff\xff\x00\x00\xff\xff\x00\x00\ \xff\xff\x00\x00\xff\xff\x00\x00\xff\xff\x00\x00\xff\xff\x00\x00\xff\xff\x00\ \x00\xff\xff\x00\x00\xff\xff~BK\x00\r\x00,\x04\x00\x00\x08\x00\x00\x00\x01\ \x00\x00\x00~BK\x00\x0e\x00\x1a\x04\x00\x00\x14\x00\x00\x00\x00\x00\x01\x00\ \x07\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00U\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x1c@\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00B@\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xf0?\ \x08\x00\x00\x00\x0c\x00\x00\x00\x06\x00\x00\x00\x03\x00\x08\x00\x00\x00\x00\ \x00\x00\x00\x06\x00\x00\x00\x02\x00Y\x00\x00\x00\x08\x00Webdings\x00\x00\ 
\x00\x00\x90\x01\x00\x00\x02\x00\x00\x005\x00\x00\x00\x00UUUUUU5@\x00\x01\ \x00\x01\x00\x00\x01\xd7\xa3p=\n\xd7\x1c@\xd7\xa3p=\n\xd7\x1c@\x00\x00\x00\ \x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\ \x00\x00$@~BK\x00\x0f\x00\x12\x00\x00\x00\x06\x00\x00\x00\x01\x00\x0c\x00\ \x00\x00\x00\x00\x00\x00\xff\xff\xff\xff~BK\x00\x0f\x00\x12\x00\x00\x00\x06\ \x00\x00\x00\x01\x00\x0c\x00\x00\x00\x00\x7f\xf6\x00\xff\xff\xff\xff~BK\x00\ \x13\x00-\x00\x00\x00-\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x06\x00\x00\x00\x01\x00\x08\ \x00\x00\x00a\x00\x00\x00\x06\x00\x00\x00\x02\x00Y\x00\x00\x00\x08\x00Webdin\ gs\x00\x00\x00\x00\x90\x01\x00\x00\x02\x00\x00\x00%\x00\x00\x00\x00\xde\xdd\ \xdd\xdd\xdd\xdd-@\x00\x01\x00\x01\x00\x00\x01\xd7\xa3p=\n\xd7\x1c@\xd7\xa3p\ =\n\xd7\x1c@\x00\x00\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\ \xf0?\x00\x00\x00\x00\x00\x00\x00$@~BK\x00\x0f\x00\x12\x00\x00\x00\x06\x00\ \x00\x00\x01\x00\x0c\x00\x00\x00\x00\x00\x00\x00\xff\xff\xff\xff~BK\x00\x0f\ \x00\x12\x00\x00\x00\x06\x00\x00\x00\x01\x00\x0c\x00\x00\x00\x00\x7f\xf6\x00\ \xff\xff\xff\xff~BK\x00\x13\x00-\x00\x00\x00-\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x06\ \x00\x00\x00\x01\x00\x08\x00\x00\x00N\x00\x00\x00\x06\x00\x00\x00\x01\x00\ \x08\x00\x00\x00L\x00\x00\x00\x06\x00\x00\x00\x01\x00\x08\x00\x00\x00i\x00\ \x00\x00\x06\x00\x00\x00\x02\x00Y\x00\x00\x00\x08\x00Webdings\x00\x00\x00\ \x00\xbc\x02\x00\x00\x02\x00\x00\x00%\x00\x00\x00\x00\xde\xdd\xdd\xdd\xdd\ \xdd-@\x00\x01\x00\x01\x00\x00\x01\xd7\xa3p=\n\xd7\x1c@\xd7\xa3p=\n\xd7\x1c@\ \x00\x00\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\ \x00\x00\x00\x00\x00$@~BK\x00\x0f\x00\x12\x00\x00\x00\x06\x00\x00\x00\x01\ 
\x00\x0c\x00\x00\x00\x00\x00\x00\x00\xff\xff\xff\xff~BK\x00\x0f\x00\x12\x00\ \x00\x00\x06\x00\x00\x00\x01\x00\x0c\x00\x00\x00\x00\x7f\xf6\x00\xff\xff\xff\ \xff~BK\x00\x13\x00-\x00\x00\x00-\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x06\x00\x00\x00\ \x01\x00\x08\x00\x00\x00\x9b\x00\x00\x00\x06\x00\x00\x00\x02\x00Z\x00\x00\ \x00\t\x00Wingdings\x00\x00\x00\x00\x90\x01\x00\x00\x02\x00\x00\x00%\x00\x00\ \x00\x00\xde\xdd\xdd\xdd\xdd\xdd-@\x00\x01\x00\x01\x00\x00\x01\xd7\xa3p=\n\ \xd7\x1c@\xd7\xa3p=\n\xd7\x1c@\x00\x00\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\ \x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\x00$@~BK\x00\x0f\x00\x12\x00\ \x00\x00\x06\x00\x00\x00\x01\x00\x0c\x00\x00\x00\x00\x00\x00\x00\xff\xff\xff\ \xff~BK\x00\x0f\x00\x12\x00\x00\x00\x06\x00\x00\x00\x01\x00\x0c\x00\x00\x00\ \x00\x7f\xf6\x00\xff\xff\xff\xff~BK\x00\x13\x00-\x00\x00\x00-\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ \x00\x00\x00\x06\x00\x00\x00\x01\x00\x08\x00\x00\x00C\x00\x00\x00\x06\x00\ \x00\x00\x01\x00\x08\x00\x00\x00D\x00\x00\x00\x08\x00\x00\x00\x00\x00\x00\ \x00' ### end From montanaro at users.sourceforge.net Sat Jan 18 06:51:54 2003 From: montanaro at users.sourceforge.net (Skip Montanaro) Date: Sat Jan 18 09:51:57 2003 Subject: [Spambayes-checkins] spambayes proxytee.py,1.1,1.2 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv24266 Modified Files: proxytee.py Log Message: add prob= flag to allow random subset of messages to be passed to trainer Index: proxytee.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/proxytee.py,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** proxytee.py 16 Jan 2003 17:38:28 -0000 
1.1 --- proxytee.py 18 Jan 2003 14:51:52 -0000 1.2 *************** *** 5,9 **** web browser and write it to standard output. ! usage: %(progname)s [-h] [-n] [-s server] [-p port] Options: --- 5,9 ---- web browser and write it to standard output. ! usage: %(progname)s [-h] [-n] [-s server] [-p port] [-r N] Options: *************** *** 12,15 **** --- 12,16 ---- -s, --server= - provide alternate web server (default %(server)s) -p, --port= - provide alternate server port (default %(port)s) + -r, --prob= - feed the message to the trainer w/ prob N [0.0...1.0] """ *************** *** 18,21 **** --- 19,23 ---- import mimetypes import getopt + import random from spambayes.Options import options *************** *** 91,99 **** server = "localhost" port = options.html_ui_port try: ! opts, args = getopt.getopt(argv, "hns:p:", ! ["help", "null", "server=", "port="]) ! except getopt.Error: usage(globals(), locals()) sys.exit(1) --- 93,103 ---- server = "localhost" port = options.html_ui_port + prob = 1.0 try: ! opts, args = getopt.getopt(argv, "hns:p:r:", ! ["help", "null", "server=", "port=", ! "prob="]) ! 
except getopt.error: usage(globals(), locals()) sys.exit(1) *************** *** 109,112 **** --- 113,122 ---- elif opt in ("-p", "--port"): port = int(arg) + elif opt in ("-r", "--prob"): + n = float(arg) + if n < 0.0 or n > 1.0: + usage(globals(), locals()) + sys.exit(1) + prob = n if args: *************** *** 115,121 **** data = sys.stdin.read() - post_multipart("%s:%d"%(server,port), "/upload", [], - [('file', 'message.dat', data)]) sys.stdout.write(data) if __name__ == "__main__": --- 125,136 ---- data = sys.stdin.read() sys.stdout.write(data) + if random.random() < prob: + try: + post_multipart("%s:%d"%(server,port), "/upload", [], + [('file', 'message.dat', data)]) + except: + print >> sys.stderr, "upload failed" + sys.exit(1) if __name__ == "__main__": From timstone4 at users.sourceforge.net Sat Jan 18 08:31:00 2003 From: timstone4 at users.sourceforge.net (Tim Stone) Date: Sat Jan 18 11:31:04 2003 Subject: [Spambayes-checkins] spambayes/spambayes Corpus.py,1.3,1.4 Message-ID: Update of /cvsroot/spambayes/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv5561 Modified Files: Corpus.py Log Message: Corrected print message with error documented in 669149 Index: Corpus.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/spambayes/Corpus.py,v retrieving revision 1.3 retrieving revision 1.4 diff -C2 -d -r1.3 -r1.4 *** Corpus.py 17 Jan 2003 20:02:37 -0000 1.3 --- Corpus.py 18 Jan 2003 16:30:57 -0000 1.4 *************** *** 265,269 **** if msg.createTimestamp() < time.time() - self.expireBefore: if options.verbose: ! print 'message %s has expired' % (key) self.removeMessage(msg) --- 265,269 ---- if msg.createTimestamp() < time.time() - self.expireBefore: if options.verbose: ! 
print 'message %s has expired' % (msg.key()) self.removeMessage(msg) From nascheme at users.sourceforge.net Sun Jan 19 19:14:34 2003 From: nascheme at users.sourceforge.net (Neil Schemenauer) Date: Sun Jan 19 22:14:37 2003 Subject: [Spambayes-checkins] spambayes/spambayes cdb_classifier.py,NONE,1.1 Message-ID: Update of /cvsroot/spambayes/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv20129/spambayes Added Files: cdb_classifier.py Log Message: Factor out the CdbClassifer from mailsort.py. --- NEW FILE: cdb_classifier.py --- """A classifier that uses a CDB database. A CDB wordinfo database is quite small and fast but is slow to update. It is appropriate if training is done rarely (e.g. monthly or weekly using archived ham and spam). See mailsort.py for an example application that uses this classifier. """ from spambayes import cdb from spambayes.tokenizer import tokenize from spambayes.classifier import Classifier class CdbClassifer(Classifier): def __init__(self, cdbfile=None): Classifier.__init__(self) if cdbfile is not None: self.wordinfo = cdb.Cdb(cdbfile) def probability(self, record): return float(record) def save_wordinfo(self, db_file): items = [] for word, record in self.wordinfo.iteritems(): prob = Classifier.probability(self, record) items.append((word, str(prob))) cdb.cdb_make(db_file, items) From nascheme at users.sourceforge.net Sun Jan 19 19:18:48 2003 From: nascheme at users.sourceforge.net (Neil Schemenauer) Date: Sun Jan 19 22:18:52 2003 Subject: [Spambayes-checkins] spambayes mailsort.py,1.2,1.3 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv21314 Modified Files: mailsort.py Log Message: Moved CdbClassifer into its own module. Drop the DB file argument. Instead, look for it in an RC directory. The bayescustomize.ini is also loaded from the RC directory by default. 
Index: mailsort.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/mailsort.py,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** mailsort.py 14 Jan 2003 05:38:19 -0000 1.2 --- mailsort.py 20 Jan 2003 03:18:45 -0000 1.3 *************** *** 2,17 **** """\ To train: ! %(program)s -t wordprobs.cdb ham.mbox spam.mbox To filter mail (using .forward or .qmail): ! |%(program)s wordprobs.cdb Maildir/ Mail/Spam/ To print the score and top evidence for a message or messages: ! %(program)s -s wordprobs.cdb message [...] """ SPAM_CUTOFF = 0.57 SIZE_LIMIT = 5000000 # messages larger are not analyzed BLOCK_SIZE = 10000 import sys --- 2,21 ---- """\ To train: ! %(program)s -t ham.mbox spam.mbox To filter mail (using .forward or .qmail): ! |%(program)s Maildir/ Mail/Spam/ To print the score and top evidence for a message or messages: ! %(program)s -s message [message ...] """ SPAM_CUTOFF = 0.57 + SIZE_LIMIT = 5000000 # messages larger are not analyzed BLOCK_SIZE = 10000 + RC_DIR = "~/.spambayes" + DB_FILE = RC_DIR + "/wordprobs.cdb" + OPTION_FILE = RC_DIR + "/bayescustomize.ini" import sys *************** *** 23,31 **** import socket import email - from spambayes import mboxutils ! from spambayes import cdb from spambayes.tokenizer import tokenize - from spambayes import classifier --- 27,38 ---- import socket import email ! DB_FILE = os.path.expanduser(DB_FILE) ! if not os.environ['BAYESCUSTOMIZE']: ! os.environ['BAYESCUSTOMIZE'] = os.path.expanduser(OPTION_FILE) ! ! from spambayes import mboxutils ! 
from spambayes.cdb_classifier import CdbClassifer from spambayes.tokenizer import tokenize *************** *** 39,43 **** program = sys.argv[0] # For usage(); referenced by docstring above - def usage(code, msg=''): """Print usage message and sys.exit(code).""" --- 46,49 ---- *************** *** 48,59 **** sys.exit(code) - class CdbClassifer(classifier.Classifier): - def __init__(self, cdbfile): - classifier.Bayes.__init__(self) - self.wordinfo = cdb.Cdb(cdbfile) - - def probability(self, record): - return float(record) - def maketmp(dir): hostname = socket.gethostname() --- 54,57 ---- *************** *** 81,105 **** bayes.learn(tokenize(msg), is_spam) ! def train_messages(db_name, ham_name, spam_name): """Create database using messages.""" ! bayes = classifier.Classifier() print 'Training with ham...' train(bayes, ham_name, False) print 'Training with spam...' train(bayes, spam_name, True) ! print 'Updating probabilities...' ! items = [] ! for word, record in bayes.wordinfo.iteritems(): ! prob = bayes.probability(record) ! #print `word`, prob ! items.append((word, str(prob))) ! print 'Writing DB...' ! db = open(db_name, "wb") ! cdb.cdb_make(db, items) db.close() print 'done' ! def filter_message(db_name, hamdir, spamdir): signal.signal(signal.SIGALRM, lambda s: sys.exit(1)) signal.alarm(24 * 60 * 60) --- 79,101 ---- bayes.learn(tokenize(msg), is_spam) ! def train_messages(ham_name, spam_name): """Create database using messages.""" ! rc_dir = os.path.expanduser(RC_DIR) ! if not os.path.exists(rc_dir): ! print "Creating", RC_DIR, "directory..." ! os.mkdir(rc_dir) ! bayes = CdbClassifer() print 'Training with ham...' train(bayes, ham_name, False) print 'Training with spam...' train(bayes, spam_name, True) ! print 'Update probabilities and writing DB...' ! db = open(DB_FILE, "wb") ! bayes.save_wordinfo(db) db.close() print 'done' ! 
def filter_message(hamdir, spamdir): signal.signal(signal.SIGALRM, lambda s: sys.exit(1)) signal.alarm(24 * 60 * 60) *************** *** 126,130 **** msg = email.message_from_string(msgdata) del msgdata ! bayes = CdbClassifer(open(db_name, 'rb')) prob = bayes.spamprob(tokenize(msg)) else: --- 122,126 ---- msg = email.message_from_string(msgdata) del msgdata ! bayes = CdbClassifer(open(DB_FILE, 'rb')) prob = bayes.spamprob(tokenize(msg)) else: *************** *** 139,145 **** raise ! def print_message_score(db_name, msg_name): msg = email.message_from_file(open(msg_name)) ! bayes = CdbClassifer(open(db_name, 'rb')) prob, evidence = bayes.spamprob(tokenize(msg), evidence=True) print msg_name, prob --- 135,141 ---- raise ! def print_message_score(msg_name): msg = email.message_from_file(open(msg_name)) ! bayes = CdbClassifer(open(DB_FILE, 'rb')) prob, evidence = bayes.spamprob(tokenize(msg), evidence=True) print msg_name, prob *************** *** 157,171 **** if not opts: ! if len(args) != 3: usage(2, 'wrong number of arguments') ! filter_message(args[0], args[1], args[2]) elif opts[0][0] == '-t': ! if len(args) != 3: usage(2, 'wrong number of arguments') ! train_messages(args[0], args[1], args[2]) elif opts[0][0] == '-s': ! db = args[0] ! for msg in args[1:]: ! print_message_score(db, msg) else: raise RuntimeError # shouldn't get here --- 153,166 ---- if not opts: ! if len(args) != 2: usage(2, 'wrong number of arguments') ! filter_message(args[0], args[1]) elif opts[0][0] == '-t': ! if len(args) != 2: usage(2, 'wrong number of arguments') ! train_messages(args[0], args[1]) elif opts[0][0] == '-s': ! for msg in args: ! 
print_message_score(msg) else: raise RuntimeError # shouldn't get here From nascheme at users.sourceforge.net Sun Jan 19 19:39:55 2003 From: nascheme at users.sourceforge.net (Neil Schemenauer) Date: Sun Jan 19 22:39:58 2003 Subject: [Spambayes-checkins] spambayes mailsort.py,1.3,1.4 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv26999 Modified Files: mailsort.py Log Message: Argh, need to use has_key(). Index: mailsort.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/mailsort.py,v retrieving revision 1.3 retrieving revision 1.4 diff -C2 -d -r1.3 -r1.4 *** mailsort.py 20 Jan 2003 03:18:45 -0000 1.3 --- mailsort.py 20 Jan 2003 03:39:52 -0000 1.4 *************** *** 29,33 **** DB_FILE = os.path.expanduser(DB_FILE) ! if not os.environ['BAYESCUSTOMIZE']: os.environ['BAYESCUSTOMIZE'] = os.path.expanduser(OPTION_FILE) --- 29,33 ---- DB_FILE = os.path.expanduser(DB_FILE) ! if not os.environ.has_key('BAYESCUSTOMIZE'): os.environ['BAYESCUSTOMIZE'] = os.path.expanduser(OPTION_FILE) From montanaro at users.sourceforge.net Mon Jan 20 07:05:49 2003 From: montanaro at users.sourceforge.net (Skip Montanaro) Date: Mon Jan 20 10:05:52 2003 Subject: [Spambayes-checkins] spambayes/pspam/pspam options.py,1.3,1.4 Message-ID: Update of /cvsroot/spambayes/spambayes/pspam/pspam In directory sc8-pr-cvs1:/tmp/cvs-serv24475 Modified Files: options.py Log Message: fix reference to Options module Index: options.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/pspam/pspam/options.py,v retrieving revision 1.3 retrieving revision 1.4 diff -C2 -d -r1.3 -r1.4 *** options.py 14 Jan 2003 05:38:20 -0000 1.3 --- options.py 20 Jan 2003 15:05:45 -0000 1.4 *************** *** 1,3 **** ! from Options import options, all_options, \ boolean_cracker, float_cracker, int_cracker, string_cracker try: --- 1,3 ---- ! 
from spambayes.Options import options, all_options, \ boolean_cracker, float_cracker, int_cracker, string_cracker try: From richiehindle at users.sourceforge.net Mon Jan 20 12:23:35 2003 From: richiehindle at users.sourceforge.net (Richie Hindle) Date: Mon Jan 20 15:23:39 2003 Subject: [Spambayes-checkins] spambayes pop3proxy.py,1.36,1.37 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv1839 Modified Files: pop3proxy.py Log Message: Prevent the POP3 proxy from picking up bogus files in the corpuses. Thanks to François Granger. Index: pop3proxy.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/pop3proxy.py,v retrieving revision 1.36 retrieving revision 1.37 diff -C2 -d -r1.36 -r1.37 *** pop3proxy.py 17 Jan 2003 20:21:04 -0000 1.36 --- pop3proxy.py 20 Jan 2003 20:23:30 -0000 1.37 *************** *** 1139,1149 **** self.spamCorpus = ExpiryFileCorpus(age, factory, options.pop3proxy_spam_cache, ! cacheSize=20) self.hamCorpus = ExpiryFileCorpus(age, factory, options.pop3proxy_ham_cache, ! cacheSize=20) self.unknownCorpus = FileCorpus(factory, options.pop3proxy_unknown_cache, ! cacheSize=20) # Expire old messages from the trained corpuses. --- 1139,1149 ---- self.spamCorpus = ExpiryFileCorpus(age, factory, options.pop3proxy_spam_cache, ! '[0123456789]*', cacheSize=20) self.hamCorpus = ExpiryFileCorpus(age, factory, options.pop3proxy_ham_cache, ! '[0123456789]*', cacheSize=20) self.unknownCorpus = FileCorpus(factory, options.pop3proxy_unknown_cache, ! '[0123456789]*', cacheSize=20) # Expire old messages from the trained corpuses. 
From richiehindle at users.sourceforge.net Mon Jan 20 12:24:57 2003 From: richiehindle at users.sourceforge.net (Richie Hindle) Date: Mon Jan 20 15:25:00 2003 Subject: [Spambayes-checkins] spambayes/pspam pop.py,1.4,1.5 Message-ID: Update of /cvsroot/spambayes/spambayes/pspam In directory sc8-pr-cvs1:/tmp/cvs-serv2840 Modified Files: pop.py Log Message: Corrected the spelling of my name. 8-) Index: pop.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/pspam/pop.py,v retrieving revision 1.4 retrieving revision 1.5 diff -C2 -d -r1.4 -r1.5 *** pop.py 15 Jan 2003 22:45:11 -0000 1.4 --- pop.py 20 Jan 2003 20:24:54 -0000 1.5 *************** *** 6,10 **** classifier loaded from a ZEO server. ! The strategy for adding spam headers is from Richie Hindler's pop3proxy.py. The STAT, LIST, RETR, and TOP commands are intercepted to change the number of bytes the client is told to expect and/or to --- 6,10 ---- classifier loaded from a ZEO server. ! The strategy for adding spam headers is from Richie Hindle's pop3proxy.py. The STAT, LIST, RETR, and TOP commands are intercepted to change the number of bytes the client is told to expect and/or to From richiehindle at users.sourceforge.net Mon Jan 20 12:32:55 2003 From: richiehindle at users.sourceforge.net (Richie Hindle) Date: Mon Jan 20 15:33:02 2003 Subject: [Spambayes-checkins] website index.ht,1.10,1.11 Message-ID: Update of /cvsroot/spambayes/website In directory sc8-pr-cvs1:/tmp/cvs-serv8401 Modified Files: index.ht Log Message: Corrected the spelling of my name. Becoming a habit. 8-) Index: index.ht =================================================================== RCS file: /cvsroot/spambayes/website/index.ht,v retrieving revision 1.10 retrieving revision 1.11 diff -C2 -d -r1.10 -r1.11 *** index.ht 17 Jan 2003 11:53:39 -0000 1.10 --- index.ht 20 Jan 2003 20:32:52 -0000 1.11 *************** *** 120,124 **** also contributed maths/stats clues. 
Mark Hammond amazed the world with the Outlook2000 plug-in, ! and Rich Hindle, Neale Pickett, Tim Stone worked on the end-user applications.

    (Thanks also to Rachel Holkner for turning Anthony's gibberish into something closer to actual English, although all mistakes are Anthony's.)

    --- 120,124 ---- also contributed maths/stats clues. Mark Hammond amazed the world with the Outlook2000 plug-in, ! and Richie Hindle, Neale Pickett, Tim Stone worked on the end-user applications.

    (Thanks also to Rachel Holkner for turning Anthony's gibberish into something closer to actual English, although all mistakes are Anthony's.)

    From montanaro at users.sourceforge.net Mon Jan 20 12:46:20 2003 From: montanaro at users.sourceforge.net (Skip Montanaro) Date: Mon Jan 20 15:50:09 2003 Subject: [Spambayes-checkins] spambayes proxytee.py,1.2,1.3 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv16928 Modified Files: proxytee.py Log Message: Don't treat it as an error if the proxy server isn't available. Many people (myself included) still run versions of procmail which don't properly recover from filter errors and wind up generating From_ lines missing the "F". Index: proxytee.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/proxytee.py,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** proxytee.py 18 Jan 2003 14:51:52 -0000 1.2 --- proxytee.py 20 Jan 2003 20:46:18 -0000 1.3 *************** *** 131,136 **** [('file', 'message.dat', data)]) except: ! print >> sys.stderr, "upload failed" ! sys.exit(1) if __name__ == "__main__": --- 131,136 ---- [('file', 'message.dat', data)]) except: ! # not an error if the server isn't responding ! pass if __name__ == "__main__": From richiehindle at users.sourceforge.net Mon Jan 20 15:01:14 2003 From: richiehindle at users.sourceforge.net (Richie Hindle) Date: Mon Jan 20 18:01:18 2003 Subject: [Spambayes-checkins] spambayes/spambayes PyMeldLite.py,1.1,1.2 Message-ID: Update of /cvsroot/spambayes/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv23783 Modified Files: PyMeldLite.py Log Message: Made this work under 2.3a1. 
Index: PyMeldLite.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/spambayes/PyMeldLite.py,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** PyMeldLite.py 17 Jan 2003 20:21:11 -0000 1.1 --- PyMeldLite.py 20 Jan 2003 23:01:10 -0000 1.2 *************** *** 616,620 **** return self._tree.getElementNode().childrenToText() if name.startswith('_'): ! return self.__dict__[name] node = self._findByID(self._tree, name) if node: --- 616,623 ---- return self._tree.getElementNode().childrenToText() if name.startswith('_'): ! try: ! return self.__dict__[name] ! except KeyError: ! raise AttributeError, name node = self._findByID(self._tree, name) if node: *************** *** 671,676 **** return if name.startswith('_'): ! del self.__dict__[name] ! return if self._readonly: raise ReadOnlyError, READ_ONLY_MESSAGE --- 674,682 ---- return if name.startswith('_'): ! try: ! del self.__dict__[name] ! return ! except KeyError: ! raise AttributeError, name if self._readonly: raise ReadOnlyError, READ_ONLY_MESSAGE *************** *** 1069,1073 **** Traceback (most recent call last): ... ! KeyError: _private >>> print page x --- 1075,1083 ---- Traceback (most recent call last): ... ! AttributeError: _private ! >>> del page._private ! Traceback (most recent call last): ! ... ! AttributeError: _private >>> print page x From richiehindle at users.sourceforge.net Mon Jan 20 15:22:06 2003 From: richiehindle at users.sourceforge.net (Richie Hindle) Date: Mon Jan 20 18:22:09 2003 Subject: [Spambayes-checkins] spambayes/spambayes PyMeldLite.py,1.2,1.3 Message-ID: Update of /cvsroot/spambayes/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv1058 Modified Files: PyMeldLite.py Log Message: Suppressed the "xmllib is obsolete" warning (for now). 
Index: PyMeldLite.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/spambayes/PyMeldLite.py,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** PyMeldLite.py 20 Jan 2003 23:01:10 -0000 1.2 --- PyMeldLite.py 20 Jan 2003 23:22:02 -0000 1.3 *************** *** 194,200 **** import re, xmllib - # Entrian.Coverage: Pragma Stop try: True, False, bool --- 194,209 ---- + # Entrian.Coverage: Pragma Stop + try: + # XXX Take this seriously before 2.4 comes out... + import warnings + warnings.filterwarnings(action='ignore', + message='.*xmllib', + category=DeprecationWarning) + except ImportError: + pass + import re, xmllib try: True, False, bool From npickett at users.sourceforge.net Mon Jan 20 16:17:59 2003 From: npickett at users.sourceforge.net (Neale Pickett) Date: Mon Jan 20 19:18:02 2003 Subject: [Spambayes-checkins] spambayes hammiefilter.py,1.8,1.9 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv24007 Modified Files: hammiefilter.py Log Message: * hammiefilter now supports -D and -d options like hammiebulk Index: hammiefilter.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/hammiefilter.py,v retrieving revision 1.8 retrieving revision 1.9 diff -C2 -d -r1.8 -r1.9 *** hammiefilter.py 15 Jan 2003 22:41:15 -0000 1.8 --- hammiefilter.py 21 Jan 2003 00:17:57 -0000 1.9 *************** *** 35,38 **** --- 35,42 ---- -s train on stdin as a bad (spam) message + -d DBFILE + use database in DBFILE + -D PICKLEFILE + use pickle (instead of database) in PICKLEFILE -G untrain ham on stdin -- only use if you've already trained this *************** *** 56,59 **** --- 60,68 ---- # Options options = Options.options + options.mergefiles(['/etc/hammierc', + os.path.expanduser('~/.hammierc')]) + DBNAME = options.hammiefilter_persistent_storage_file + DBNAME = os.path.expanduser(DBNAME) + USEDB = 
options.hammiefilter_persistent_use_database def usage(code, msg=''): *************** *** 68,78 **** def __init__(self): options = Options.options ! options.mergefiles(['/etc/hammierc', ! os.path.expanduser('~/.hammierc')]) ! ! self.dbname = options.hammiefilter_persistent_storage_file ! self.dbname = os.path.expanduser(self.dbname) ! self.usedb = options.hammiefilter_persistent_use_database ! def newdb(self): --- 77,82 ---- def __init__(self): options = Options.options ! self.dbname = DBNAME ! self.usedb = USEDB def newdb(self): *************** *** 111,120 **** def main(): h = HammieFilter() action = h.filter ! opts, args = getopt.getopt(sys.argv[1:], 'hngsGS', ['help']) for opt, arg in opts: if opt in ('-h', '--help'): usage(0) elif opt == '-g': action = h.train_ham --- 115,132 ---- def main(): + global DBNAME, USEDB + h = HammieFilter() action = h.filter ! opts, args = getopt.getopt(sys.argv[1:], 'hngsGSd:D:', ['help']) for opt, arg in opts: if opt in ('-h', '--help'): usage(0) + elif opt == '-d': + USEDB = True + DBNAME = arg + elif opt == '-D': + USEDB = False + DBNAME = arg elif opt == '-g': action = h.train_ham From npickett at users.sourceforge.net Tue Jan 21 06:51:01 2003 From: npickett at users.sourceforge.net (Neale Pickett) Date: Tue Jan 21 09:51:12 2003 Subject: [Spambayes-checkins] spambayes hammiefilter.py,1.9,1.10 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv6690 Modified Files: hammiefilter.py Log Message: * hammiefilter now has -t option for filter/train step * Options has new hammie_train_on_filter and hammie_trained_header options * hammie.py:Hammie.filter has new train kwarg to support filter/train in one step. 
Index: hammiefilter.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/hammiefilter.py,v retrieving revision 1.9 retrieving revision 1.10 diff -C2 -d -r1.9 -r1.10 *** hammiefilter.py 21 Jan 2003 00:17:57 -0000 1.9 --- hammiefilter.py 21 Jan 2003 14:50:25 -0000 1.10 *************** *** 15,19 **** ## ! """Usage: %(program)s [OPTION] A hammie front-end to make the simple stuff simple. The intent is to call --- 15,19 ---- ## ! """Usage: %(program)s [OPTION]... A hammie front-end to make the simple stuff simple. The intent is to call *************** *** 26,51 **** calling it with either the -g or -s options, respectively. ! Where [OPTION] is one of: -h show usage and exit - -n - create a new database - -g - train on stdin as a good (ham) message - -s - train on stdin as a bad (spam) message -d DBFILE use database in DBFILE -D PICKLEFILE use pickle (instead of database) in PICKLEFILE -G ! untrain ham on stdin -- only use if you've already trained this ! message! -S ! untrain spam on stdin -- only use if you've already trained this ! message! - If neither -g nor -s is given, stdin will be scored: the same message, - with a new header containing the score, will be send to stdout. """ --- 26,54 ---- calling it with either the -g or -s options, respectively. ! [OPTION] is one of: -h show usage and exit -d DBFILE use database in DBFILE -D PICKLEFILE use pickle (instead of database) in PICKLEFILE + -n + create a new database + -g + train as a good (ham) message + -s + train as a bad (spam) message + -t + filter and train based on the result (you must make sure to + untrain all mistakes later) -G ! untrain ham (only use if you've already trained this message) -S ! untrain spam (only use if you've already trained this message) ! ! All processing options operate on stdin. If no processing options are ! given, stdin will be scored: the same message, with a new header ! containing the score, will be send to stdout. 
""" *************** *** 53,69 **** import sys import getopt ! from spambayes import hammie, Options # See Options.py for explanations of these properties program = sys.argv[0] - # Options - options = Options.options - options.mergefiles(['/etc/hammierc', - os.path.expanduser('~/.hammierc')]) - DBNAME = options.hammiefilter_persistent_storage_file - DBNAME = os.path.expanduser(DBNAME) - USEDB = options.hammiefilter_persistent_use_database - def usage(code, msg=''): """Print usage message and sys.exit(code).""" --- 56,64 ---- import sys import getopt ! from spambayes import hammie, Options, mboxutils # See Options.py for explanations of these properties program = sys.argv[0] def usage(code, msg=''): """Print usage message and sys.exit(code).""" *************** *** 77,82 **** def __init__(self): options = Options.options ! self.dbname = DBNAME ! self.usedb = USEDB def newdb(self): --- 72,80 ---- def __init__(self): options = Options.options ! options.mergefiles(['/etc/hammierc', ! os.path.expanduser('~/.hammierc')]) ! self.dbname = options.hammiefilter_persistent_storage_file ! self.dbname = os.path.expanduser(self.dbname) ! self.usedb = options.hammiefilter_persistent_use_database def newdb(self): *************** *** 85,144 **** print "Created new database in", self.dbname ! def filter(self): h = hammie.open(self.dbname, self.usedb, 'r') - msg = sys.stdin.read() print h.filter(msg) ! def train_ham(self): h = hammie.open(self.dbname, self.usedb, 'c') - msg = sys.stdin.read() h.train_ham(msg) h.store() ! def train_spam(self): h = hammie.open(self.dbname, self.usedb, 'c') - msg = sys.stdin.read() h.train_spam(msg) h.store() ! def untrain_ham(self): h = hammie.open(self.dbname, self.usedb, 'c') - msg = sys.stdin.read() h.untrain_ham(msg) h.store() ! def untrain_spam(self): h = hammie.open(self.dbname, self.usedb, 'c') - msg = sys.stdin.read() h.untrain_spam(msg) h.store() def main(): - global DBNAME, USEDB - h = HammieFilter() ! action = h.filter ! 
opts, args = getopt.getopt(sys.argv[1:], 'hngsGSd:D:', ['help']) for opt, arg in opts: if opt in ('-h', '--help'): usage(0) elif opt == '-d': ! USEDB = True ! DBNAME = arg elif opt == '-D': ! USEDB = False ! DBNAME = arg elif opt == '-g': ! action = h.train_ham elif opt == '-s': ! action = h.train_spam elif opt == '-G': ! action = h.untrain_ham elif opt == '-S': ! action = h.untrain_spam elif opt == "-n": ! action = h.newdb - action() if __name__ == "__main__": --- 83,148 ---- print "Created new database in", self.dbname ! def filter(self, msg): h = hammie.open(self.dbname, self.usedb, 'r') print h.filter(msg) ! def filter_train(self, msg): ! h = hammie.open(self.dbname, self.usedb, 'c') ! print h.filter(msg, train=True) ! ! def train_ham(self, msg): h = hammie.open(self.dbname, self.usedb, 'c') h.train_ham(msg) h.store() ! def train_spam(self, msg): h = hammie.open(self.dbname, self.usedb, 'c') h.train_spam(msg) h.store() ! def untrain_ham(self, msg): h = hammie.open(self.dbname, self.usedb, 'c') h.untrain_ham(msg) h.store() ! def untrain_spam(self, msg): h = hammie.open(self.dbname, self.usedb, 'c') h.untrain_spam(msg) h.store() def main(): h = HammieFilter() ! actions = [] ! opts, args = getopt.getopt(sys.argv[1:], 'hd:D:ngstGS', ['help']) for opt, arg in opts: if opt in ('-h', '--help'): usage(0) elif opt == '-d': ! h.usedb = True ! h.dbname = arg elif opt == '-D': ! h.usedb = False ! h.dbname = arg elif opt == '-g': ! actions.append(h.train_ham) elif opt == '-s': ! actions.append(h.train_spam) ! elif opt == '-t': ! actions.append(h.filter_train) elif opt == '-G': ! actions.append(h.untrain_ham) elif opt == '-S': ! actions.append(h.untrain_spam) elif opt == "-n": ! h.newdb() ! sys.exit(0) ! ! if actions == []: ! actions = [h.filter] ! ! msg = mboxutils.get_message(sys.stdin) ! for action in actions: ! 
action(msg) if __name__ == "__main__": From npickett at users.sourceforge.net Tue Jan 21 06:51:05 2003 From: npickett at users.sourceforge.net (Neale Pickett) Date: Tue Jan 21 09:51:17 2003 Subject: [Spambayes-checkins] spambayes/spambayes Options.py,1.6,1.7 hammie.py,1.2,1.3 Message-ID: Update of /cvsroot/spambayes/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv6690/spambayes Modified Files: Options.py hammie.py Log Message: * hammiefilter now has -t option for filter/train step * Options has new hammie_train_on_filter and hammie_trained_header options * hammie.py:Hammie.filter has new train kwarg to support filter/train in one step. Index: Options.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/spambayes/Options.py,v retrieving revision 1.6 retrieving revision 1.7 diff -C2 -d -r1.6 -r1.7 *** Options.py 17 Jan 2003 22:23:59 -0000 1.6 --- Options.py 21 Jan 2003 14:50:26 -0000 1.7 *************** *** 343,347 **** # Name of a debugging header for spambayes hackers, showing the strongest # clues that have resulted in the classification in the standard header. ! hammie_debug_header_name: X-Hammie-Debug # The range of clues that are added to the "debug" header in the E-mail --- 343,358 ---- # Name of a debugging header for spambayes hackers, showing the strongest # clues that have resulted in the classification in the standard header. ! hammie_debug_header_name: X-Spambayes-Debug ! ! # Train when filtering? After filtering a message, hammie can then ! # train itself on the judgement (ham or spam). This can speed things up ! # with a procmail-based solution. If you do enable this, please make ! # sure to retrain any mistakes. Otherwise, your word database will ! # slowly become useless. ! hammie_train_on_filter: False ! ! # When training on a message, the name of the header to add with how it ! # was trained ! 
hammie_trained_header: X-Spambayes-Trained # The range of clues that are added to the "debug" header in the E-mail *************** *** 463,466 **** --- 474,479 ---- 'hammie_debug_header': boolean_cracker, 'hammie_debug_header_name': string_cracker, + 'hammie_train_on_filter': boolean_cracker, + 'hammie_trained_header': string_cracker, }, 'hammiefilter' : {'hammiefilter_persistent_use_database': boolean_cracker, Index: hammie.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/spambayes/hammie.py,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** hammie.py 14 Jan 2003 05:38:20 -0000 1.2 --- hammie.py 21 Jan 2003 14:50:27 -0000 1.3 *************** *** 62,66 **** def filter(self, msg, header=None, spam_cutoff=None, ham_cutoff=None, debugheader=None, ! debug=None): """Score (judge) a message and add a disposition header. --- 62,66 ---- def filter(self, msg, header=None, spam_cutoff=None, ham_cutoff=None, debugheader=None, ! debug=None, train=None): """Score (judge) a message and add a disposition header. *************** *** 74,77 **** --- 74,82 ---- The name of the debugging header is given as 'debugheader'. + If 'train' is True, also train on the result of scoring the + message (ie. train as ham if it's ham, train as spam if it's + spam). You'll want to be very dilligent about retraining + mistakes if you use this. + All defaults for optional parameters come from the Options file. *************** *** 90,93 **** --- 95,100 ---- if debug == None: debug = options.hammie_debug_header + if train == None: + train = options.hammie_train_on_filter msg = mboxutils.get_message(msg) *************** *** 98,106 **** prob, clues = self._scoremsg(msg, True) if prob < ham_cutoff: ! disp = options.header_ham_string elif prob > spam_cutoff: ! 
disp = options.header_spam_string else: disp = options.header_unsure_string disp += ("; %."+str(options.header_score_digits)+"f") % prob if options.header_score_logarithm: --- 105,122 ---- prob, clues = self._scoremsg(msg, True) if prob < ham_cutoff: ! is_spam = False ! trained = options.header_ham_string ! disp = trained elif prob > spam_cutoff: ! is_spam = True ! trained = options.header_spam_string ! disp = trained else: + is_spam = False + trained = options.header_ham_string disp = options.header_unsure_string + if train: + self.train(msg, is_spam) + msg.add_header(options.hammie_trained_header, trained) disp += ("; %."+str(options.header_score_digits)+"f") % prob if options.header_score_logarithm: From richiehindle at users.sourceforge.net Tue Jan 21 10:23:45 2003 From: richiehindle at users.sourceforge.net (Richie Hindle) Date: Tue Jan 21 13:23:49 2003 Subject: [Spambayes-checkins] spambayes/spambayes Dibbler.py,1.1,1.2 Message-ID: Update of /cvsroot/spambayes/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv19715 Modified Files: Dibbler.py Log Message: You can now specify the local address to listen on, as well as the port. Thanks to Tony Lownds for the patch. Index: Dibbler.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/spambayes/Dibbler.py,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** Dibbler.py 17 Jan 2003 20:21:07 -0000 1.1 --- Dibbler.py 21 Jan 2003 18:23:43 -0000 1.2 *************** *** 235,239 **** connections when Dibbler.run is called: ! o port: The TCP/IP port to listen on o factory: The function to call to create a handler (can be a class --- 235,242 ---- connections when Dibbler.run is called: ! o port: The TCP/IP (address, port) to listen on. Usually '' - ! meaning bind to all IP addresses that the machine has - will be ! passed as the address. If `port` is just an int, an address of ! '' will be assumed. 
o factory: The function to call to create a handler (can be a class *************** *** 260,264 **** self.set_socket(s, self.socketMap) self.set_reuse_addr() ! self.bind(('', port)) self.listen(5) --- 263,269 ---- self.set_socket(s, self.socketMap) self.set_reuse_addr() ! if type(port) != type(()): ! port = ('', port) ! self.bind(port) self.listen(5) *************** *** 280,294 **** your content - see `HTTPPlugin` for detailed documentation and examples. ! `port` specifies the TCP/IP port on which to run, defaulting to port 80. `context` optionally specifies a `Dibbler.Context` for the server. """ ! def __init__(self, port=80, context=_defaultContext): """Create an `HTTPServer` for the given port.""" Listener.__init__(self, port, _HTTPHandler, (self, context), context._map) self._plugins = [] ! context._HTTPPort = port def register(self, *plugins): --- 285,303 ---- your content - see `HTTPPlugin` for detailed documentation and examples. ! `port` specifies the TCP/IP (address, port) on which to run, defaulting ! to ('', 80). `context` optionally specifies a `Dibbler.Context` for the server. """ ! def __init__(self, port=('', 80), context=_defaultContext): """Create an `HTTPServer` for the given port.""" Listener.__init__(self, port, _HTTPHandler, (self, context), context._map) self._plugins = [] ! try: ! context._HTTPPort = port[1] ! except TypeError: ! context._HTTPPort = port def register(self, *plugins): *************** *** 336,340 **** method, url, version = requestLine.strip().split() except ValueError: ! self.pushError(400, "Malformed request: '%s'" % requestLine) self.close_when_done() return --- 345,349 ---- method, url, version = requestLine.strip().split() except ValueError: ! 
self.writeError(400, "Malformed request: '%s'" % requestLine) self.close_when_done() return From richiehindle at users.sourceforge.net Tue Jan 21 10:25:18 2003 From: richiehindle at users.sourceforge.net (Richie Hindle) Date: Tue Jan 21 13:28:57 2003 Subject: [Spambayes-checkins] spambayes pop3proxy.py,1.37,1.38 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv20431 Modified Files: pop3proxy.py Log Message: You can now specify the local address to listen on, as well as the port. Thanks to Tony Lownds for the patch. I now use a neater mechanism for loading the HTML resources. Thanks to Mike Fletcher for the code. Index: pop3proxy.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/pop3proxy.py,v retrieving revision 1.37 retrieving revision 1.38 diff -C2 -d -r1.37 -r1.38 *** pop3proxy.py 20 Jan 2003 20:23:30 -0000 1.37 --- pop3proxy.py 21 Jan 2003 18:25:15 -0000 1.38 *************** *** 345,350 **** proxyArgs = (serverName, serverPort) Dibbler.Listener.__init__(self, proxyPort, BayesProxy, proxyArgs) ! print 'Listener on port %d is proxying %s:%d' % \ ! (proxyPort, serverName, serverPort) --- 345,350 ---- proxyArgs = (serverName, serverPort) Dibbler.Listener.__init__(self, proxyPort, BayesProxy, proxyArgs) ! print 'Listener on port %s is proxying %s:%d' % \ ! (_addressPortStr(proxyPort), serverName, serverPort) *************** *** 536,542 **** from spambayes.resources import ui_html images = {} ! for imageName in IMAGES: ! exec "from spambayes.resources import %s_gif" % imageName ! exec "images[imageName] = %s_gif.data" % imageName return ui_html.data, images --- 536,543 ---- from spambayes.resources import ui_html images = {} ! for baseName in IMAGES: ! moduleName = '%s.%s_gif' % ('spambayes.resources', baseName) ! module = __import__(moduleName, {}, {}, ('spambayes', 'resources')) ! 
images[baseName] = module.data return ui_html.data, images *************** *** 1067,1071 **** if options.pop3proxy_ports: splitPorts = options.pop3proxy_ports.split(',') ! self.proxyPorts = map(int, map(string.strip, splitPorts)) if len(self.servers) != len(self.proxyPorts): --- 1068,1072 ---- if options.pop3proxy_ports: splitPorts = options.pop3proxy_ports.split(',') ! self.proxyPorts = map(_addressAndPort, splitPorts) if len(self.servers) != len(self.proxyPorts): *************** *** 1098,1102 **** serverStrings = ["%s:%s" % (s, p) for s, p in self.servers] self.serversString = ', '.join(serverStrings) ! self.proxyPortsString = ', '.join(map(str, self.proxyPorts)) def createWorkers(self): --- 1099,1103 ---- serverStrings = ["%s:%s" % (s, p) for s, p in self.servers] self.serversString = ', '.join(serverStrings) ! self.proxyPortsString = ', '.join(map(_addressPortStr, self.proxyPorts)) def createWorkers(self): *************** *** 1157,1160 **** --- 1158,1180 ---- self.hamCorpus.addObserver(self.hamTrainer) + + # Option-parsing helper functions + def _addressAndPort(s): + """Decode a string representing a port to bind to, with optional address.""" + s = s.strip() + if ':' in s: + addr, port = s.split(':') + return addr, int(port) + else: + return '', int(s) + + def _addressPortStr((addr, port)): + """Encode a string representing a port to bind to, with optional address.""" + if not addr: + return str(port) + else: + return '%s:%d' % (addr, port) + + state = State() proxyListeners = [] *************** *** 1431,1435 **** options.pop3proxy_persistent_storage_file = arg elif opt == '-l': ! state.proxyPorts = [int(arg)] elif opt == '-u': state.uiPort = int(arg) --- 1451,1455 ---- options.pop3proxy_persistent_storage_file = arg elif opt == '-l': ! 
state.proxyPorts = [_addressAndPort(arg)] elif opt == '-u': state.uiPort = int(arg) From montanaro at users.sourceforge.net Tue Jan 21 13:19:12 2003 From: montanaro at users.sourceforge.net (Skip Montanaro) Date: Tue Jan 21 16:19:15 2003 Subject: [Spambayes-checkins] spambayes/utilities loosecksum.py,1.2,1.3 Message-ID: Update of /cvsroot/spambayes/spambayes/utilities In directory sc8-pr-cvs1:/tmp/cvs-serv13727 Modified Files: loosecksum.py Log Message: Reworked to more-or-less follow the ideas Justin Mason laid out in http://mail.python.org/pipermail/spambayes/2003-January/002912.html . In particular, splitting it into four pieces which can be compared to saved values in isolation or in combination seems like it should help. Index: loosecksum.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/utilities/loosecksum.py,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** loosecksum.py 17 Jan 2003 06:42:53 -0000 1.2 --- loosecksum.py 21 Jan 2003 21:19:09 -0000 1.3 *************** *** 33,74 **** import binascii - def zaptags(data, *tags): - """delete all tags (and /tags) from input data given as arguments""" - for pat in tags: - pat = pat.split(":") - sub = "" - if len(pat) >= 2: - sub = pat[-1] - pat = ":".join(pat[:-1]) - else: - pat = pat[0] - sub = "" - if '\\' in sub: - sub = _zap_esc_map(sub) - try: - data = re.sub(r'(?i)]*)?>'%pat, sub, data) - except TypeError: - print (pat, sub, data) - raise - return data - - def clean(data): - """Clean the obviously variable stuff from a chunk of data. - - The first (and perhaps only) use of this is to try and eliminate bits - of data that keep multiple spam email messages from looking the same. 
- """ - # Get rid of any HTML tags that hold URLs - tend to have varying content - # I suppose i could just get rid of all HTML tags - data = zaptags(data, 'a', 'img', 'base', 'frame') - # delete anything that looks like an email address - data = re.sub(r"(?i)[-a-z0-9_.+]+@[-a-z0-9_.]+\.([a-z]+)", "", data) - # delete anything that looks like a url (catch bare urls) - data = re.sub(r"(?i)(ftp|http|gopher)://[-a-z0-9_/?&%@=+:;#!~|.,$*]+", "", data) - # delete pmguid: stuff (turns up frequently) - data = re.sub(r"pmguid:[^.\s]+(\.[^.\s]+)*", "", data) - # throw away everything other than alpha & digits - return re.sub(r"[^A-Za-z0-9]+", "", data) - def flatten(obj): # I do not know how to use the email package very well - all I want here --- 33,36 ---- *************** *** 85,90 **** def generate_checksum(f): ! body = flatten(email.Parser.Parser().parse(f)) ! return binascii.b2a_hex(md5.new(clean(body)).digest()) def main(args): --- 47,84 ---- def generate_checksum(f): ! data = flatten(email.Parser.Parser().parse(f)) ! ! # modelled after Justin Mason's fuzzy checksummer for SpamAssassin. ! # Message body is cleaned, then broken into lines. The list of lines is ! # then broken into four parts and separate checksums are generated for ! # each part. They are then joined together with '.'. Downstream ! # processes can split those chunks into pieces and consider them ! # separately or in various combinations if desired. ! ! # Get rid of anything which looks like an HTML tag and downcase it all ! data = re.sub(r"<[^>]*>", "", data).lower() ! ! # delete anything which looks like a url or email address ! # not sure what a pmguid: url is but it seems to occur frequently in spam ! words = [w for w in data.split(' ') ! if ('@' not in w and ! (':' not in w or ! w[:4] != "ftp:" and ! w[:7] != "mailto:" and ! w[:5] != "http:" and ! w[:7] != "gopher:" and ! w[:8] != "pmguid:"))] ! ! # delete lines which contain white space ! 
lines = [line for line in " ".join(words).split('\n') if ' ' in line] ! ! # +1 guarantees we don't miss lines at the end ! chunksize = len(lines)//4+1 ! sum = [] ! for i in range(4): ! chunk = "\n".join(lines[i*chunksize:(i+1)*chunksize]) ! sum.append(binascii.b2a_hex(md5.new(chunk).digest())) ! ! return ".".join(sum) def main(args): From npickett at users.sourceforge.net Tue Jan 21 21:23:19 2003 From: npickett at users.sourceforge.net (Neale Pickett) Date: Wed Jan 22 00:23:24 2003 Subject: [Spambayes-checkins] spambayes hammiefilter.py,1.10,1.11 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv13378 Modified Files: hammiefilter.py Log Message: * Fix function name in hammie.py * Expound upon docstrings in hammie.py * Options.py will now look for bayescustomize.ini and ~/.spambayesrc. Hopefully some non-Unix folks will update this with sensible defaults for their platforms. * hammiefilter has a ton of new options -- check the docstring Index: hammiefilter.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/hammiefilter.py,v retrieving revision 1.10 retrieving revision 1.11 diff -C2 -d -r1.10 -r1.11 *** hammiefilter.py 21 Jan 2003 14:50:25 -0000 1.10 --- hammiefilter.py 22 Jan 2003 05:23:17 -0000 1.11 *************** *** 17,32 **** """Usage: %(program)s [OPTION]... - A hammie front-end to make the simple stuff simple. The intent is to call - this from procmail and its ilk like so: - - :0 fw - | hammiefilter.py - - Then, you can set up your MUA to pipe ham and spam to it, one at a time, by - calling it with either the -g or -s options, respectively. - [OPTION] is one of: -h show usage and exit -d DBFILE use database in DBFILE --- 17,25 ---- """Usage: %(program)s [OPTION]... [OPTION] is one of: -h show usage and exit + -x + show some usage examples and exit -d DBFILE use database in DBFILE *************** *** 35,53 **** -n create a new database ! -g ! 
train as a good (ham) message ! -s ! train as a bad (spam) message ! -t ! filter and train based on the result (you must make sure to ! untrain all mistakes later) ! -G ! untrain ham (only use if you've already trained this message) ! -S ! untrain spam (only use if you've already trained this message) ! All processing options operate on stdin. If no processing options are ! given, stdin will be scored: the same message, with a new header ! containing the score, will be send to stdout. """ --- 28,50 ---- -n create a new database ! * -f ! filter (default if no processing options are given) ! * -t ! [EXPERIMENTAL] filter and train based on the result (you must ! make sure to untrain all mistakes later) ! * -g ! [EXPERIMENTAL] (re)train as a good (ham) message ! * -s ! [EXPERIMENTAL] (re)train as a bad (spam) message ! * -G ! [EXPERIMENTAL] untrain ham (only use if you've already trained ! this message) ! * -S ! [EXPERIMENTAL] untrain spam (only use if you've already trained ! this message) ! All processing options (marked with *) operate on stdin. If no ! processing options are given, stdin will be scored: the same message, ! with a new header containing the score, will be send to stdout. """ *************** *** 61,64 **** --- 58,89 ---- program = sys.argv[0] + example_doc = """_Examples_ + + filter a message on disk: + %(program)s < message + + (re)train a message as ham: + %(program)s -g < message + + (re)train a message as spam: + %(program)s -s < message + + + procmail recipie to filter and train in one step: + :0 fw + | %(program)s -t + + + mutt configuration. This binds the 'H' key to retrain the message as + ham, and prompt for a folder to move it to. The 'S' key retrains as + spam, and moves to a 'spam' folder. + XXX: add this + + """ + + def examples(): + print example_doc % globals() + sys.exit(0) + def usage(code, msg=''): """Print usage message and sys.exit(code).""" *************** *** 81,102 **** h = hammie.open(self.dbname, self.usedb, 'n') h.store() ! 
print "Created new database in", self.dbname def filter(self, msg): h = hammie.open(self.dbname, self.usedb, 'r') ! print h.filter(msg) def filter_train(self, msg): h = hammie.open(self.dbname, self.usedb, 'c') ! print h.filter(msg, train=True) def train_ham(self, msg): h = hammie.open(self.dbname, self.usedb, 'c') ! h.train_ham(msg) h.store() def train_spam(self, msg): h = hammie.open(self.dbname, self.usedb, 'c') ! h.train_spam(msg) h.store() --- 106,127 ---- h = hammie.open(self.dbname, self.usedb, 'n') h.store() ! print >> sys.stderr, "Created new database in", self.dbname def filter(self, msg): h = hammie.open(self.dbname, self.usedb, 'r') ! return h.filter(msg) def filter_train(self, msg): h = hammie.open(self.dbname, self.usedb, 'c') ! return h.filter(msg, train=True) def train_ham(self, msg): h = hammie.open(self.dbname, self.usedb, 'c') ! h.train_ham(msg, True) h.store() def train_spam(self, msg): h = hammie.open(self.dbname, self.usedb, 'c') ! h.train_spam(msg, True) h.store() *************** *** 114,121 **** h = HammieFilter() actions = [] ! opts, args = getopt.getopt(sys.argv[1:], 'hd:D:ngstGS', ['help']) for opt, arg in opts: if opt in ('-h', '--help'): usage(0) elif opt == '-d': h.usedb = True --- 139,148 ---- h = HammieFilter() actions = [] ! opts, args = getopt.getopt(sys.argv[1:], 'hxd:D:nfgstGS', ['help', 'examples']) for opt, arg in opts: if opt in ('-h', '--help'): usage(0) + elif opt in ('-x', '--examples'): + examples() elif opt == '-d': h.usedb = True *************** *** 124,127 **** --- 151,156 ---- h.usedb = False h.dbname = arg + elif opt == '-f': + actions.append(h.filter) elif opt == '-g': actions.append(h.train_ham) *************** *** 144,148 **** for action in actions: action(msg) ! if __name__ == "__main__": --- 173,177 ---- for action in actions: action(msg) ! 
sys.stdout.write(msg.as_string(unixfrom=(msg.get_unixfrom() is not None))) if __name__ == "__main__": From npickett at users.sourceforge.net Tue Jan 21 21:23:20 2003 From: npickett at users.sourceforge.net (Neale Pickett) Date: Wed Jan 22 00:23:28 2003 Subject: [Spambayes-checkins] spambayes/spambayes Options.py,1.7,1.8 hammie.py,1.3,1.4 Message-ID: Update of /cvsroot/spambayes/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv13378/spambayes Modified Files: Options.py hammie.py Log Message: * Fix function name in hammie.py * Expound upon docstrings in hammie.py * Options.py will now look for bayescustomize.ini and ~/.spambayesrc. Hopefully some non-Unix folks will update this with sensible defaults for their platforms. * hammiefilter has a ton of new options -- check the docstring Index: Options.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/spambayes/Options.py,v retrieving revision 1.7 retrieving revision 1.8 diff -C2 -d -r1.7 -r1.8 *** Options.py 21 Jan 2003 14:50:26 -0000 1.7 --- Options.py 22 Jan 2003 05:23:17 -0000 1.8 *************** *** 557,559 **** options.mergefiles(alternate.split()) else: ! options.mergefiles(['bayescustomize.ini']) --- 557,565 ---- options.mergefiles(alternate.split()) else: ! alts = [] ! for path in ['bayescustomize.ini', '~/.spambayesrc']: ! epath = os.path.expanduser(path) ! if os.path.exists(epath): ! alts.append(epath) ! if alts: ! options.mergefiles(alts) Index: hammie.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/spambayes/hammie.py,v retrieving revision 1.3 retrieving revision 1.4 diff -C2 -d -r1.3 -r1.4 *** hammie.py 21 Jan 2003 14:50:27 -0000 1.3 --- hammie.py 22 Jan 2003 05:23:17 -0000 1.4 *************** *** 76,81 **** If 'train' is True, also train on the result of scoring the message (ie. train as ham if it's ham, train as spam if it's ! spam). 
You'll want to be very dilligent about retraining ! mistakes if you use this. All defaults for optional parameters come from the Options file. --- 76,82 ---- If 'train' is True, also train on the result of scoring the message (ie. train as ham if it's ham, train as spam if it's ! spam). If the message already has a trained header, it will be ! untrained first. You'll want to be very dilligent about ! retraining mistakes if you use this option. All defaults for optional parameters come from the Options file. *************** *** 103,122 **** except KeyError: pass prob, clues = self._scoremsg(msg, True) if prob < ham_cutoff: is_spam = False ! trained = options.header_ham_string ! disp = trained elif prob > spam_cutoff: is_spam = True ! trained = options.header_spam_string ! disp = trained else: is_spam = False - trained = options.header_ham_string disp = options.header_unsure_string if train: ! self.train(msg, is_spam) ! msg.add_header(options.hammie_trained_header, trained) disp += ("; %."+str(options.header_score_digits)+"f") % prob if options.header_score_logarithm: --- 104,121 ---- except KeyError: pass + if train: + self.untrain_from_header(msg) prob, clues = self._scoremsg(msg, True) if prob < ham_cutoff: is_spam = False ! disp = options.header_ham_string elif prob > spam_cutoff: is_spam = True ! disp = options.header_spam_string else: is_spam = False disp = options.header_unsure_string if train: ! self.train(msg, is_spam, True) disp += ("; %."+str(options.header_score_digits)+"f") % prob if options.header_score_logarithm: *************** *** 129,139 **** x=-math.log10(1.0-prob) disp += " (%d)"%x msg.add_header(header, disp) if debug: disp = self.formatclues(clues) msg.add_header(debugheader, disp) return msg.as_string(unixfrom=(msg.get_unixfrom() is not None)) ! def train(self, msg, is_spam): """Train bayes with a message. 
--- 128,140 ---- x=-math.log10(1.0-prob) disp += " (%d)"%x + del msg[header] msg.add_header(header, disp) if debug: disp = self.formatclues(clues) + del msg[debugheader] msg.add_header(debugheader, disp) return msg.as_string(unixfrom=(msg.get_unixfrom() is not None)) ! def train(self, msg, is_spam, add_header=False): """Train bayes with a message. *************** *** 142,148 **** --- 143,159 ---- is_spam should be 1 if the message is spam, 0 if not. + If add_header is True, add a header with how it was trained (in + case we need to untrain later) + """ self.bayes.learn(tokenize(msg), is_spam) + if add_header: + if is_spam: + trained = options.header_spam_string + else: + trained = options.header_ham_string + del msg[options.hammie_trained_header] + msg.add_header(options.hammie_trained_header, trained) def untrain(self, msg, is_spam): *************** *** 151,155 **** msg can be a string, a file object, or a Message object. ! is_spam should be 1 if the message is spam, 0 if not. """ --- 162,166 ---- msg can be a string, a file object, or a Message object. ! is_spam should be True if the message is spam, False if not. """ *************** *** 157,180 **** self.bayes.unlearn(tokenize(msg), is_spam) ! def train_ham(self, msg): """Train bayes with ham. msg can be a string, a file object, or a Message object. """ ! self.train(msg, False) ! def train_spam(self, msg): """Train bayes with spam. msg can be a string, a file object, or a Message object. """ ! self.train(msg, True) def untrain_ham(self, msg): ! """Untrain bayes with ham. msg can be a string, a file object, or a Message object. --- 168,222 ---- self.bayes.unlearn(tokenize(msg), is_spam) ! def untrain_from_header(self, msg): ! """Untrain bayes based on X-Spambayes-Trained header. ! ! msg can be a string, a file object, or a Message object. ! ! If no such header is present, nothing happens. ! ! If add_header is True, add a header with how it was trained (in ! case we need to untrain later) ! ! """ ! ! 
msg = mboxutils.get_message(msg) ! trained = msg.get(options.hammie_trained_header) ! if not trained: ! return ! del msg[options.hammie_trained_header] ! if trained == options.header_ham_string: ! self.untrain_ham(msg) ! elif trained == options.header_spam_string: ! self.untrain_spam(msg) ! else: ! raise ValueError('%s header value unrecognized' ! % options.hammie_trained_header) ! ! def train_ham(self, msg, add_header=False): """Train bayes with ham. msg can be a string, a file object, or a Message object. + If add_header is True, add a header with how it was trained (in + case we need to untrain later) + """ ! self.train(msg, False, add_header) ! def train_spam(self, msg, add_header=False): """Train bayes with spam. msg can be a string, a file object, or a Message object. + If add_header is True, add a header with how it was trained (in + case we need to untrain later) + """ ! self.train(msg, True, add_header) def untrain_ham(self, msg): ! """Untrain bayes with a message previously trained as ham. msg can be a string, a file object, or a Message object. *************** *** 184,189 **** self.untrain(msg, False) ! def train_spam(self, msg): ! """Untrain bayes with spam. msg can be a string, a file object, or a Message object. --- 226,231 ---- self.untrain(msg, False) ! def untrain_spam(self, msg): ! """Untrain bayes with a message previously traned as spam. msg can be a string, a file object, or a Message object. From anthonybaxter at users.sourceforge.net Wed Jan 22 00:30:08 2003 From: anthonybaxter at users.sourceforge.net (Anthony Baxter) Date: Wed Jan 22 03:30:12 2003 Subject: [Spambayes-checkins] website background.ht,1.12,1.13 Message-ID: Update of /cvsroot/spambayes/website In directory sc8-pr-cvs1:/tmp/cvs-serv9445 Modified Files: background.ht Log Message: suckered tim into giving a succinct description of the CLT schemes. 
Index: background.ht =================================================================== RCS file: /cvsroot/spambayes/website/background.ht,v retrieving revision 1.12 retrieving revision 1.13 diff -C2 -d -r1.12 -r1.13 *** background.ht 17 Jan 2003 17:00:51 -0000 1.12 --- background.ht 22 Jan 2003 08:30:05 -0000 1.13 *************** *** 116,131 **** scoring word probabilities. The initial one, after much back and forth in the mailing list, is in the code today as 'gary_combining', and is ! the second plot, above.. A couple ! of other approaches, using the Central Limit Theorem (or this, for the serious math geeks), were also tried.

    !

    todo: do some plots for these

    ! They produced interesting output - but histograms of the ham and spam ! distributions still had a disturbingly large overlap in the middle. There was ! also an issue with incremental training and untraining of messages that ! made it harder to use in the "real world". These two central limit approaches were dropped after Tim, Gary and Rob Hooft produced a combining scheme using chi-squared probabilities. This is now the default combining scheme.

    !

    The chi-squared approach produces two numbers - a "ham probability" ("*H*") and a "spam probability" ("*S*"). A typical spam will have a high *S* and low *H*, while a ham will have high *H* and low *S*. In the case where --- 116,149 ---- scoring word probabilities. The initial one, after much back and forth in the mailing list, is in the code today as 'gary_combining', and is ! the second plot, above. Gary's next suggestion involved a couple ! of other approaches using the Central Limit Theorem (or this, for the serious math geeks). !

    ! !

    The Central Limit combining schemes produced some interesting (and ! surprising!) results - they produced two internal scores, one for ham and ! one for spam. This meant it was possible for them to return an "I don't know" ! response, when ham and spam scores were both very low or both very high. This ! caused some confusion as we tried to map these results to a Graham-like score. !

    !

    ! An example: a message with internal spam score that's 50 standard deviations ! on the spam side of the ham mean score and an internal ham score that's ! 40 standard deviations on the ham side of the spam mean would, if you just ! combine them in a straightforward manner, produce a result that it's ! definitely a spam. But look at the internal scores - it was certain that it ! wasn't spam, and it wasn't ham, either. In other words, it's not ! like anything it's seen before - so the only thing to do is to punt it out ! with an 'unsure' answer. !

    ! These two central limit approaches were dropped after Tim, Gary and Rob Hooft produced a combining scheme using chi-squared probabilities. This is now the default combining scheme.

    !

    Chi-combining is similar to the central limit approaches, but it doesn't ! have the annoying training problems that central limit approaches suffered ! from, and it produces a "smoother" score. !

    !

    The chi-squared approach produces two numbers - a "ham probability" ("*H*") and a "spam probability" ("*S*"). A typical spam will have a high *S* and low *H*, while a ham will have high *H* and low *S*. In the case where From richiehindle at users.sourceforge.net Wed Jan 22 10:29:15 2003 From: richiehindle at users.sourceforge.net (Richie Hindle) Date: Wed Jan 22 13:29:18 2003 Subject: [Spambayes-checkins] spambayes/spambayes PyMeldLite.py,1.3,1.4 Message-ID: Update of /cvsroot/spambayes/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv28328 Modified Files: PyMeldLite.py Log Message: Cope with characters not allowed in XML. Thanks to Jürgen Hermann for the patch. Removed the half-arsed attempt at Unicode support - it's unnecessary and it never really worked. Index: PyMeldLite.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/spambayes/PyMeldLite.py,v retrieving revision 1.3 retrieving revision 1.4 diff -C2 -d -r1.3 -r1.4 *** PyMeldLite.py 20 Jan 2003 23:22:02 -0000 1.3 --- PyMeldLite.py 22 Jan 2003 18:29:11 -0000 1.4 *************** *** 223,226 **** --- 223,232 ---- nonSelfClose = {'textarea': None} + # Map characters not allowed in XML content to '?' + import string + badxml_chars = ''.join([chr(c) for c in range(0, 32) + range(128, 160) + if c not in [9, 10, 13]]) + badxml_map = string.maketrans(badxml_chars, '?' * len(badxml_chars)) + ########################################################################### *************** *** 447,473 **** - # Versions of xmllib from 1.5.2 to 2.2.2 (and probably beyond) are - # broken with respect to unicode, in that they use string.maketrans(). - # If the xmllib we're using looks broken, we create an object that will - # fix it, and transiently apply the fix when parsing unicode XML. 
- fixedAttrtrans = None - if hasattr(xmllib, 'attrtrans') and isinstance(xmllib.attrtrans, str): - class UnicodeAttrtrans: - def __getitem__(self, c): - if unichr(c) in ' \r\n\t': - return ord(u' ') - return c - fixedAttrtrans = UnicodeAttrtrans() - - def _generateTree(source): """Given some XML source, generates a lightweight DOM tree rooted at a `_RootNode`.""" - # Fix xmllib if necessary. - if isinstance(source, unicode) and fixedAttrtrans: - originalAttrtrans = xmllib.attrtrans - xmllib.attrtrans = fixedAttrtrans - # Lots of HTML files start with a DOCTYPE declaration like this: # --- 453,460 ---- *************** *** 493,496 **** --- 480,486 ---- source[match.end(1):] + # Map characters not allowed in XML content to '?' + source = source.translate(badxml_map) + # Parse the XML and generate the tree. g = _TreeGenerator() *************** *** 498,505 **** g.close() - # Put xmllib back again. - if isinstance(source, unicode) and fixedAttrtrans: - xmllib.attrtrans = originalAttrtrans - # Get the tree and put the DOCTYPE back in if we hacked it out above. tree = g.getTree() --- 488,491 ---- *************** *** 532,541 **** self._readonly = readonly ! if isinstance(source, (str, unicode)): self._tree = _generateTree(source) elif isinstance(source, _Node): # For internal use only. self._tree = source else: ! raise TypeError, "Melds must be constructed from strings" def _findByID(self, node, name): --- 518,527 ---- self._readonly = readonly ! if isinstance(source, str): self._tree = _generateTree(source) elif isinstance(source, _Node): # For internal use only. self._tree = source else: ! raise TypeError, "Melds must be constructed from ASCII strings" def _findByID(self, node, name): *************** *** 551,555 **** """Minimally quotes an attribute value, using `"`, `&`, `<` and `>`.""" ! if not isinstance(value, (str, unicode)): value = str(value) value = value.replace('"', '"') --- 537,541 ---- """Minimally quotes an attribute value, using `"`, `&`, `<` and `>`.""" ! 
if not isinstance(value, str): value = str(value) value = value.replace('"', '"') *************** *** 577,581 **** node.children = [value._tree.getElementNode().clone()] else: ! if not isinstance(value, (str, unicode)): value = str(value) node.children = self._nodeListFromSource(value) --- 563,567 ---- node.children = [value._tree.getElementNode().clone()] else: ! if not isinstance(value, str): value = str(value) node.children = self._nodeListFromSource(value) *************** *** 774,778 **** sequence = values.values() elif hasattr(values, '__getitem__') and \ ! not isinstance(values, (str, unicode)): # It's a sequence. keys = None --- 760,764 ---- sequence = values.values() elif hasattr(values, '__getitem__') and \ ! not isinstance(values, str): # It's a sequence. keys = None *************** *** 843,852 **** return str(self._tree.toText()) - def __unicode__(self): - """Returns the XML that this `Meld` represents. Don't call - this directly - instead convert a `Meld` to unicode using - `unicode(object)`.""" - return unicode(self._tree.toText()) - ########################################################################### --- 829,832 ---- *************** *** 978,982 **** Traceback (most recent call last): ... ! TypeError: Melds must be constructed from strings """, --- 958,962 ---- Traceback (most recent call last): ... ! TypeError: Melds must be constructed from ASCII strings """, *************** *** 1059,1074 **** """, ! # This is just a smoke-test; proper Unicode support is untested, though ! # the code does attempt to be unicode-friendly, and to work around a ! # unicode-related bug in xmllib. ! 'unicode': r""" >>> u = Meld(u'One') ! >>> a = Meld('Two') ! >>> u.one = a.two ! >>> print repr(unicode(u)) ! u'Two' ! >>> a.two = Meld(u'') ! >>> print a ! """, --- 1039,1047 ---- """, ! 'no unicode': r""" >>> u = Meld(u'One') ! Traceback (most recent call last): ! ... ! 
TypeError: Melds must be constructed from ASCII strings """, *************** *** 1092,1095 **** --- 1065,1078 ---- x """, + + 'bad XML characters': """ + >>> page = Meld(''' + ... Valentines Day Special \x96 2 bikinis for the price of one + ... ''') # No exception. + >>> print page + + Valentines Day Special ? 2 bikinis for the price of one + + """ } From richiehindle at users.sourceforge.net Wed Jan 22 10:33:40 2003 From: richiehindle at users.sourceforge.net (Richie Hindle) Date: Wed Jan 22 13:33:43 2003 Subject: [Spambayes-checkins] spambayes pop3proxy.py,1.38,1.39 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv622 Modified Files: pop3proxy.py Log Message: Made the self-test work again after applying Tony's server-address patch. Index: pop3proxy.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/pop3proxy.py,v retrieving revision 1.38 retrieving revision 1.39 diff -C2 -d -r1.38 -r1.39 *** pop3proxy.py 21 Jan 2003 18:25:15 -0000 1.38 --- pop3proxy.py 22 Jan 2003 18:33:35 -0000 1.39 *************** *** 1372,1376 **** proxyUI = UserInterface() httpServer.register(proxyUI, OptionsConfigurator(proxyUI)) ! BayesProxyListener('localhost', 8110, 8111) state.bayes.learn(tokenizer.tokenize(spam1), True) state.bayes.learn(tokenizer.tokenize(good1), False) --- 1372,1376 ---- proxyUI = UserInterface() httpServer.register(proxyUI, OptionsConfigurator(proxyUI)) ! 
BayesProxyListener('localhost', 8110, ('', 8111)) state.bayes.learn(tokenizer.tokenize(spam1), True) state.bayes.learn(tokenizer.tokenize(good1), False) From npickett at users.sourceforge.net Wed Jan 22 20:44:40 2003 From: npickett at users.sourceforge.net (Neale Pickett) Date: Wed Jan 22 23:44:44 2003 Subject: [Spambayes-checkins] spambayes/contrib - New directory Message-ID: Update of /cvsroot/spambayes/spambayes/contrib In directory sc8-pr-cvs1:/tmp/cvs-serv27720/contrib Log Message: Directory /cvsroot/spambayes/spambayes/contrib added to the repository From npickett at users.sourceforge.net Wed Jan 22 20:46:29 2003 From: npickett at users.sourceforge.net (Neale Pickett) Date: Wed Jan 22 23:46:32 2003 Subject: [Spambayes-checkins] spambayes runtest.sh,1.9,1.10 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv28017 Modified Files: runtest.sh Log Message: * Fixed runtest.sh to handle new paths for all the utilities * moved hammie/* to contrib/* * new spambayes.el for Gnus integration Index: runtest.sh =================================================================== RCS file: /cvsroot/spambayes/spambayes/runtest.sh,v retrieving revision 1.9 retrieving revision 1.10 diff -C2 -d -r1.9 -r1.10 *** runtest.sh 10 Oct 2002 09:21:55 -0000 1.9 --- runtest.sh 23 Jan 2003 04:46:27 -0000 1.10 *************** *** 1,3 **** ! #! /bin/sh -x ## ## runtest.sh -- run some tests for Tim --- 1,3 ---- ! #! /bin/sh -e ## ## runtest.sh -- run some tests for Tim *************** *** 20,23 **** --- 20,31 ---- fi + # Include local directory in Python path + if [ -n "$PYTHONPATH" ]; then + PYTHONPATH=$PYTHONPATH:. + else + PYTHONPATH=. + fi + export PYTHONPATH + # Which test to run TEST=${1:-run2} *************** *** 42,70 **** esac if [ -n "$REBAL" ]; then # Put them all into reservoirs ! python rebal.py -r Data/Ham/reservoir -s Data/Ham/Set -n 0 -q ! python rebal.py -r Data/Spam/reservoir -s Data/Spam/Set -n 0 -q # Rebalance ! 
python rebal.py -r Data/Ham/reservoir -s Data/Ham/Set -n $RNUM -q -Q ! python rebal.py -r Data/Spam/reservoir -s Data/Spam/Set -n $RNUM -q -Q fi case "$TEST" in test1) ! python timtest.py -n $SETS > test1.txt ;; test2) ! python timtest.py -n $SETS > test2.txt ;; timcv1|cv1) ! python timcv.py -n $SETS > cv1.txt ;; timcv2|cv2) ! python timcv.py -n $SETS > cv2.txt ! python rates.py cv1 cv2 > runrates.txt ! python cmp.py cv1s cv2s | tee results.txt ;; *) --- 50,80 ---- esac + set -x + if [ -n "$REBAL" ]; then # Put them all into reservoirs ! python utilities/rebal.py -r Data/Ham/reservoir -s Data/Ham/Set -n 0 -q ! python utilities/rebal.py -r Data/Spam/reservoir -s Data/Spam/Set -n 0 -q # Rebalance ! python utilities/rebal.py -r Data/Ham/reservoir -s Data/Ham/Set -n $RNUM -q -Q ! python utilities/rebal.py -r Data/Spam/reservoir -s Data/Spam/Set -n $RNUM -q -Q fi case "$TEST" in test1) ! python testtools/timtest.py -n $SETS > test1.txt ;; test2) ! python testtools/timtest.py -n $SETS > test2.txt ;; timcv1|cv1) ! python testtools/timcv.py -n $SETS > cv1.txt ;; timcv2|cv2) ! python testtools/timcv.py -n $SETS > cv2.txt ! python testtools/rates.py cv1 cv2 > runrates.txt ! 
python testtools/cmp.py cv1s cv2s | tee results.txt ;; *) From npickett at users.sourceforge.net Wed Jan 22 20:46:29 2003 From: npickett at users.sourceforge.net (Neale Pickett) Date: Wed Jan 22 23:46:36 2003 Subject: [Spambayes-checkins] spambayes/hammie BULK.txt,1.1,NONE bulkgraph.py,1.1,NONE bulktrain.sh,1.1,NONE procmailrc,1.1,NONE Message-ID: Update of /cvsroot/spambayes/spambayes/hammie In directory sc8-pr-cvs1:/tmp/cvs-serv28017/hammie Removed Files: BULK.txt bulkgraph.py bulktrain.sh procmailrc Log Message: * Fixed runtest.sh to handle new paths for all the utilities * moved hammie/* to contrib/* * new spambayes.el for Gnus integration --- BULK.txt DELETED --- --- bulkgraph.py DELETED --- --- bulktrain.sh DELETED --- --- procmailrc DELETED --- From npickett at users.sourceforge.net Wed Jan 22 20:46:29 2003 From: npickett at users.sourceforge.net (Neale Pickett) Date: Wed Jan 22 23:46:39 2003 Subject: [Spambayes-checkins] spambayes/contrib BULK.txt,NONE,1.1 bulkgraph.py,NONE,1.1 bulktrain.sh,NONE,1.1 procmailrc,NONE,1.1 spambayes.el,NONE,1.1 Message-ID: Update of /cvsroot/spambayes/spambayes/contrib In directory sc8-pr-cvs1:/tmp/cvs-serv28017/contrib Added Files: BULK.txt bulkgraph.py bulktrain.sh procmailrc spambayes.el Log Message: * Fixed runtest.sh to handle new paths for all the utilities * moved hammie/* to contrib/* * new spambayes.el for Gnus integration --- NEW FILE: BULK.txt --- Alex's spambayes filter scripts ------------------------------- I've finally started using spambayes for my incoming mail filtering. I've got a slightly unusual setup, so I had to write a couple scripts to deal with the nightly retraining... First off, let me describe how I've got things set up. I am an avid (and rather religious) MH user, so my mail folders are of course stored in the MH format (directories full of single-message files, where the filenames are numbers indicating ordering in the folder). 
I've got four mail folders of interest for this discussion: everything, spam, newspam, and inbox. When mail arrives, it is classified, then immediately copied in the everything folder. If it was classified as spam or ham, it is trained as such, reinforcing the classification. Then, if it was labeled as spam, it goes into the newspam folder; otherwise it goes into my inbox. When I read my mail (from inbox or newspam), I move any confirmed spam into my spam folder; ham may be deleted. (Of course, I still have a copy of my ham in the everything folder.) Every night, I run a complete retraining (from cron at 2:10am); it trains on all mail in the everything folder that is less than 4 months old. If a given message has an identical copy in the spam or newspam folder, then it is trained as spam; otherwise it is trained as ham. This does mean that unread unsures will be treated as ham for up to a day; there's few enough of them that I don't care. The four-month age limit will have the effect of expiring old mail out of the training set, which will keep the database size fairly manageable (it's currently just under 10 meg, with 6 days to go until I have 4 months of data). The retraining generates a little report for me each night, showing a graph of my ham and spam levels over time. 
Here's a sample: | Scanning spamdir (/home/cashew/popiel/Mail/spam): | Scanning spamdir (/home/cashew/popiel/Mail/newspam): | Scanning everything | sshsshsshsshsshsshsshshsshshshshsshshshshshshsshsshshsshssshsshshsshshsshshs | sshshshshsshshsshshshshshssshshshsshsshsshshshshshshsshshhshshsshshshshssshs | sshshsssshs | 154 | 152| | 144| | 136| | 128| h | 120| h s | 112| s ss ss s h s ss | 104| ss ss ss sHs h s ss | 96| s ss s sH s ss sHs h Sss ss | 88| h ss s sss ss sH sss ssssHHhS sSsssss | 80| s sSH ss ssssss sssssH HssssHsHHHSS sSsssss | 72| ssHSH ssssssssssssHHsHSHssHsHsHHHSSssSsssss | 64| s s s s sHsHSHsssssssHsHsssHHsHSHssHsHsHHHSSssSsssss | 56| s sss ss sssssHHHSHsHsssHsHHHHssHHsHSHHsHHHsHHHSSsHSsssss | 48| ssssssssssssssHHHSHHHHssHsHHHHHsHHsHSHHsHHHsHHHSSsHSssHsss | 40| ssssssssssHsHHHHHSHHHHHsHsHHHHHHHHHHSHHsHHHHHHHSSsHSHsHHss | 32| ssHHssHsssHHHHHHHSHHHHHHHsHHHHHHHHHHSHHsHHHHHHHSSHHSHHHHHs | 24| ssHHHHHHHsHHHHHHHSHHHHHHHsHHHHHHHHHHSHHHHHHHHHHSSHHSHHHHHs | 16| HsHHHHHHHHHHHHHHHSHHHHHHHHHHHHHHHHHHSHHHHHHHHHHSSHHSHHHHHs | 8| HHHHHHHHHHHHHHHHHSHHHHHHHHHHHHHHHHHHSHHHHHHHHHHSSHHSHHHHHH | 0|SSSUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUU | +------------------------------------------------------------ | | Total: 6441 ham, 9987 spam (60.79% spam) | | real 7m45.049s | user 5m38.980s | sys 0m39.170s At the top of the output it mentions what it's scanning, and has a long line of s and h indicating progress (so it doesn't look hung if you run it by hand). Below is a set of overlaid bar graphs; s is for spam, h is for ham, u is unsure. The shorter bars are in front and capitalized. In the example, I have very few days where I have more ham than spam. Finally, there's the amount of time it took to run the retraining. 
My scripts are: bulkgraph.py read and train on messages, and generate the graph bulktrain.sh wrapper for bulkgraph.py, times the process and moves databases around procmailrc a slightly edited version of my .procmailrc file When I actually use this, I put bulkgraph.py and bulktrain.py in the root of my spambayes tree. Minor tweaks would probably make this unnecessary, but as a python newbie I don't know what they are off the top of my head, and I can't be bothered to find out. ;-) --- NEW FILE: bulkgraph.py --- #! /usr/bin/env python ### Train spambayes on messages in an MH mailbox, with spam identified ### by identical copies in other designated MH mailboxes. ### ### Run this from a cron job on your server. """Usage: %(program)s [OPTIONS] ... Where OPTIONS is one or more of: -h show usage and exit -d DBNAME use the DBM store. A DBM file is larger than the pickle and creating it is slower, but loading it is much faster, especially for large word databases. Recommended for use with hammiefilter or any procmail-based filter. -D DBNAME use the pickle store. A pickle is smaller and faster to create, but much slower to load. Recommended for use with pop3proxy and hammiesrv. -g PATH mbox or directory of known good messages (non-spam) to train on. Can be specified more than once. -s PATH mbox or directory of known spam messages to train on. Can be specified more than once. -f force training, ignoring the trained header. Use this if you need to rebuild your database from scratch. 
-q quiet mode; no output """ import mboxutils import getopt import hammie import sys import os import re import time import filecmp program = sys.argv[0] loud = True day = 24 * 60 * 60 # The following are in days expire = 4 * 30 grouping = 2 def usage(code, msg=''): """Print usage message and sys.exit(code).""" if msg: print >> sys.stderr, msg print >> sys.stderr print >> sys.stderr, __doc__ % globals() sys.exit(code) def row(value, spamday, hamday, unsureday): line = "%5d|" % value for j in range((expire) // grouping, -1, -1): spamv = 0 hamv = 0 unsurev = 0 for k in range(j * grouping, (j + 1) * grouping): try: spamv += spamday[k] hamv += hamday[k] unsurev += unsureday[k] except: pass spamv = spamv // grouping hamv = hamv // grouping unsurev = unsurev // grouping # print "%d: %ds %dh %du" % (j, spamv, hamv, unsurev) count = 0 char = ' ' if spamv >= value: count += 1 char = 's' if hamv >= value: count += 1 if (char == ' ' or hamv < spamv): char = 'h' if unsurev >= value: count += 1 if (char == ' ' or (char == 's' and unsurev < spamv) or (char == 'h' and unsurev < hamv)): char = 'u' if count > 1: char = char.upper() line += char return line def main(): """Main program; parse options and go.""" global loud try: opts, args = getopt.getopt(sys.argv[1:], 'hfqd:D:s:e:') except getopt.error, msg: usage(2, msg) if not opts: usage(2, "No options given") pck = None usedb = None force = False everything = None spam = [] for opt, arg in opts: if opt == '-h': usage(0) elif opt == "-f": force = True elif opt == "-q": loud = False elif opt == '-e': everything = arg elif opt == '-s': spam.append(arg) elif opt == "-d": usedb = True pck = arg elif opt == "-D": usedb = False pck = arg if args: usage(2, "Positional arguments not allowed") if usedb == None: usage(2, "Must specify one of -d or -D") h = hammie.open(pck, usedb, "c") spamsizes = {} for s in spam: if loud: print "Scanning spamdir (%s):" % s files = os.listdir(s) for f in files: if f[0] in ('1', '2', '3', '4', '5', '6', '7', 
'8', '9'): name = os.path.join(s, f) size = os.stat(name).st_size try: spamsizes[size].append(name) except KeyError: spamsizes[size] = [name] skipcount = 0 spamcount = 0 hamcount = 0 spamday = [0] * expire hamday = [0] * expire unsureday = [0] * expire date_re = re.compile( r";.* (\d{1,2} (?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec) \d{2,4})") now = time.mktime(time.strptime(time.strftime("%d %b %Y"), "%d %b %Y")) if loud: print "Scanning everything" for f in os.listdir(everything): if f[0] in ('1', '2', '3', '4', '5', '6', '7', '8', '9'): name = os.path.join(everything, f) fh = file(name, "rb") msg = mboxutils.get_message(fh) fh.close() # Figure out how old the message is age = 2 * expire try: received = (msg.get_all("Received"))[0] received = date_re.search(received).group(1) # if loud: print " %s" % received date = time.mktime(time.strptime(received, "%d %b %Y")) # if loud: print " %d" % date age = (now - date) // day # Can't just continue here... we're in a try if age < 0: age = 2 * expire except: pass # Skip anything that has no date or is too old or from the future # if loud: print "%s: %d" % (name, age) if age >= expire: skipcount += 1 if loud and not (skipcount % 100): sys.stdout.write("-") sys.stdout.flush() continue age = int(age) try: if msg.get("X-Spambayes-Classification").find("unsure") >= 0: unsureday[age] += 1 except: pass size = os.stat(name).st_size isspam = False try: for s in spamsizes[size]: if filecmp.cmp(name, s): isspam = True except KeyError: pass if isspam: spamcount += 1 spamday[age] += 1 if loud and not (spamcount % 100): sys.stdout.write("s") sys.stdout.flush() else: hamcount += 1 hamday[age] += 1 if loud and not (hamcount % 100): sys.stdout.write("h") sys.stdout.flush() h.train(msg, isspam) if loud: print mval = max(max(spamday), max(hamday), max(unsureday)) scale = (mval + 19) // 20 print "%5d" % mval for j in range(19, -1, -1): print row(scale * j, spamday, hamday, unsureday) print " +" + ('-' * 60) print print "Total: %d ham, 
%d spam (%.2f%% spam)" % ( hamcount, spamcount, spamcount * 100.0 / (hamcount + spamcount)) h.store() if __name__ == "__main__": main() --- NEW FILE: bulktrain.sh --- #!/bin/bash cd $HOME/spambayes/active/spambayes rm -f tmpdb 2>/dev/null time /usr/bin/python2.2 bulkgraph.py \ -d tmpdb \ -e $HOME/Mail/everything/ \ -s $HOME/Mail/spam \ -s $HOME/Mail/newspam \ && mv -f tmpdb hammiedb ls -l hammiedb --- NEW FILE: procmailrc --- MAILDIR=/home/cashew/popiel/Mail HOME=/home/cashew/popiel # Classify message (up here so all copies have the classification) :0fw: | /usr/bin/python2.2 $HOME/spambayes/active/spambayes/hammiefilter.py # And trust the classification :0Hc: * ^X-Spambayes-Classification: ham | /usr/bin/python2.2 $HOME/spambayes/active/spambayes/hammiefilter.py -g :0Hc: * ^X-Spambayes-Classification: spam | /usr/bin/python2.2 $HOME/spambayes/active/spambayes/hammiefilter.py -s # Save all mail for analysis :0c: everything/. # Block spam :0H: * ^Content-Type:.*text/html newspam/. :0H: * ^X-Spambayes-Classification: spam newspam/. # Put mail from myself in outbox :0H: * ^From:.*popiel\@wolfskeep outbox/. # Everything else is presumably good :0: inbox/. --- NEW FILE: spambayes.el --- ;; spambayes.el -- integrate spambayes into Gnus ;; Copyright (C) 2003 Neale Pickett ;; Time-stamp: <2003-01-21 20:54:15 neale> ;; This is free software; you can redistribute it and/or modify it under ;; the terms of the GNU General Public License as published by the Free ;; Software Foundation; either version 2, or (at your option) any later ;; version. ;; This program is distributed in the hope that it will be useful, but ;; WITHOUT ANY WARRANTY; without even the implied warranty of ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;; General Public License for more details. ;; You should have received a copy of the GNU General Public License ;; along with GNU Emacs; see the file COPYING. 
If not, write to the ;; Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. ;; Purpose: ;; ;; Functions to put spambayes into Gnus. ;; ;; This binds "B s" to "refile as spam", and "B h" to "refile as ham". ;; After refiling, the message is rescored and respooled. I haven't yet ;; run across a case where refiling doesn't change a message's score ;; well into the ham or spam range. If this happens to you, please let ;; me know. ;; Installation: ;; ;; To install, just drop this file in your load path, and insert the ;; following lines in ~/.gnus: ;; ;; (load-library "spambayes") ;; (add-hook ;; 'gnus-sum-load-hook ;; (lambda nil ;; (define-key gnus-summary-mode-map [(B) (s)] 'spambayes-refile-as-spam) ;; (define-key gnus-summary-mode-map [(B) (h)] 'spambayes-refile-as-ham))) ;; (defvar spambayes-spam-group "spam" "Group name for spam messages") (defvar spambayes-hammiefilter "~/src/spambayes/hammiefilter.py" "Path to the hammiefilter program") (defun spambayes-retrain (args) "Retrain on all processable articles, or the one under the cursor. This will replace the buffer contents with command output." 
(labels ((do-exec (n g args) (with-temp-buffer (gnus-request-article-this-buffer n g) (shell-command-on-region (point-min) (point-max) (concat spambayes-hammiefilter " " args) (current-buffer) t) (gnus-request-replace-article n g (current-buffer))))) (let ((g gnus-newsgroup-name) (list gnus-newsgroup-processable)) (if (>= (length list) 1) (while list (let ((n (car list))) (do-exec n g args)) (setq list (cdr list))) (let ((n (gnus-summary-article-number))) (do-exec n g args)))))) (defun spambayes-refile-as-spam () "Retrain and refilter all process-marked messages as spam, then respool them" (interactive) (spambayes-retrain "-s -f") (gnus-summary-respool-article nil (gnus-group-method gnus-newsgroup-name))) (defun spambayes-refile-as-ham () "Retrain and refilter all process-marked messages as ham, then respool them" (interactive) (spambayes-retrain "-g -f") (gnus-summary-respool-article nil (gnus-group-method gnus-newsgroup-name))) From mhammond at users.sourceforge.net Thu Jan 23 04:17:37 2003 From: mhammond at users.sourceforge.net (Mark Hammond) Date: Thu Jan 23 07:17:40 2003 Subject: [Spambayes-checkins] spambayes/Outlook2000 msgstore.py,1.37,1.38 Message-ID: Update of /cvsroot/spambayes/spambayes/Outlook2000 In directory sc8-pr-cvs1:/tmp/cvs-serv32672 Modified Files: msgstore.py Log Message: A number of changes related to the folder selector dialog - code generally inspired by a couple of nice patches by Tony Meyer. Should be no visible changes, but under the covers: * worked out we were using short-term EIDs for folders, causing the exchange server to fail. So we are back to a faster MAPI version. Deleted the other variants - as I said in the comments, CVS is your friend . I was kind enough leave an indication of the last CVS revision with the old code though! * Only build the folder structure as the folder is expanded. Thus, the entire folder hierarchy is no longer walked when displaying the dialog. 
This speeds up the code considerable for exchange server users, where the public folder hierarchy is both huge, and never needed by SpamBayes. * We generally use the msgstore, rather than hitting MAPI directly, abstracting away some ugly code. Thus we are tied tighter to the spambayes manager object. * Drop all concepts of "default store" - all folder/message IDs are expected to be a tuple of (store_id, item_id). (This code was just a hangover to prevent CVS users of spambayes from needing to redefine all their folders when we first moved to the (store_id, item_id) scheme. Index: msgstore.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/msgstore.py,v retrieving revision 1.37 retrieving revision 1.38 diff -C2 -d -r1.37 -r1.38 *** msgstore.py 14 Jan 2003 05:38:20 -0000 1.37 --- msgstore.py 23 Jan 2003 12:17:35 -0000 1.38 *************** *** 42,45 **** --- 42,48 ---- self.name = "" self.count = 0 + def GetParent(self): + # return a folder object with the parent, or None + raise NotImplementedError def GetMessageGenerator(self, folder): # Return a generator of MsgStoreMsg objects for the folder *************** *** 172,180 **** --- 175,188 ---- item_id = mapi.BinFromHex(item_id) if store_id is None: + # store_id=None was a "backwards compat" hack no longer + # need - it can go once we are *sure* we dont need it ;) + assert False, "We expect fully qualified IDs" store_id = self.default_store_bin_eid else: store_id = mapi.BinFromHex(store_id) return store_id, item_id + # See above - this branch can die (I think ;) assert type(item_id) in [type(''), type(u'')], "What kind of ID is '%r'?" % (item_id,) + assert False, "We expect fully qualified IDs" return self.default_store_bin_eid, mapi.BinFromHex(item_id) *************** *** 216,221 **** folder = self._OpenEntry(folder_id) table = folder.GetContentsTable(0) ! rc, props = folder.GetProps( (PR_DISPLAY_NAME_A,), 0) ! 
return MAPIMsgStoreFolder(self, folder_id, props[0][1], table.GetRowCount(0)) --- 224,231 ---- folder = self._OpenEntry(folder_id) table = folder.GetContentsTable(0) ! # Ensure we have a long-term ID. ! rc, props = folder.GetProps( (PR_ENTRYID, PR_DISPLAY_NAME_A), 0) ! folder_id = folder_id[0], props[0][1] ! return MAPIMsgStoreFolder(self, folder_id, props[1][1], table.GetRowCount(0)) *************** *** 275,278 **** --- 285,310 ---- return mapi.HexFromBin(self.id[0]), mapi.HexFromBin(self.id[1]) + def GetParent(self): + # return a folder object with the parent, or None + folder = self.msgstore._OpenEntry(self.id) + prop_ids = PR_PARENT_ENTRYID, + hr, data = folder.GetProps(prop_ids,0) + # Put parent ids together + parent_eid = data[0][1] + parent_id = self.id[0], parent_eid + if hr != 0 or \ + self.msgstore.session.CompareEntryIDs(parent_eid, self.id[1]): + # No parent EID, or EID same as ours. + return None + parent = self.msgstore._OpenEntry(parent_id) + # Finally get the display name. + hr, data = folder.GetProps((PR_DISPLAY_NAME_A,), 0) + name = data[0][1] + count = parent.GetContentsTable(0).GetRowCount(0) + return MAPIMsgStoreFolder(self.msgstore, parent_id, name, count) + + def OpenEntry(self, iid = None, flags = None): + return self.msgstore._OpenEntry(self.id, iid, flags) + def GetOutlookItem(self): hex_item_id = mapi.HexFromBin(self.id[1]) *************** *** 281,285 **** def GetMessageGenerator(self): ! folder = self.msgstore._OpenEntry(self.id) table = folder.GetContentsTable(0) # Limit ourselves to IPM.Note objects - ie, messages. --- 313,317 ---- def GetMessageGenerator(self): ! folder = self.OpenEntry() table = folder.GetContentsTable(0) # Limit ourselves to IPM.Note objects - ie, messages. 
From mhammond at users.sourceforge.net Thu Jan 23 04:17:37 2003 From: mhammond at users.sourceforge.net (Mark Hammond) Date: Thu Jan 23 07:17:44 2003 Subject: [Spambayes-checkins] spambayes/Outlook2000/dialogs FilterDialog.py,1.12,1.13 FolderSelector.py,1.10,1.11 TrainingDialog.py,1.8,1.9 Message-ID: Update of /cvsroot/spambayes/spambayes/Outlook2000/dialogs In directory sc8-pr-cvs1:/tmp/cvs-serv32672/dialogs Modified Files: FilterDialog.py FolderSelector.py TrainingDialog.py Log Message: A number of changes related to the folder selector dialog - code generally inspired by a couple of nice patches by Tony Meyer. Should be no visible changes, but under the covers: * worked out we were using short-term EIDs for folders, causing the exchange server to fail. So we are back to a faster MAPI version. Deleted the other variants - as I said in the comments, CVS is your friend. I was kind enough to leave an indication of the last CVS revision with the old code though! * Only build the folder structure as the folder is expanded. Thus, the entire folder hierarchy is no longer walked when displaying the dialog. This speeds up the code considerably for exchange server users, where the public folder hierarchy is both huge, and never needed by SpamBayes. * We generally use the msgstore, rather than hitting MAPI directly, abstracting away some ugly code. Thus we are tied more tightly to the spambayes manager object. * Drop all concepts of "default store" - all folder/message IDs are expected to be a tuple of (store_id, item_id). (This code was just a hangover to prevent CVS users of spambayes from needing to redefine all their folders when we first moved to the (store_id, item_id) scheme.)
Index: FilterDialog.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/dialogs/FilterDialog.py,v retrieving revision 1.12 retrieving revision 1.13 diff -C2 -d -r1.12 -r1.13 *** FilterDialog.py 23 Nov 2002 21:35:23 -0000 1.12 --- FilterDialog.py 23 Jan 2003 12:17:35 -0000 1.13 *************** *** 201,206 **** ids = [ids] single_select = not ids_are_list ! # d = FolderSelector.FolderSelector(self.mgr.message_store.session, ids, checkbox_state=None, single_select=single_select) ! d = FolderSelector.FolderSelector(self.mgr.outlook.Session, ids, checkbox_state=None, single_select=single_select) if d.DoModal()==win32con.IDOK: new_ids, include_sub = d.GetSelectedIDs() --- 201,205 ---- ids = [ids] single_select = not ids_are_list ! d = FolderSelector.FolderSelector(self.mgr, ids, checkbox_state=None, single_select=single_select) if d.DoModal()==win32con.IDOK: new_ids, include_sub = d.GetSelectedIDs() *************** *** 342,347 **** import FolderSelector filter = self.mgr.config.filter_now ! # d = FolderSelector.FolderSelector(self.mgr.message_store.session, filter.folder_ids,checkbox_state=filter.include_sub) ! d = FolderSelector.FolderSelector(self.mgr.outlook.Session, filter.folder_ids, checkbox_state=filter.include_sub) --- 341,345 ---- import FolderSelector filter = self.mgr.config.filter_now ! d = FolderSelector.FolderSelector(self.mgr, filter.folder_ids, checkbox_state=filter.include_sub) Index: FolderSelector.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/dialogs/FolderSelector.py,v retrieving revision 1.10 retrieving revision 1.11 diff -C2 -d -r1.10 -r1.11 *** FolderSelector.py 15 Jan 2003 03:52:09 -0000 1.10 --- FolderSelector.py 23 Jan 2003 12:17:35 -0000 1.11 *************** *** 39,66 **** # rat's arse ). # So finally we have an Outlook object model version! ! 
######################################################################### ! ## CDO version of a folder walker. ! ######################################################################### ! def _BuildFoldersCDO(folders): ! children = [] ! folder = folders.GetFirst() ! while folder: ! spec = FolderSpec(folder.ID, folder.Name.encode("mbcs", "replace")) ! spec.children = _BuildFoldersCDO(folder.Folders) ! children.append(spec) ! folder = folders.GetNext() ! return children ! def BuildFolderTreeCDO(session): ! infostores = session.InfoStores ! root = FolderSpec(None, "root") ! for i in range(infostores.Count): ! infostore = infostores[i+1] ! rootFolder = infostore.RootFolder ! folders = rootFolder.Folders ! spec = FolderSpec(rootFolder.ID, infostore.Name.encode("mbcs", "replace")) ! spec.children = _BuildFoldersCDO(folders) ! root.children.append(spec) ! return root ######################################################################### --- 39,48 ---- # rat's arse ). # So finally we have an Outlook object model version! ! # But then Tony Meyer came to the rescue - he noticed that we were ! # simply using short-term EID values for Exchange Folders - so now that ! # is solved, we are back to the Extended MAPI version. ! # These variants were deleted by MarkH - cvs is your friend :) ! # Last appeared in Rev 1.10 ######################################################################### *************** *** 69,76 **** from win32com.mapi import mapi from win32com.mapi.mapitags import * ! default_store_id = None ! ! def _BuildFoldersMAPI(msgstore, folder): # Get the hierarchy table for it. table = folder.GetHierarchyTable(0) --- 51,58 ---- from win32com.mapi import mapi from win32com.mapi.mapitags import * + import pythoncom ! def _BuildFoldersMAPI(manager, folder_id): ! folder = manager.message_store.GetFolder(folder_id).OpenEntry() # Get the hierarchy table for it. 
table = folder.GetHierarchyTable(0) *************** *** 80,102 **** PR_DISPLAY_NAME_A), None, None, 0) for (eid_tag, eid),(storeeid_tag, store_eid), (name_tag, name) in rows: ! folder_id = mapi.HexFromBin(store_eid), mapi.HexFromBin(eid) ! spec = FolderSpec(folder_id, name) ! child_folder = msgstore.OpenEntry(eid, None, mapi.MAPI_DEFERRED_ERRORS) ! spec.children = _BuildFoldersMAPI(msgstore, child_folder) ! children.append(spec) return children def BuildFolderTreeMAPI(session): - global default_store_id root = FolderSpec(None, "root") tab = session.GetMsgStoresTable(0) ! prop_tags = PR_ENTRYID, PR_DEFAULT_STORE, PR_DISPLAY_NAME_A rows = mapi.HrQueryAllRows(tab, prop_tags, None, None, 0) for row in rows: ! (eid_tag, eid), (is_def_tag, is_def), (name_tag, name) = row hex_eid = mapi.HexFromBin(eid) - if is_def: - default_store_id = hex_eid - msgstore = session.OpenMsgStore(0, eid, None, mapi.MDB_NO_MAIL | mapi.MAPI_DEFERRED_ERRORS) --- 62,94 ---- PR_DISPLAY_NAME_A), None, None, 0) for (eid_tag, eid),(storeeid_tag, store_eid), (name_tag, name) in rows: ! # Note the eid we get here is short-term - hence we must ! # re-fetch from the object itself (which is what our manager does, ! # so no need to do it explicitly - just believe folder.id over eid) ! temp_id = mapi.HexFromBin(store_eid), mapi.HexFromBin(eid) ! try: ! child_folder = manager.message_store.GetFolder(temp_id) ! except pythoncom.com_error: ! # Bad folder for some reason - ignore it. ! child_folder = None ! if child_folder is not None: ! spec = FolderSpec(child_folder.GetID(), name) ! # If we have no children at all, indicate ! # the item is not expandable. ! table = child_folder.OpenEntry().GetHierarchyTable(0) ! if table.GetRowCount(0) == 0: ! spec.children = [] ! else: ! spec.children = None # Flag as "not yet built" ! children.append(spec) return children def BuildFolderTreeMAPI(session): root = FolderSpec(None, "root") tab = session.GetMsgStoresTable(0) ! 
prop_tags = PR_ENTRYID, PR_DISPLAY_NAME_A rows = mapi.HrQueryAllRows(tab, prop_tags, None, None, 0) for row in rows: ! (eid_tag, eid), (name_tag, name) = row hex_eid = mapi.HexFromBin(eid) msgstore = session.OpenMsgStore(0, eid, None, mapi.MDB_NO_MAIL | mapi.MAPI_DEFERRED_ERRORS) *************** *** 106,136 **** folder_id = hex_eid, mapi.HexFromBin(subtree_eid) spec = FolderSpec(folder_id, name) ! spec.children = _BuildFoldersMAPI(msgstore, folder) root.children.append(spec) return root - ## - An Outlook object model version - import pythoncom - def _BuildFolderTreeOutlook(session, parent): - children = [] - for i in range(parent.Folders.Count): - folder = parent.Folders[i+1] - try: - spec = FolderSpec((folder.StoreID, folder.EntryID), - folder.Name.encode("mbcs", "replace")) - except pythoncom.error: - # Something strange with this folder - just ignore it - spec = None - if spec is not None: - if folder.Folders: - spec.children = _BuildFolderTreeOutlook(session, folder) - children.append(spec) - return children - - def BuildFolderTreeOutlook(session): - root = FolderSpec(None, "root") - root.children = _BuildFolderTreeOutlook(session, session) - return root - ######################################################################### ## The dialog itself --- 98,105 ---- folder_id = hex_eid, mapi.HexFromBin(subtree_eid) spec = FolderSpec(folder_id, name) ! spec.children = None root.children.append(spec) return root ######################################################################### ## The dialog itself *************** *** 174,178 **** ] ! def __init__ (self, mapi, selected_ids=None, single_select=False, checkbox_state=False, --- 143,147 ---- ] ! def __init__ (self, manager, selected_ids=None, single_select=False, checkbox_state=False, *************** *** 189,204 **** self.select_desc_noun_suffix = desc_noun_suffix self.selected_ids = selected_ids ! 
self.mapi = mapi self.checkbox_state = checkbox_state self.checkbox_text = checkbox_text or "Include &subfolders" def CompareIDs(self, id1, id2): ! if type(id1) != type(()): ! id1 = default_store_id, id1 ! if type(id2) != type(()): ! id2 = default_store_id, id2 ! return id1 == id2 ! # return self.mapi.CompareEntryIDs(mapi.BinFromHex(id1[0]), mapi.BinFromHex(id2[0])) and \ ! # self.mapi.CompareEntryIDs(mapi.BinFromHex(id1[1]), mapi.BinFromHex(id2[1])) def InIDs(self, id, ids): --- 158,170 ---- self.select_desc_noun_suffix = desc_noun_suffix self.selected_ids = selected_ids ! self.manager = manager self.checkbox_state = checkbox_state self.checkbox_text = checkbox_text or "Include &subfolders" def CompareIDs(self, id1, id2): ! # Compare the eid of the stores, then the objects ! CompareEntryIDs = self.manager.message_store.session.CompareEntryIDs ! return CompareEntryIDs(mapi.BinFromHex(id1[0]), mapi.BinFromHex(id2[0])) and \ ! CompareEntryIDs(mapi.BinFromHex(id1[1]), mapi.BinFromHex(id2[1])) def InIDs(self, id, ids): *************** *** 215,222 **** def _InsertSubFolders(self, hParent, folderSpec): - num_children_selected = 0 for child in folderSpec.children: text = child.name ! cItems = len(child.children) if cItems==0: bitmapCol = bitmapSel = 5 # blank doc --- 181,190 ---- def _InsertSubFolders(self, hParent, folderSpec): for child in folderSpec.children: text = child.name ! if child.children is None: # Need to build them! ! cItems = 1 # Anything > 0 will do ! else: ! cItems = len(child.children) if cItems==0: bitmapCol = bitmapSel = 5 # blank doc *************** *** 229,233 **** self.InIDs(child.folder_id, self.selected_ids)): state = INDEXTOSTATEIMAGEMASK(IIL_CHECKED) - num_children_selected += 1 else: state = INDEXTOSTATEIMAGEMASK(IIL_UNCHECKED) --- 197,200 ---- *************** *** 243,246 **** --- 210,219 ---- cItems, item_id)) + # If this folder is in the list of ones we need to expand + # to show pre-selected items, then force expand now. 
+ if self.InIDs(child.folder_id, self.expand_ids): + self.list.Expand(hitem, commctrl.TVE_EXPAND) + # If single-select, and this is ours, select it + # (multi-select uses check-boxes, not selection) if (self.single_select and self.selected_ids and *************** *** 248,255 **** self.list.SelectItem(hitem) ! num_children_selected += self._InsertSubFolders(hitem, child) ! if num_children_selected and hParent: ! self.list.Expand(hParent, commctrl.TVE_EXPAND) ! return num_children_selected def _YieldChildren(self, h): --- 221,235 ---- self.list.SelectItem(hitem) ! def _DetermineFoldersToExpand(self): ! folders_to_expand = [] ! for folder_id in self.selected_ids: ! folder = self.manager.message_store.GetFolder(folder_id) ! while folder is not None: ! parent = folder.GetParent() ! if parent is not None and \ ! not self.InIDs(parent.GetID(), folders_to_expand): ! folders_to_expand.append(parent.GetID()) ! folder = parent ! return folders_to_expand def _YieldChildren(self, h): *************** *** 324,336 **** self.GetDlgItem(IDC_BUTTON_CLEARALL).ShowWindow(win32con.SW_HIDE) ! tree = BuildFolderTreeOutlook(self.mapi) ! # if hasattr(self.mapi, "_oleobj_"): # Dispatch COM object ! # # CDO ! # tree = BuildFolderTreeCDO(self.mapi) ! # else: ! # # Extended MAPI. ! # tree = BuildFolderTreeMAPI(self.mapi) self._InsertSubFolders(0, tree) ! self.selected_ids = [] # wipe this out while we are alive. self._UpdateStatus() --- 304,315 ---- self.GetDlgItem(IDC_BUTTON_CLEARALL).ShowWindow(win32con.SW_HIDE) ! # Extended MAPI version of the tree. ! # Build list of all ids to expand - ie, list includes all ! # selected folders, and all parents. ! self.expand_ids = self._DetermineFoldersToExpand() ! tree = BuildFolderTreeMAPI(self.manager.message_store.session) self._InsertSubFolders(0, tree) ! self.selected_ids = [] # Only use this while creating dialog. ! self.expand_ids = [] # Only use this while creating dialog. 
self._UpdateStatus() *************** *** 393,396 **** --- 372,383 ---- if idFrom != IDC_LIST_FOLDERS: return None action, itemOld, itemNew, pt = extra + if action == 1: return 0 # contracting, not expanding + + itemHandle = itemNew[0] + info = self.list.GetItem(itemHandle) + folderSpec = self.item_map[info[7]] + if folderSpec.children is None: + folderSpec.children = _BuildFoldersMAPI(self.manager, folderSpec.folder_id) + self._InsertSubFolders(itemHandle, folderSpec) return 0 *************** *** 411,440 **** return ret, self.GetDlgItem(IDC_BUTTON_SEARCHSUB).GetCheck() != 0 ! def TestWithCDO(): ! from win32com.client import Dispatch ! mapi = Dispatch("MAPI.Session") ! mapi.Logon("", "", False, False) ! ids = [u'0000000071C4408983B0B24F8863EE66A8F79AFF82800000'] ! d=FolderSelector(mapi, ids, single_select = False) ! d.DoModal() ! print d.GetSelectedIDs() ! ! def TestWithMAPI(): ! mapi.MAPIInitialize(None) ! logonFlags = mapi.MAPI_NO_MAIL | mapi.MAPI_EXTENDED | mapi.MAPI_USE_DEFAULT ! session = mapi.MAPILogonEx(0, None, None, logonFlags) ! ids = [u'0000000071C4408983B0B24F8863EE66A8F79AFF82800000'] ! d=FolderSelector(session, ids, single_select = False) d.DoModal() ! print d.GetSelectedIDs() ! ! def TestWithOutlook(): ! from win32com.client import Dispatch ! outlook = Dispatch("Outlook.Application") ! d=FolderSelector(outlook.Session, None, single_select = False) d.DoModal() - print d.GetSelectedIDs() - if __name__=='__main__': ! TestWithOutlook() --- 398,412 ---- return ret, self.GetDlgItem(IDC_BUTTON_SEARCHSUB).GetCheck() != 0 ! def Test(): ! import sys, os ! sys.path.append(os.path.abspath(os.path.join(os.path.dirname(sys.argv[0]), ".."))) ! import manager ! ids = [] ! d=FolderSelector(manager.GetManager(), ids, single_select = False) d.DoModal() ! ids, include_sub = d.GetSelectedIDs() ! d=FolderSelector(manager.GetManager(), ids, single_select = False) d.DoModal() if __name__=='__main__': ! 
Test() Index: TrainingDialog.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/dialogs/TrainingDialog.py,v retrieving revision 1.8 retrieving revision 1.9 diff -C2 -d -r1.8 -r1.9 *** TrainingDialog.py 2 Nov 2002 12:28:38 -0000 1.8 --- TrainingDialog.py 23 Jan 2003 12:17:35 -0000 1.9 *************** *** 105,110 **** sub_attr = "ham_include_sub" include_sub = getattr(self.config, sub_attr) ! # d = FolderSelector.FolderSelector(self.mgr.message_store.session, l, checkbox_state=include_sub) ! d = FolderSelector.FolderSelector(self.mgr.outlook.Session, l, checkbox_state=include_sub) if d.DoModal()==win32con.IDOK: l[:], include_sub = d.GetSelectedIDs()[:] --- 105,109 ---- sub_attr = "ham_include_sub" include_sub = getattr(self.config, sub_attr) ! d = FolderSelector.FolderSelector(self.mgr, l, checkbox_state=include_sub) if d.DoModal()==win32con.IDOK: l[:], include_sub = d.GetSelectedIDs()[:] From montanaro at users.sourceforge.net Thu Jan 23 06:57:29 2003 From: montanaro at users.sourceforge.net (Skip Montanaro) Date: Thu Jan 23 09:57:33 2003 Subject: [Spambayes-checkins] spambayes hammiefilter.py,1.11,1.12 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv14406 Modified Files: hammiefilter.py Log Message: needs to exit after processing -h Index: hammiefilter.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/hammiefilter.py,v retrieving revision 1.11 retrieving revision 1.12 diff -C2 -d -r1.11 -r1.12 *** hammiefilter.py 22 Jan 2003 05:23:17 -0000 1.11 --- hammiefilter.py 23 Jan 2003 14:57:23 -0000 1.12 *************** *** 143,146 **** --- 143,147 ---- if opt in ('-h', '--help'): usage(0) + sys.exit(0) elif opt in ('-x', '--examples'): examples() From richiehindle at users.sourceforge.net Thu Jan 23 10:28:19 2003 From: richiehindle at users.sourceforge.net (Richie Hindle) Date: Thu Jan 23 13:28:26 2003 
Subject: [Spambayes-checkins] spambayes/spambayes PyMeldLite.py,1.4,1.5 Message-ID: Update of /cvsroot/spambayes/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv29471 Modified Files: PyMeldLite.py Log Message: Uses expat rather than xmllib when running under 2.3 (expat is now included in the distribution, and xmllib is deprecated). Improved the bad-XML-characters code to write high characters as charrefs rather than replacing them with '?'. Index: PyMeldLite.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/spambayes/PyMeldLite.py,v retrieving revision 1.4 retrieving revision 1.5 diff -C2 -d -r1.4 -r1.5 *** PyMeldLite.py 22 Jan 2003 18:29:11 -0000 1.4 --- PyMeldLite.py 23 Jan 2003 18:28:15 -0000 1.5 *************** *** 195,209 **** # Entrian.Coverage: Pragma Stop ! try: ! # XXX Take this seriously before 2.4 comes out... ! import warnings ! warnings.filterwarnings(action='ignore', ! message='.*xmllib', ! category=DeprecationWarning) ! except ImportError: ! pass ! ! import re, xmllib ! try: True, False, bool --- 195,199 ---- # Entrian.Coverage: Pragma Stop ! import sys, re, string try: True, False, bool *************** *** 223,230 **** nonSelfClose = {'textarea': None} ! # Map characters not allowed in XML content to '?' ! import string ! badxml_chars = ''.join([chr(c) for c in range(0, 32) + range(128, 160) ! if c not in [9, 10, 13]]) badxml_map = string.maketrans(badxml_chars, '?' * len(badxml_chars)) --- 213,222 ---- nonSelfClose = {'textarea': None} ! # Map high characters to charrefs. ! def replaceHighCharacters(match): ! return "&#%d;" % ord(match.group(1)) ! ! # Map meaningless low characters to '?' ! badxml_chars = ''.join([chr(c) for c in range(0, 32) if c not in [9, 10, 13]]) badxml_map = string.maketrans(badxml_chars, '?' * len(badxml_chars)) *************** *** 359,454 **** ! class _TreeGenerator(xmllib.XMLParser): ! """An XML parser that generates a lightweight DOM tree. 
Call `feed()` ! with XML source, then `close()`, then `getTree()` will give you the ! tree's `_RootNode`: ! >>> g = _TreeGenerator() ! >>> g.feed("Stuff. ") ! >>> g.feed("More stuff.") ! >>> g.close() ! >>> tree = g.getTree() ! >>> print tree.toText() ! Stuff. More stuff. ! """ ! def __init__(self): ! xmllib.XMLParser.__init__(self, translate_attribute_references=False) ! self.entitydefs = {} # entitydefs is an xmllib.XMLParser attribute. ! self._tree = _RootNode() ! self._currentNode = self._tree ! self._pendingText = [] ! def getTree(self): ! """Returns the generated tree; call `feed()` then `close()` first.""" ! return self._tree ! def _collapsePendingText(self): ! """Text (any content that isn't an open/close element) is built up ! in `self._pendingText` until an open/close element is seen, at which ! point it gets collapsed into a `_TextNode`.""" ! data = ''.join(self._pendingText) ! self._currentNode.children.append(_TextNode(data)) ! self._pendingText = [] ! def handle_xml(self, encoding, standalone): ! xml = '' ! self._pendingText.append(xml) ! def handle_doctype(self, tag, pubid, syslit, data): ! doctype = '' % data ! else: ! doctype += '>' ! self._pendingText.append(doctype) ! def handle_comment(self, data): ! self._pendingText.append('' % data) ! def handle_proc(self, name, data): ! self._pendingText.append('' % (name, data.strip())) ! def handle_data(self, data): ! self._pendingText.append(data) ! def handle_charref(self, ref): ! self._pendingText.append('&#%s;' % ref) ! unknown_charref = handle_charref ! def handle_entityref(self, ref): ! self._pendingText.append('&%s;' % ref) ! unknown_entityref = handle_entityref ! def handle_cdata(self, data): ! if self._pendingText: ! self._collapsePendingText() ! self._pendingText.append('' % data) ! def unknown_starttag(self, tag, attributes): ! if self._pendingText: ! self._collapsePendingText() ! newNode = _ElementNode(self._currentNode, tag, attributes) ! self._currentNode.children.append(newNode) ! 
self._currentNode = newNode ! def unknown_endtag(self, tag): ! if self._pendingText: ! self._collapsePendingText() ! self._currentNode = self._currentNode.parent --- 351,540 ---- ! # For XML parsing we use xmllib in versions prior to 2.3, because we can't ! # be sure that expat will be there, or that it will be a decent version. ! # We use expat in versions 2.3 and above, because we can be sure it will ! # be there and xmllib is deprecated from 2.3. ! # The slightly odd Entrian.Coverage pragmas in this section make sure that ! # whichever branch is taken, we get code coverage for that branch and no ! # coverage failures for the other. ! if sys.hexversion >> 16 < 0x203: ! # Entrian.Coverage: Pragma Stop ! import xmllib ! class _TreeGenerator(xmllib.XMLParser): ! # Entrian.Coverage: Pragma Start ! """An XML parser that generates a lightweight DOM tree. Call `feed()` ! with XML source, then `close()`, then `getTree()` will give you the ! tree's `_RootNode`: ! >>> g = _TreeGenerator() ! >>> g.feed("Stuff. ") ! >>> g.feed("More stuff.") ! >>> g.close() ! >>> tree = g.getTree() ! >>> print tree.toText() ! Stuff. More stuff. ! """ ! def __init__(self): ! xmllib.XMLParser.__init__(self, ! translate_attribute_references=False) ! self.entitydefs = {} # This is an xmllib.XMLParser attribute. ! self._tree = _RootNode() ! self._currentNode = self._tree ! self._pendingText = [] ! def getTree(self): ! """Returns the generated tree; call `feed` then `close` first.""" ! return self._tree ! def _collapsePendingText(self): ! """Text (any content that isn't an open/close element) is built up ! in `self._pendingText` until an open/close element is seen, at ! which point it gets collapsed into a `_TextNode`.""" ! data = ''.join(self._pendingText) ! self._currentNode.children.append(_TextNode(data)) ! self._pendingText = [] ! def handle_xml(self, encoding, standalone): ! xml = '' ! self._pendingText.append(xml) ! def handle_doctype(self, tag, pubid, syslit, data): ! doctype = '' % data ! 
else: ! doctype += '>' ! self._pendingText.append(doctype) ! def handle_comment(self, data): ! self._pendingText.append('' % data) ! def handle_proc(self, name, data): ! self._pendingText.append('' % (name, data.strip())) ! def handle_data(self, data): ! self._pendingText.append(data) ! def handle_charref(self, ref): ! self._pendingText.append('&#%s;' % ref) ! unknown_charref = handle_charref ! def handle_entityref(self, ref): ! self._pendingText.append('&%s;' % ref) ! unknown_entityref = handle_entityref ! def handle_cdata(self, data): ! if self._pendingText: ! self._collapsePendingText() ! self._pendingText.append('' % data) ! def unknown_starttag(self, tag, attributes): ! if self._pendingText: ! self._collapsePendingText() ! newNode = _ElementNode(self._currentNode, tag, attributes) ! self._currentNode.children.append(newNode) ! self._currentNode = newNode ! ! def unknown_endtag(self, tag): ! if self._pendingText: ! self._collapsePendingText() ! self._currentNode = self._currentNode.parent ! ! else: ! # Entrian.Coverage: Pragma Stop ! import xml.parsers.expat ! class _TreeGenerator: ! # Entrian.Coverage: Pragma Start ! """An XML parser that generates a lightweight DOM tree. Call `feed()` ! with XML source, then `close()`, then `getTree()` will give you the ! tree's `_RootNode`: ! ! >>> g = _TreeGenerator() ! >>> g.feed("Stuff. ") ! >>> g.feed("More stuff.") ! >>> g.close() ! >>> tree = g.getTree() ! >>> print tree.toText() ! Stuff. More stuff. ! """ ! ! def __init__(self): ! self._tree = _RootNode() ! self._currentNode = self._tree ! self._pendingText = [] ! self._parser = xml.parsers.expat.ParserCreate() ! self._parser.buffer_text = True ! self._parser.DefaultHandler = self.DefaultHandler ! self._parser.StartElementHandler = self.StartElementHandler ! self._parser.EndElementHandler = self.EndElementHandler ! ! # All entities and charrefs, like • and  , are considered ! # valid - who are we to argue? Expat thinks it knows better, so we ! # fool it here. ! 
def _mungeEntities(self, data): ! return re.sub(r'&(\w+);', r':PyMeldEntity:\1:', data) ! ! def _unmungeEntities(self, data): ! return re.sub(r':PyMeldEntity:(\w+):', r'&\1;', data) ! ! def feed(self, data): ! """Call this with XML content to be parsed.""" ! data = self._mungeEntities(data) ! self._parser.Parse(data) ! ! def close(self): ! """Call this when you've passed all your XML content to `feed`.""" ! self._parser.Parse("", True) ! ! def getTree(self): ! """Returns the generated tree; call `feed` then `close` first.""" ! return self._tree ! ! def _collapsePendingText(self): ! """Text (any content that isn't an open/close element) is built up ! in `self._pendingText` until an open/close element is seen, at ! which point it gets collapsed into a `_TextNode`.""" ! ! data = ''.join(self._pendingText) ! data = self._unmungeEntities(data) ! self._currentNode.children.append(_TextNode(data)) ! self._pendingText = [] ! ! def DefaultHandler(self, data): ! """Expat handler.""" ! self._pendingText.append(str(data)) ! ! def StartElementHandler(self, tag, attributes): ! """Expat handler.""" ! if self._pendingText: ! self._collapsePendingText() ! newAttributes = {} ! for name, value in attributes.iteritems(): ! newAttributes[str(name)] = self._unmungeEntities(str(value)) ! newNode = _ElementNode(self._currentNode, str(tag), newAttributes) ! self._currentNode.children.append(newNode) ! self._currentNode = newNode ! ! def EndElementHandler(self, tag): ! """Expat handler.""" ! if self._pendingText: ! self._collapsePendingText() ! self._currentNode = self._currentNode.parent *************** *** 480,485 **** source[match.end(1):] ! # Map characters not allowed in XML content to '?' source = source.translate(badxml_map) # Parse the XML and generate the tree. --- 566,572 ---- source[match.end(1):] ! # Map characters not allowed in XML content to sensible things. 
source = source.translate(badxml_map) + source = re.sub('([\x80-\xff])', replaceHighCharacters, source) # Parse the XML and generate the tree. *************** *** 889,897 **** 'XML proc': """ ! >>> print Meld(''' ... ... - ... (...etc...) ... ]> ... Stuff''') --- 976,983 ---- 'XML proc': """ ! >>> print Meld(''' ... ... ... ]> ... Stuff''') *************** *** 900,904 **** - (...etc...) ]> Stuff --- 986,989 ---- *************** *** 913,923 **** 'entities and charrefs': """ >>> page = Meld('''• This "and that"... ! ... x''') >>> print page.s.title "Quoted" & Not - >>> page.s.title = page.s.title # Accept liberally, produce strictly. - >>> print page - • This "and that"... - x >>> page.s.title = page.s.title + " <>" >>> print page.s.title --- 998,1004 ---- 'entities and charrefs': """ >>> page = Meld('''• This "and that"... ! ... x''') >>> print page.s.title "Quoted" & Not >>> page.s.title = page.s.title + " <>" >>> print page.s.title *************** *** 1068,1076 **** 'bad XML characters': """ >>> page = Meld(''' ! ... Valentines Day Special \x96 2 bikinis for the price of one ... ''') # No exception. >>> print page ! Valentines Day Special ? 2 bikinis for the price of one """ --- 1149,1157 ---- 'bad XML characters': """ >>> page = Meld(''' ! ... Valentines Day Special \x96 2 bikinis for the price of one \x01 ... ''') # No exception. >>> print page ! Valentines Day Special – 2 bikinis for the price of one ? """ From montanaro at users.sourceforge.net Thu Jan 23 12:15:34 2003 From: montanaro at users.sourceforge.net (Skip Montanaro) Date: Thu Jan 23 15:15:38 2003 Subject: [Spambayes-checkins] spambayes hammiefilter.py,1.12,1.13 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv17108 Modified Files: hammiefilter.py Log Message: backing out the addition of sys.exit(0) to the -h branch. 
Index: hammiefilter.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/hammiefilter.py,v retrieving revision 1.12 retrieving revision 1.13 diff -C2 -d -r1.12 -r1.13 *** hammiefilter.py 23 Jan 2003 14:57:23 -0000 1.12 --- hammiefilter.py 23 Jan 2003 20:15:31 -0000 1.13 *************** *** 143,147 **** if opt in ('-h', '--help'): usage(0) - sys.exit(0) elif opt in ('-x', '--examples'): examples() --- 143,146 ---- From mhammond at users.sourceforge.net Fri Jan 24 04:43:46 2003 From: mhammond at users.sourceforge.net (Mark Hammond) Date: Fri Jan 24 07:43:50 2003 Subject: [Spambayes-checkins] spambayes/Outlook2000 addin.py,1.44,1.45 Message-ID: Update of /cvsroot/spambayes/spambayes/Outlook2000 In directory sc8-pr-cvs1:/tmp/cvs-serv26159 Modified Files: addin.py Log Message: Add comment. Index: addin.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/addin.py,v retrieving revision 1.44 retrieving revision 1.45 diff -C2 -d -r1.44 -r1.45 *** addin.py 10 Jan 2003 00:34:57 -0000 1.44 --- addin.py 24 Jan 2003 12:43:43 -0000 1.45 *************** *** 464,467 **** --- 464,469 ---- # there is no clean way to remove the buttons. Do we even care? assert item_attrs.has_key('Tag'), "Need a 'Tag' attribute!" + # Note we search *all* command bars here for the tag, only + # adding to the specified bar if not found. item = self.CommandBars.FindControl( Type = control_type, From richiehindle at users.sourceforge.net Fri Jan 24 12:05:09 2003 From: richiehindle at users.sourceforge.net (Richie Hindle) Date: Fri Jan 24 15:06:32 2003 Subject: [Spambayes-checkins] spambayes OptionConfig.py,1.4,1.5 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv13796 Modified Files: OptionConfig.py Log Message: Added True/False code for 2.2. 
Index: OptionConfig.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/OptionConfig.py,v retrieving revision 1.4 retrieving revision 1.5 diff -C2 -d -r1.4 -r1.5 *** OptionConfig.py 17 Jan 2003 20:21:02 -0000 1.4 --- OptionConfig.py 24 Jan 2003 20:05:05 -0000 1.5 *************** *** 31,34 **** --- 31,39 ---- # Blame for bugs caused by using Dibbler: Richie Hindle + try: + True, False + except NameError: + # Maintain compatibility with Python 2.2 + True, False = 1, 0 from spambayes import Dibbler, PyMeldLite From richiehindle at users.sourceforge.net Fri Jan 24 12:34:08 2003 From: richiehindle at users.sourceforge.net (Richie Hindle) Date: Fri Jan 24 15:34:13 2003 Subject: [Spambayes-checkins] spambayes/contrib SmarterHTTPServer.py,NONE,1.1 Message-ID: Update of /cvsroot/spambayes/spambayes/contrib In directory sc8-pr-cvs1:/tmp/cvs-serv31087 Added Files: SmarterHTTPServer.py Log Message: Moved SmarterHTTPServer.py, which used to be used by the old OptionConfig.py, into contrib. Nothing currently uses it. Moving it hasn't destroyed any checkin history to speak of. --- NEW FILE: SmarterHTTPServer.py --- """Smarter HTTP Server. This module builds on SimpleHTTPServer, adding 'methlet' invocation by handling urls with a file extension of .methlet. In this instance, the so-called filename actually names a method on the handler, which is invoked with a single parameter, a dictionary of the url's parsed query string. This class is intended to be subclassed, with subclasses adding the appropriate methlet methods for the application being served.
""" __version__ = "0.6" __all__ = ["SmarterHTTPRequestHandler"] import os import posixpath import BaseHTTPServer import SimpleHTTPServer import urllib import cgi import shutil import mimetypes import re try: import cStringIO as StringIO except ImportError: import StringIO class SmarterHTTPRequestHandler(SimpleHTTPServer.SimpleHTTPRequestHandler): """Smarter HTTP request handler based on SimpleHTTPRequestHandler. Adds GET with parameters, which calls a method. """ server_version = "SmarterHTTP/" + __version__ def send_head(self): """Common code for GET and HEAD commands. This sends the response code and MIME headers. Return value is either a file object (which has to be copied to the outputfile by the caller unless the command was HEAD, and must be closed by the caller under all circumstances), or None, in which case the caller has nothing further to do. """ path, parms = self.translate_path(self.path) f = None if os.path.isdir(path): if hasattr(self, 'homepage'): path = 'homepage.methlet' else: for index in "index.html", "index.htm": index = os.path.join(path, index) if os.path.exists(index): path = index break else: return self.list_directory(path) ctype = self.guess_type(path) if ctype != 'application/method': if ctype.startswith('text/'): mode = 'r' else: mode = 'rb' try: f = open(path, mode) except IOError: self.send_error(404, "File not found") return None else: self.send_response(200) self.send_header("Content-type", ctype) self.end_headers() else: head, tail = os.path.split(path) methname = tail.split('.')[0] pdict = {} if parms: pdict = cgi.parse_qs(parms, False) # ctype application/method methlets (invented here) may # send whatever headers they like. However, the server has # already sent the 200 response, so Location: headers are # not meaningful. Also, the server will always send # Content-type: text/html, so the methlets should not send # anything incompatible with text/html type. Methlets should # not invoke end_headers(). 
if hasattr(self, methname): self.send_response(200) retstr = getattr(self, methname)(pdict) f = StringIO.StringIO(retstr) self.send_header("Content-type", 'text/html') self.end_headers() else: self.send_error(404, "File not found") return None return f def translate_path(self, url): """Translate a /-separated PATH to the local filename syntax. Components that mean special things to the local file system (e.g. drive or directory names) are ignored. (XXX They should probably be diagnosed.) """ parmre = re.compile(r'^(.*)[\?](.*)$') match = parmre.search(url) if match: path = match.group(1) parms = match.group(2) else: path = url parms = None path = posixpath.normpath(urllib.unquote(path)) words = path.split('/') words = filter(None, words) path = os.getcwd() for word in words: drive, word = os.path.splitdrive(word) head, word = os.path.split(word) if word in (os.curdir, os.pardir): continue path = os.path.join(path, word) return (path, parms) def guess_type(self, path): """Guess the type of a file. Argument is a PATH (a filename). Return value is a string of the form type/subtype, usable for a MIME Content-type header. The default implementation looks the file's extension up in the table self.extensions_map, using text/plain as a default; however it would be permissible (if slow) to look inside the data to make a better guess. 
""" base, ext = posixpath.splitext(path) if self.extensions_map.has_key(ext): return self.extensions_map[ext] ext = ext.lower() if self.extensions_map.has_key(ext): return self.extensions_map[ext] else: return self.extensions_map[''] extensions_map = mimetypes.types_map.copy() extensions_map.update({ '': 'application/octet-stream', # Default '.py': 'text/plain', '.c': 'text/plain', '.h': 'text/plain', '.methlet': 'application/method', }) def test(HandlerClass = SmarterHTTPRequestHandler, ServerClass = BaseHTTPServer.HTTPServer): BaseHTTPServer.test(HandlerClass, ServerClass) if __name__ == '__main__': test() From richiehindle at users.sourceforge.net Fri Jan 24 12:34:35 2003 From: richiehindle at users.sourceforge.net (Richie Hindle) Date: Fri Jan 24 15:34:37 2003 Subject: [Spambayes-checkins] spambayes SmarterHTTPServer.py,1.2,NONE Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv31301 Removed Files: SmarterHTTPServer.py Log Message: Moved SmarterHTTPServer.py, which used to be used by the old OptionConfig.py, into contrib. Nothing currently uses it. Moving it hasn't destroyed any checkin history to speak of. --- SmarterHTTPServer.py DELETED --- From richiehindle at users.sourceforge.net Fri Jan 24 12:43:07 2003 From: richiehindle at users.sourceforge.net (Richie Hindle) Date: Fri Jan 24 15:43:11 2003 Subject: [Spambayes-checkins] spambayes MANIFEST.in,1.1,1.2 setup.py,1.13,1.14 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv2106 Modified Files: MANIFEST.in setup.py Log Message: Install the spambayes.resources package. 
Index: MANIFEST.in =================================================================== RCS file: /cvsroot/spambayes/spambayes/MANIFEST.in,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** MANIFEST.in 17 Jan 2003 06:46:56 -0000 1.1 --- MANIFEST.in 24 Jan 2003 20:43:05 -0000 1.2 *************** *** 1,2 **** --- 1,3 ---- + recursive-include spambayes.resources *.py *.html *.gif *.psp recursive-include spambayes *.py recursive-include pspam *.py *.txt *.ini *.sh *************** *** 5,8 **** recursive-include utilities *.py *.txt recursive-include testtools *.py *.txt ! include *.txt *.py *.gif --- 6,9 ---- recursive-include utilities *.py *.txt recursive-include testtools *.py *.txt ! include *.txt *.py *.gif Index: setup.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/setup.py,v retrieving revision 1.13 retrieving revision 1.14 diff -C2 -d -r1.13 -r1.14 *** setup.py 17 Jan 2003 06:45:36 -0000 1.13 --- setup.py 24 Jan 2003 20:43:05 -0000 1.14 *************** *** 1,5 **** #!/usr/bin/env python ! import sys if sys.version < '2.2': print "Error: Python version too old. You need at least Python 2.2 to use this package." --- 1,5 ---- #!/usr/bin/env python ! import sys if sys.version < '2.2': print "Error: Python version too old. You need at least Python 2.2 to use this package." *************** *** 7,11 **** sys.exit(0) ! # Install from distutils.core import setup --- 7,11 ---- sys.exit(0) ! # Install from distutils.core import setup *************** *** 29,35 **** version = __version__, description = "Spam classification system", ! author = "the spambayes project", author_email = "spambayes@python.org", ! url = "http://spambayes.sourceforge.net", scripts=['unheader.py', 'hammie.py', --- 29,35 ---- version = __version__, description = "Spam classification system", ! author = "the spambayes project", author_email = "spambayes@python.org", ! 
url = "http://spambayes.sourceforge.net", scripts=['unheader.py', 'hammie.py', *************** *** 41,45 **** 'proxytee.py', ], ! packages = [ 'spambayes', ], classifiers = [ 'Development Status :: 4 - Beta', --- 41,48 ---- 'proxytee.py', ], ! packages = [ ! 'spambayes', ! 'spambayes.resources', ! ], classifiers = [ 'Development Status :: 4 - Beta', From richiehindle at users.sourceforge.net Fri Jan 24 12:51:34 2003 From: richiehindle at users.sourceforge.net (Richie Hindle) Date: Fri Jan 24 15:51:39 2003 Subject: [Spambayes-checkins] spambayes/spambayes OptionConfig.py,NONE,1.1 Message-ID: Update of /cvsroot/spambayes/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv5439 Added Files: OptionConfig.py Log Message: Moved OptionConfig into the spambayes package - there's no longer any need to run it standalone now that it's a part of the main web UI. --- NEW FILE: OptionConfig.py --- """Options Configurator Classes: OptionsConfigurator - changes select values in Options.py Abstract: This module implements a browser based Spambayes option file configuration utility. Users may use the pages in this application to customize the options in the bayescustomize.ini file. This does not support the BAYESCUSTOMIZE environment variable. Is this even used anywhere? By default, this module forms a part of the web user interface provided by pop3proxy.py. You can also run it standalone, but only for historical reasons. To do this, just invoke OptionConfig.py The port number is the port the http server will listen on, and defaults to 8000. Your web browser should launch automatically; if it doesn't, then point it to http://localhost:8000 (or whatever port you chose). To Do: o Suggestions? """ # This module is part of the spambayes project, which is Copyright 2002 # The Python Software Foundation and is covered by the Python Software # Foundation license.
__author__ = "Tim Stone " # Blame for bugs caused by using Dibbler: Richie Hindle try: True, False except NameError: # Maintain compatibility with Python 2.2 True, False = 1, 0 from spambayes import Dibbler, PyMeldLite from spambayes.Options import options import re import os, sys import ConfigParser import copy IMAGES = ('helmet', 'config', 'status') # This control dictionary maps http request parameters and template fields # to ConfigParser sections and options. The key matches both the input # field that corresponds to a section/option, and also the HTML template # variable that is used to display the value of that section/option. parm_ini_map = \ {'hamcutoff': ('Categorization', 'ham_cutoff'), 'spamcutoff': ('Categorization', 'spam_cutoff'), 'dbname': ('pop3proxy', 'pop3proxy_persistent_storage_file'), 'headername': ('Hammie', 'hammie_header_name'), 'spamstring': ('Hammie', 'header_spam_string'), 'hamstring': ('Hammie', 'header_ham_string'), 'unsurestring': ('Hammie', 'header_unsure_string'), 'p3servers': ('pop3proxy', 'pop3proxy_servers'), 'p3ports': ('pop3proxy', 'pop3proxy_ports'), } # "Restore defaults" ignores these, because it would be pointlessly # destructive - they default to being empty, so you gain nothing by # restoring them. noRestore = ('pop3proxy_servers', 'pop3proxy_ports') # This governs the order in which the options appear on the configurator # page, and the headings and help text that are used. page_layout = \ ( ("POP3 Options", ( ("p3servers", "Servers", """The Spambayes POP3 proxy intercepts incoming email and classifies it before sending it on to your email client. You need to specify which POP3 server(s) you wish it to intercept - a POP3 server address typically looks like "pop3.myisp.net". If you use more than one server, simply separate their names with commas. You can get these server names from your existing email configuration, or from your ISP or system administrator. 
If you are using Web-based email, you can't use the Spambayes POP3 proxy (sorry!). In your email client's configuration, where you would normally put your POP3 server address, you should now put the address of the machine running Spambayes."""), ("p3ports", "Ports", """Each POP3 server that is being monitored must be assigned to a 'port' in the Spambayes POP3 proxy. This port must be different for each monitored server, and there MUST be a port for each monitored server. Again, you need to configure your email client to use this port. If there are multiple servers, you must specify the same number of ports as servers, separated by commas."""), )), ("Statistics Options", ( ("hamcutoff", "Ham Cutoff", """Spambayes gives each email message a spam probability between 0 and 1. Emails below the Ham Cutoff probability are classified as Ham. Larger values will result in more messages being classified as ham, but with less certainty that all of them actually are ham. This value should be between 0 and 1, and should be smaller than the Spam Cutoff."""), ("spamcutoff", "Spam Cutoff", """Emails with a spam probability above the Spam Cutoff are classified as Spam - just like the Ham Cutoff but at the other end of the scale. Messages that fall between the two values are classified as Unsure."""), ("dbname", "Database filename", """Spambayes builds a database of information that it gathers from incoming emails and from you, the user, to get better and better at classifying your email. This option specifies the name of the database file. If you don't give a full pathname, the name will be taken to be relative to the current working directory."""), )), ) # Tim Stone's original OptionConfig.py had these options as well, but I # (Richie) suggested that they were overkill, and Tim agreed. We can always # put them back if people want them. 
_insertedHeaderOptions = ''' ("Inserted Header Options", ( ("headername", "Header Name", """Spambayes classifies each message by inserting a new header into the message. This header can then be used by your email client (provided your client supports filtering) to move spam into a separate folder (recommended), delete it (not recommended), etc. This option specifies the name of the header that Spambayes inserts. The default value should work just fine, but you may change it to anything that you wish."""), ("spamstring", "Spam Designation", """The header that Spambayes inserts into each email has a name, (Header Name, above), and a value. If the classifier determines that this email is probably spam, it places a header named as above with a value as specified by this string. The default value should work just fine, but you may change it to anything that you wish."""), ("hamstring", "Ham Designation", """As for Spam Designation above, but for emails classified as Ham."""), ("unsurestring", "Unsure Designation", """As for Spam/Ham Designation above, but for emails which the classifer wasn't sure about (ie. the spam probability fell between the Ham and Spam Cutoffs). Emails that have this classification should always be the subject of training."""), )), ''' OK_MESSAGE = "%s. Return Home." PIMapSect = 0 PIMapOpt = 1 class OptionsConfigurator(Dibbler.HTTPPlugin): def __init__(self, proxyUI=None): Dibbler.HTTPPlugin.__init__(self) # Store the proxy UI; this won't be given when we're standalone. self.proxyUI = proxyUI # Load up the necessary resources: ui.html and the GIFs. from pop3proxy import readUIResources htmlSource, self._images = readUIResources() self.html = PyMeldLite.Meld(htmlSource) # Adjust the HTML according to whether we're running standalone or as # a part of the proxy. 
if not self.proxyUI: self.html.productName = "Spambayes Options Configurator" self.html.footerHome = "Spambayes Options Configurator" self.html.shutdownButton.value = "Shutdown Configurator" else: # "Save and Shutdown" is confusing here - it means "Save database" # but that's not clear. self.html.shutdownTableCell = " " def onConfig(self): # start with the options config file, add bayescustomize.ini to it bcini = ConfigParser.ConfigParser() # this is a pain... for sect in options._config.sections(): for opt in options._config.options(sect): try: bcini.set(sect, opt, options._config.get(sect, opt)) except ConfigParser.NoSectionError: bcini.add_section(sect) bcini.set(sect, opt, options._config.get(sect, opt)) bcini.read('bayescustomize.ini') # Start with an empty config form then add the sections. html = self.html.clone() html.mainContent = self.html.configForm.clone() html.mainContent.configFormContent = "" # Loop though the sections in the `page_layout` structure above. for sectionHeading, values in page_layout: # Start the yellow-headed box for this section. section = self.html.headedBox.clone() section.heading = sectionHeading del section.iconCell # Get a clone of the config table and a clone of each example row, # then blank out the example rows to make way for the real ones. configTable = self.html.configTable.clone() configRow1 = configTable.configRow1.clone() configRow2 = configTable.configRow2.clone() blankRow = configTable.blankRow.clone() del configTable.configRow1 del configTable.configRow2 del configTable.blankRow # Now within this section, loop though the values, adding a # labelled input control for each one, populated with the current # value. 
isFirstRow = True for name, label, unusedHelp in values: newConfigRow1 = configRow1.clone() newConfigRow2 = configRow2.clone() currentValue = bcini.get(parm_ini_map[name][PIMapSect], \ parm_ini_map[name][PIMapOpt]) # If this is the first row, insert the help text in a cell # with a `rowspan` that covers all the rows. if isFirstRow: entries = [] for unusedName, topic, help in values: entries.append("

    %s: %s

    " % (topic, help)) newConfigRow1.helpSpacer = ' ' * 10 newConfigRow1.helpCell = '\n'.join(entries) else: del newConfigRow1.helpSpacer del newConfigRow1.helpCell # Populate the rows with the details and add them to the table. newConfigRow1.label = label newConfigRow1.input.name = name newConfigRow1.input.value = currentValue newConfigRow2.currentValue = currentValue configTable += newConfigRow1 + newConfigRow2 + blankRow isFirstRow = False # Finish off the box for this section and add it to the form. section.boxContent = configTable html.configFormContent += section # Customise the page according to whether we're standalone or a proxy. if self.proxyUI: html.title = 'Home > Configure' html.pagename = '> Configure' else: html.title = 'Home' del html.homelink html.pagename = 'Home' self.writeOKHeaders('text/html') self.write(html) # Implement `onHome` for the standalone version. In the POP3 proxy, the # proxy UI's `onHome` will take precedence over this one. onHome = onConfig def onChangeopts(self, **parms): html = self.html.clone() html.mainContent = self.html.headedBox.clone() errmsg = editInput(parms) if errmsg != '': html.mainContent.heading = "Errors Detected" html.mainContent.boxContent = errmsg html.title = 'Home > Error' html.pagename = '> Error' self.writeOKHeaders('text/html') self.write(html) return updateIniFile(parms) html.mainContent.heading = "Options Changed" if self.proxyUI: html.mainContent.boxContent = OK_MESSAGE % "Options changed" self.proxyUI.reReadOptions() else: html.mainContent.boxContent = """The options changes you've made have been recorded. You will need to restart any Spambayes processes you have running, such as the pop3proxy, in order for your changes to take effect. 
When you return to the Options Configuration homepage, you may need to refresh the page to see the changes you have made.""" html.title = 'Home > Options Changed' html.pagename = '> Options Changed' self.writeOKHeaders('text/html') self.write(html) def onRestoredefaults(self, how): restoreIniDefaults() html = self.html.clone() html.mainContent = self.html.headedBox.clone() html.mainContent.heading = "Option Defaults Restored" if self.proxyUI: html.mainContent.boxContent = OK_MESSAGE % "Defaults restored" self.proxyUI.reReadOptions() else: html.mainContent.boxContent = """All options have been reverted to their default values. You will need to restart any Spambayes processes you have running, such as the pop3proxy, in order for your changes to take effect. When you return to the Options Configuration homepage, you may need to refresh the page to see the changes you have made.""" html.title = 'Home > Defaults Restored' html.pagename = '> Defaults Restored' self.writeOKHeaders('text/html') self.write(html) def onSave(self, how): # Really 'shutdown'; this is the button in the footer, not on the # form. Again, the proxy UI's `onSave` will override this one when # we're running as part of the proxy. html = self.html.clone() del html.helmet del html.homelink html.shutdownTableCell = " " html.mainContent = self.html.shutdownMessage html.title = 'Home > Shutdown' html.pagename = 'Shutdown' self.writeOKHeaders('text/html') self.write(html) self.close() sys.exit() def _writeImage(self, image): self.writeOKHeaders('image/gif') self.write(self._images[image]) # If you are easily offended, look away now... 
for imageName in IMAGES: exec "def %s(self): self._writeImage('%s')" % \ ("on%sGif" % imageName.capitalize(), imageName) def editInput(parms): errmsg = '' # edit numericity of hamcutoff and spamcutoff try: hco = parms['hamcutoff'] except KeyError: hco = options.ham_cutoff try: sco = parms['spamcutoff'] except KeyError: sco = options.spam_cutoff errmsg = '' try: hco = float(hco) except ValueError: errmsg += '
  • Ham cutoff must be a number, between 0 and 1
  • \n' try: sco = float(sco) except ValueError: errmsg += '
  • Spam cutoff must be a number, \ between 0 and 1
  • \n' # edit 0 <= hamcutoff < spamcutoff <= 1 if hco < 0 or hco > 1: errmsg += '
  • Ham cutoff must be between 0 and 1
  • \n' if sco < 0 or sco > 1: errmsg += '
  • Spam cutoff must be between 0 and 1
  • \n' if not hco < sco: errmsg += '
  • Ham cutoff must be less than Spam cutoff
  • \n' # edit for equal number of pop3servers and ports try: slist = parms['p3servers'].split(',') except KeyError: slist = options.pop3proxy_servers.split(',') try: plist = parms['p3ports'].split(',') except KeyError: plist = options.pop3proxy_ports.split(',') # edit for duplicate ports if len(slist) != len(plist): errmsg += '
  • The number of ports specified must match the \ number of servers specified
  • \n' plist.sort() for p in range(len(plist)-1): try: if plist[p] == plist[p+1]: errmsg += '
  • All port numbers must be unique
  • ' break except IndexError: pass return errmsg def updateIniFile(parms): # assumes bayescustomize.ini is in this process' working directory inipath = os.path.abspath('bayescustomize.ini') bcini = ConfigParser.ConfigParser() bcini.read(inipath) for httpParm in parm_ini_map: map = parm_ini_map[httpParm] sect = map[PIMapSect] opt = map[PIMapOpt] try: val = parms[httpParm] except KeyError: continue try: bcini.add_section(sect) except ConfigParser.DuplicateSectionError: pass bcini.set(sect, opt, val) o = open(inipath, 'wt') bcini.write(o) o.close() def restoreIniDefaults(): # assumes bayescustomize.ini is in this process' working directory inipath = os.path.abspath('bayescustomize.ini') bcini = ConfigParser.ConfigParser() bcini.read(inipath) # Only restore the settings that appear on the form. for section, option in parm_ini_map.values(): if option not in noRestore: try: bcini.remove_option(section, option) except ConfigParser.NoSectionError: pass # Already missing. o = open(inipath, 'wt') bcini.write(o) o.close() # # Running this standalone is no longer required, and doesn't work out of # the box. The code's here for reference only. # def run(port): httpServer = Dibbler.HTTPServer(port) httpServer.register(OptionsConfigurator()) Dibbler.run(launchBrowser=True) if __name__ == '__main__': if len(sys.argv) > 1: port = int(sys.argv[1]) else: port = 8000 run(port) From richiehindle at users.sourceforge.net Fri Jan 24 12:53:32 2003 From: richiehindle at users.sourceforge.net (Richie Hindle) Date: Fri Jan 24 15:53:42 2003 Subject: [Spambayes-checkins] spambayes pop3proxy.py,1.39,1.40 OptionConfig.py,1.5,NONE Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv6225 Modified Files: pop3proxy.py Removed Files: OptionConfig.py Log Message: Moved OptionConfig into the spambayes package - there's no longer any need to run it standalone now that it's a part of the main web UI. 
Index: pop3proxy.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/pop3proxy.py,v retrieving revision 1.39 retrieving revision 1.40 diff -C2 -d -r1.39 -r1.40 *** pop3proxy.py 22 Jan 2003 18:33:35 -0000 1.39 --- pop3proxy.py 24 Jan 2003 20:53:29 -0000 1.40 *************** *** 150,154 **** from spambayes.FileCorpus import FileMessageFactory, GzipFileMessageFactory from email.Iterators import typed_subpart_iterator ! from OptionConfig import OptionsConfigurator from spambayes.Options import options --- 150,154 ---- from spambayes.FileCorpus import FileMessageFactory, GzipFileMessageFactory from email.Iterators import typed_subpart_iterator ! from spambayes.OptionConfig import OptionsConfigurator from spambayes.Options import options --- OptionConfig.py DELETED --- From richiehindle at users.sourceforge.net Fri Jan 24 13:39:40 2003 From: richiehindle at users.sourceforge.net (Richie Hindle) Date: Fri Jan 24 16:39:43 2003 Subject: [Spambayes-checkins] spambayes MANIFEST.in,1.2,1.3 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv24923 Modified Files: MANIFEST.in Log Message: Include 'contrib' rather than 'hammie', and include *.el. Don't gratuitously include *.py from 'spambayes/resources' when that's already covered by the entry for 'spambayes'. Index: MANIFEST.in =================================================================== RCS file: /cvsroot/spambayes/spambayes/MANIFEST.in,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** MANIFEST.in 24 Jan 2003 20:43:05 -0000 1.2 --- MANIFEST.in 24 Jan 2003 21:39:36 -0000 1.3 *************** *** 1,6 **** ! recursive-include spambayes.resources *.py *.html *.gif *.psp recursive-include spambayes *.py recursive-include pspam *.py *.txt *.ini *.sh ! 
recursive-include hammie *.py *.sh *.txt procmailrc recursive-include Outlook2000 *.py *.txt *.ini *.html *.bmp recursive-include utilities *.py *.txt --- 1,6 ---- ! recursive-include spambayes/resources *.html *.psp *.gif recursive-include spambayes *.py recursive-include pspam *.py *.txt *.ini *.sh ! recursive-include contrib *.py *.sh *.txt *.el *rc recursive-include Outlook2000 *.py *.txt *.ini *.html *.bmp recursive-include utilities *.py *.txt From richiehindle at users.sourceforge.net Fri Jan 24 14:45:12 2003 From: richiehindle at users.sourceforge.net (Richie Hindle) Date: Fri Jan 24 17:45:16 2003 Subject: [Spambayes-checkins] spambayes pop3proxy.py,1.40,1.41 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv26242 Modified Files: pop3proxy.py Log Message: You can now click on a message subject on the Review page to view the message (thanks to Skip for the code). Darkened the stripes on the Review page slightly (though probably not enough for Skip's Powerbook 8-). Spread out the radio buttons a little, so that they line up better under their headings (without making their spacing uneven). Index: pop3proxy.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/pop3proxy.py,v retrieving revision 1.40 retrieving revision 1.41 diff -C2 -d -r1.40 -r1.41 *** pop3proxy.py 24 Jan 2003 20:53:29 -0000 1.40 --- pop3proxy.py 24 Jan 2003 22:45:09 -0000 1.41 *************** *** 84,88 **** o Possibly integrate Tim Stone's SMTP code - make it use async, make the training code update (rather than replace!) the database. - o Allow use of the UI without the POP3 proxy. o Remove any existing X-Spambayes-Classification header from incoming emails. --- 84,87 ---- *************** *** 95,100 **** Code quality: - o Make a separate Dibbler plugin for serving images, so there's no - duplication between pop3proxy and OptionConfig. o Move the UI into its own module. 
    o Cope with the email client timing out and closing the connection. --- 94,97 ---- *************** *** 557,564 **** clientSocket.getpeername()[0] == clientSocket.getsockname()[0] ! def _writePreamble(self, name, showImage=True): """Writes the HTML for the beginning of a page - time-consuming methlets use this and `_writePostamble` to write the page in ! pieces, including progress messages.""" # Take the whole palette and remove the content and the footer, --- 554,562 ---- clientSocket.getpeername()[0] == clientSocket.getsockname()[0] ! def _writePreamble(self, name, parent=None, showImage=True): """Writes the HTML for the beginning of a page - time-consuming methlets use this and `_writePostamble` to write the page in ! pieces, including progress messages. `parent` (if given) should ! be a pair: `(url, label)`, eg. `('review', 'Review')`.""" # Take the whole palette and remove the content and the footer, *************** *** 574,577 **** --- 572,578 ---- del html.homelink html.pagename = "Home" + elif parent: + html.pagename = "> <a href='%s'>%s</a> > %s" % \ + (parent[0], parent[1], name) else: html.pagename = "> " + name *************** *** 799,802 **** --- 800,804 ---- row.subject = messageInfo.subjectHeader row.subject.title = messageInfo.bodySummary + row.subject.href="view?key=%s&corpus=%s" % (key, label) row.from_ = messageInfo.fromHeader setattr(row, 'class', ['stripe_on', 'stripe_off'][stripe]) # Grr! *************** *** 920,923 **** --- 922,935 ---- self.write(box) + self._writePostamble() + + def onView(self, key, corpus): + """View a message - linked from the Review page.""" + self._writePreamble("View message", parent=('review', 'Review')) + message = state.unknownCorpus.get(key) + if message: + self.write("
    <pre>%s</pre>
    " % cgi.escape(message.getSubstance())) + else: + self.write("

    <p>Can't find message %r. Maybe it expired.</p>

    " % key) self._writePostamble() From richiehindle at users.sourceforge.net Fri Jan 24 14:45:52 2003 From: richiehindle at users.sourceforge.net (Richie Hindle) Date: Fri Jan 24 17:45:59 2003 Subject: [Spambayes-checkins] spambayes/spambayes/resources ui.html,1.1,1.2 ui_html.py,1.1,1.2 Message-ID: Update of /cvsroot/spambayes/spambayes/spambayes/resources In directory sc8-pr-cvs1:/tmp/cvs-serv26688 Modified Files: ui.html ui_html.py Log Message: You can now click on a message subject on the Review page to view the message (thanks to Skip for the code). Darkened the stripes on the Review page slightly (though probably not enough for Skip's Powerbook 8-). Spread out the radio buttons a little, so that they line up better under their headings (without making their spacing uneven). Index: ui.html =================================================================== RCS file: /cvsroot/spambayes/spambayes/spambayes/resources/ui.html,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** ui.html 17 Jan 2003 20:21:26 -0000 1.1 --- ui.html 24 Jan 2003 22:45:49 -0000 1.2 *************** *** 12,15 **** --- 12,16 ---- a:hover { color: #6040ff } .content a { font-weight: bold } + .viewlink a { font-weight: normal } .banner { background: #c0e0ff; padding=5; padding-left: 15; *************** *** 27,31 **** .reviewheaders a:link { color: #000000 } .reviewheaders a:visited { color: #000000 } ! .stripe_on td { background: #f4f4f4 } .uiHtmlHeader { font-size: 120%; color: #800000; font-weight: bold } --- 28,35 ---- .reviewheaders a:link { color: #000000 } .reviewheaders a:visited { color: #000000 } ! .viewlink a:link { color: #000000 } ! .viewlink a:visited { color: #000000 } ! ! .stripe_on td { background: #ececec } .uiHtmlHeader { font-size: 120%; color: #800000; font-weight: bold } *************** *** 205,217 **** ! Re: Spambayes and PyMeld rock! 8-) Richie Hindle <richie@entrian.com> !       --- 209,224 ---- ! ! ! Re: Spambayes and PyMeld rock! 8-) ! 
Richie Hindle <richie@entrian.com> !                  Index: ui_html.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/spambayes/resources/ui_html.py,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** ui_html.py 17 Jan 2003 20:21:26 -0000 1.1 --- ui_html.py 24 Jan 2003 22:45:49 -0000 1.2 *************** *** 5,168 **** import zlib ! data = zlib.decompress('x\xda\xc5[{o\xe38\x92\xff\x7f\x80\xfe\x0e\x1c-\xba\x95`\x13\xdbyt\xdf\xacc\ ! \x0b\xdb\x9dd6\x83\xebG\xae\xe3\xb9\xc3`0h\xd0\x12ms#\x89:\x91\x8a\xe3m\xecw\ ! \xbf*>$\xea\x91\x8c\xfb\xe6\x16\x97\x1e$z\xb0\x8a\xc5b=~U\xd4\xcc\xbe\xbf\ ! \xfat\xb9\xf8\xe5\xf6\x9a\xdc,>\xbc\'\xb7?\xbf{\xff\xd3%\t\x8e\xc7\xe3\xff:\ ! \xbb\x1c\x8f\xaf\x16W\xe6\xc5\xf9hrB\x16%\xcd%W\\\xe44\x1d\x8f\xaf?\x06\xd1\ ! \x8b\xeff\x1b\x95\xa5\xfa/\xa3\t\xfeU\\\xa5\x8c\xf0d\x1e\xe8\xab \xba+h\xb6\ ! \xa4;&\xc9\xcf\x92\x95\xe4\xa7\\\xb1rEc6\x1b\xeb\x01H#\xd5N_,E\xb2#_\xc9J\ ! \xe4jJ\xfe2yIh\xc9izD\xe4\x96KyD6,}`\x8a\xc7\xf4\x82d\xb4\\\xf3|J&\xe4\x9f/\ ! \xbeSt\tS\xeeA\x87\x83W\xa2\xcc`l\x8b\xc1\x8b\xef\xe84\xe5\xf9=<\x8fE*\xca)\ ! \xf9\xd3\x04~b\xfd\x92N\x1f8\xac\x9a%O\xbc\xdd\x88\x07XV\xf3\xee\xcd\xe4|\ ! \xb2Z\xe1\xbbQ\x0c\x02\xb1\\\x11j\x85;\xde2\xbe\xde\x80\x8cK\x91&f\xe2\xd1\ ! \x92\xe6\xb9\xa6_\xd2\xf8~]\x8a*O\x80I\x9d\xbc\xbc\xa8y\xff\xa0y_\x0c\xafs6vn;\x1b[\xe7\x87K\xbdj\ ! \xf4~\xbc\x08\xf43\xd4\xd4,\xe1\x0f$N\xa9\x94\xf3\xd0X|\x18\xd97\xb2\xa0\xb9\ ! \xa6\x007\xcd\x98\n\xa2\x19\xcf\xd6D\x96\xf1<4OFk\xbe\n\tM\xf9:\x9f\x87t)3\ ! \x9e$)\x0b\xc7\xd1\xab|)\x8b\x0b\x90\x038\xb4\xb8\xd9\x89\x8c\xda\xdcD\xed\ ! \xc9\x8aR$U\xac>\xd2\xac\x15\xa2\xe0\xf1\xe3\xcer\x9czt\xd4H(2\x86{\x15\x90M\ ! \xc9V0\x01\xdc\x87\xd1\r\xfc\x9e\x8d\xe9\xf04\xe0\x13\xb9\x9e\xe3\xd5Z]\x90\ ! \x8a\x8f0^\xb6En\xdf\x80\xa2\x8c&Qc\xc8"\xa3<\xbf4n\x17\xd8\x95\x05\xd6\r\ ! \x8d~g\x9b\xf3\x08\xa2\xaaY\x10\x18\x1el\xc7y\xa3\xf8"Zl\xb8$+\x9e\xb2#2\x8b\ ! 
E\xc2\xa2Z\n}wD\x12\xb6\xe29,^m\x18I\x85\xb8?\xa6yr\xbcb,5,\xc4J\xbf\xa90zs\ ! \x17\xbd\xdd\xd3Fuw\xac\x84@8"d\x01\x8f\x1f \x02\x8bJ\x1a\x06\x05g1\x0c\x00\ ! \n\x9dK\xcct\t\x84\xe5\x92A\xa4f\x84=\xaa\x92\xc6h\xd601\x04\x96\x9c\x17i\ ! \x95R\xfd@\x19\x16e\x95+\x9eAd\x11$\xd9\x81B!\x98\xa7\xe9\x8e\x98]dZ\x12\xcd\ ! [m\xa8j\xcbe\xe8\x8dpD\xe2\x1fI\xaa\x82\x1c\xc3(\xab\x16\x02s\x83o\xc1\x7f\ ! \xe0\xac)S\x8a9Q\rm,\xb2B\xe4\xa0n\t\x8b3\x1a\xbc\xdd}`i\xf2\x1e\\\xd1*\x91p\ ! \xa3\xbe\x0c\xe4\xc1\x00\x88b\x80p\x0f\x99\x13Q\xb0\xfc \xb0\xb1 8"A\xa9\x82CH8498\xdc\ ! \x93S"b`\xd4\x080\xc28\x05\xe1\x8a\x95\x07\xc8t_6E\ta\x05\x99\x8d\x9a\xcdu\ ! \xfb\x1e\xbdJ\xd5\xc5\xf0\xee\xd7;\xfc\x9c\x1d\x8c1\xee\xce\xc6<\xda\x7fI\ ! \x9e\x14#\xcd\r\xd6\x18\\nh\xbefI\xf0\xff\xb8$\'BwIc0\xb3V\xfah<\xf5Nh\xbb\ ! \xb6\x8e\x8d\x1e\xa5\xa3D\xc5!\x85\xd7\x06\xdf\t\xec\x98\xb2$\x88\xced\x1e\ ! \xda\xe8\x8b\xf1\x05\tW\x14\x80-]\n\x90\xbc\xa0Ra\xecB\xcc\x91\xaf1\xf8\x00\ ! \xf8\x03\xbf?"\xa24\x13\xe0[\xcd\xbf\x89\x9f\x86\xdb\xaa\x14\x19\x91qI\x15D3\ ! n\xc4\x82\xc8\xfa6O\xfa\x19\x08\xa3\x08&\xbbR\xa4)\xc8\x00n\xc6 \xec/w.\xa6\ ! \xc2k`m\xe6\xd1\xb1\xfc\x98\xc4\xa8&\x9c\x1c\x99i\xef\x93\x1b\xc6\xd4\x11QX\ ! \xbe\x80\xaf\xe3;X\xae\xb0\xd9\r\xa5\x86\x10\x9c\xaf+\\8\xc4D\xed\x88\x10\ ! \x0es\xb6m\x82mGI\xc7:F\x9b\x1c(Y-\rj\x023\x1a\xe4\x9a\x1daT\xee\xba\xa1\xf3\ ! \x1dK!V\xe2\x10\x13\xe3{i\xb5Q\x16\x8c\x10 \x07\x06N\x1c\xc8\xcb\xc18.\xeb\ ! \x19\x10\x08\x94\x18\xecf\x85\x83\t>\xce\x0b"\r\x87\x92w\x18[=\x99<\xf8e_\ ! \x07\xce\x98\x0c \xb7`\xca\x07\xe9!\x89Y\x9a\x02e\x8c\xb5I\xd8\x04\xb0e\xdc<\x80\x08\xea\x9b\xa8\xbbw\xe54\xdc\xb7\rl\x10\xbc\ ! \x1b\xce\xc1S\xf5\xc3\xe3\xee\n\xea]\x9e\xb6F\xdc~\xba=3U\x04\xa2\xa7\x1c#\t\ ! \xe8h\xb6\x8c:\xa4\xb7\x90\xba\xe5\x9d\x0e\x9eAtrr2\xb1\xfa\x98\x8d\x97\xd1\ ! \x91\xbf\xbd0V\x073\xd1f\xa2\x91lYs\xb0I\x04\xea\xf3\xccc4\xf2\r\xd60#48\x04r\xd4\x9b*\xa7\x1d\xb7\xf1\xa7\xa6\x9a\xe8\x0e\n~\x1c\ ! 
\x19D\xbe\xe8\xa3\xce\x04}\xce&hKC\xfd\xdc4J(\x9a\xee7\xcbu\x86\xdb`l\x87\ ! \xaf8&\xab=\'\xc9\xab\x0c+\x85\x0e\x7f\x02W\xd9\xd1\xf3d7}\xaa\xcd\xef\x12\ ! \xfd\x9c\xcb\xaad\x1d\xb2J?\xec\xaei\x81\xcb\'\xcc\xac\x0cR\x17\x96L\x9d\x95\ ! \xa0\xe4\xd3\xce\x1c(y\x9b\x7f\x9b\xe6\xa6O\xb2\xe9Rt$\xe9\x14\xa8\xb5\xb3`\ ! \xd0\xb3\xe8\xd6\xf6e\xde\xb0\xc7\xd0\x86J\xffw\x87\xc6\x8f\xde\xfe4u\xd9\ ! \x0f\xf6\xb2\xe2\xeb\xe7\x02\xb2\xeb\x06`O\'\xf4e\xedO\xde\xff\xfd\x0b\x00\ ! \xff\x18\xbb\x04z\x1aP=\x96\x02eS1v\x96\xbf\x0fK\xb9\x93\x8ae\x10\xb0-\xee\ ! \xe8X\x01\xb5\xfd\x023c\x18]\xda\x99M\x00E\xc0\x85\xfd\x83\x91\xaf\'/\x05\ ! \xd6q\xf4w\xe3\xa4\xe9\x16-\x004\xb6a\x87\xde\xe7\xe6\xa5\x8bW\x8bV\xfdn\x82\ ! \x96T\xa2\xac+P\xa8&M[\x0fR\x95\x02wb\xd2F\x14\xa7Bm\x97z`\xed{\x80\xfe\xa8\ ! \x04\x0f\xc4.\xd4F\xc8\x86\x85\xa1\xacU\xd4h\xc5\x08\x16F\x9f\xf5\xdfz<\xaaD\ ! \xeb\xa6\xce,{\xe2\x1c\xbbPT\xda0\xd21\x03\xb4V\xeb4\x80m\x12H\x87\x1a\x9aA\ ! \xd5k\xfc\xcd\xba\xdf\x11\xd9n8`\xd5\x9d]5\xe4e\xe2\xe0\xe3\x90\x06 \xf5^nX|\ ! \xaf\x1fC\x8d_\n(\x07\xb0\xc6\\VJ\x81^V\xc2b9\x86\xf5\xbc\x9e\xe3\x08\xc7b\ ! \xe3\x8a\x03Y\xb8@\xa6!T\xe6\x00\x16\x81Yx\xc5V\xac\x0c\xa1l\xa5\x0f\xa6Mc\ ! \xc8\xad\xa6t\xce>\xb2\xe5\xbf\x93\x1c\xa6\xc1\xb2V\xcb\xa2\x996\xd8\xc0\x10\ ! _q\x19\xd32!c\xa2\xb9\xc3_\x08\r\xf0\x1b\r\x82\xb8\x86#\xf0\x8c\xf5J\x1a\xb4\ ! k\xa8\xcdJ\xa4\x81\x11\x14mC\xa3\x1c\x07B\xd6\xa2\r|u\xd3\x9d\xea!\xf5~\x83\ ! \xf8` \x00\xc5\xfev\xbd\xf0\xe2\x80-\xf5u\xfd\x13nx\x92\x00p!\xd8B\x9b\x87\ ! \xa0E\x01j@\xf8f/Me\x14N\xfc\x18\xf0\x0c\x83\x1c\x8c\xdf\xd0\x9b\xabAr\x83,,\ ! \x90\x80w-\xe0\xe0\xee\xfb\xd8\xd7\xc3\x12-\td\xb5\xcc\xb8r\x12\xac\x85\'\ ! \xff;\xad\xc4p\xa8\xb3\x8c?V\xba[\xd4\x97\xa8\xa0$\xa3\x80l\x13.Q>`q\xe2u@[\ ! \x00\xc6\x84\xd9}\xa4@-\xec\'\xc4G\x18\xf9\xad\x02<\x15,\xbb\xd8\xb9\x0f\x16\ ! \xb1\x8a.\xb3\x16rl\xbc8.y\xe1\x96\x85\xf5\xf1\xf8\xef\xf4\x81\x9a\xa7a4\x1e\ ! \xcf\xbe\xff\xf5\xf2\xea\xed\xe2\xed\xaf\xb6\xd8\xacrc\x99"7\xf1\xe1\x00)\ ! 
\xf5\xd1\x10\x94\x9f\x0ba{\x14_\x1by\xf8\x8a\x1c@\xe1^e\x00\x94G(\x87$\xaf^\ ! \x91\xf6\x93Q\xca\xf25\xd4e\xd1\x9c\x9c\x1e6\xa4_\xdbZ\xd4V?\xef\x90\xfez\ ! \xf2\xdbEo\x189\xe00prA8\x99i2;\x01<\xf8\xf3\x9f\x0f\xdb\xc3\xbf\xf6\xb7J\ ! \x02\x1a\xd7\xadl`\x82\xd4\xbf\xf2\xdfF\xb8\xd3#\xfd\xe2 \x9c\x86\x87\x03\ ! \xe7\x17\xb8\xd0\x9a\xd2\xadh>\'g\xb8\xde\xfa\x05\xc8\x8b\x0fQk\xf0|\xd8L\ ! \xdc\x9c\xb6[2\xaf\xb5;R\xe2\xbd\xd8\xb2\xf2\x12R\xc2\xc1\xe1a\x9f\xfa\xeb\ ! \xf3\x0cu\xf0\x81h\x06\xf3\x97\x15\x1bX\xc3?\xdb\x8f\xbc[{i\xff\x8c\xc7\xbf\ ! \xfd\xd6\x14\x1e\xdaX\xf6\x8cN\xb7\x9f\xee\x16a\xd4\xad:BW\x07\x9b"\xcf?\xfd\ ! \xda7`@\xb4\xf0\xb2\xd1]\xb5t\xf9\xabUV\x02\x18\x8b>\xb8$\xec\x01\\*\t\x1e\ ! \xdbN5^\xeb\x15\xa3H\xf5c)\xb2\'^;\xb1[\xa7K\x10\x1c\xc4\xb6\xa4\x05:6\xd2\ ! \x0fc\x98\xa0\xf1\xb6i\xe3S!\xca\x12\x1eA\xa62Y\x05\xec-\x88\xec\xb5\xce\xe3\ ! \xe3\xff\x1d7\x9d\xf74/\xbc\xfa\x03\x9c \xb7i>\xf0\xf7\x0fp\xc1\xdc\xa8\xd9\ ! \xe0E\xfb\xc8\xc7X\xd62\x1al\x11\xb4\x1b\x1a\xde\xae\x7f\x16\xdb\xa6Q\xe0N\ ! \xef\xc2\x81\xc6\x02\xc4ol\xca\x07\x00\x92\xa6\x1ef\xc3\xfe\xbdi\xbf\x92R\ ! \xc4\xf7\xdf\x13\xf2\xc3\xf1\xe1\x13\xbd\tl\xbb}\x01\x06\x80d8#7,/\xed\ ! \xd8\x86\x9d\xb3\xc5\xbaZ=\xc0\xe2\xddn\xb9\xb6\xae5\x9eiVXy#\x9e\\\n<5@\xc9\ ! \xf4YF\x9e8\xec\xa0\x9b\x89\xc7^\x8f\x82\xe7~\x85\xda\xb4\x8bI\xc92\xf1\xa0\ ! \x8f\xe0\xebs\xe0\xc3\xbdk\xf0-\x94O\xffQ\xb1r7\xec\x06\xf5\xeb\xc6\x13\xf0\ ! \xd1\x7f\xe3\xa3\xb0\xafB|\x178M\xc1uc\x8f^\x1d\x10\x9a\xd3\xb3\xf0l\xd2\xdd\ ! \x85\xe1\xa0\x00\xa0\x0c\xcc\xd5\x1eg\xb9\xd3\xebdH\xe5{\xad\xf6NQ%\x87\xfb\ ! \r\xf5k\x87\xed>V\xd9\x12<\x16\nj\xb4\xd9\xba\xe3\xd1i\xcc\xe1\xbbXT\xf8\x99\ ! \xc7\xc9\xe9\xd9\x13\xed\xcf\x86\xd5\xe6iN\x9b\x9a\xd1\xf9\xeb7O0\xba-\xc5\ ! \x92.9\xa0\xfe\x9d)\xe9\x9b@\x12\x9bS]\xd3\xb8\xb1z\xc2\xb3\x0b\x14\xd0\x85\ ! \x8f\xae\xe0`R\xcb \x9a\x8c\xfe\xed\x87\xbf<1\xe1\xb7\x84Tg\xbc\x9f\x99\x848\ ! \xf8\x84\x9a;\x83\xbc\xd6\x8e\xeei\x14\xcd\x02Q=\xae\x15\xef\x9e\xa1\xb0Z\ ! 
\xcd\x08\x98\x8b\xa8\xcf\xbcb\xd2;\x1c\xc3\x1f\xed\x83\xa6$\x00\xfd\xeb\x11d\ ! -\xec!\xca\x11I\xf9=3\n[\xa2\x1dC\x8c \xf5Q\xcf\xa1\xffqQs\x96\xa1y,\xfc\xc6\ ! T3\xa8lF\xe8,\xda\xcf\xf8\x0e\xe9\xe1\x98\xc0\xb5f\xbb\xdf\xdcA@\xbd\xb6_n\ ! \xe0>\xf6K\xf7\x16\xab!\x05\xf5\x08\xbc\x14:\x94F\xfd\xeeq\x01\x00VUe\xdet\ ! \xff\xda\xdfP\xe1\xe9\xb1\xdbG\x08[\xfapv:\xbc\x1ff\xd0[3&0\x01\xd1\x05N\x1b\ ! \x12\xdd^\x1c\x0e\x7fp\xb5\x8f\xe1\xe9N\xed\x8f\xc0\xed\t\x9b\xab\xdf\xd7\ ! \xe6\xf6\xe3\xa7\xcf\x1f\\T\x0b\xf4a4\x13\x05\x98\xa3\xcb\xec\x01f\xf6\xa0\ ! \xf7\x81\x16v:\xb1\xcd\x06\xf9O7\x1au\xeb\r\x89\xa1|,\xd1\xfdH\\I%2\xfe\x0fc\ ! p\x85;M\xa1\xca\x9d\x907\x1f\x1am\xe9\x0e\r\xce\xb4-[-^\x88\xee\x92I\xd3\xf1\ ! \xd6\x9d\xc7\xd1\xef-\xec\xb2\xf9\xd6L\x0fY\xbc}\xf7\xfe\xda\x1b3|\xf26p\xd2\ ! \xb6\xf8\xecQ\x81\t\x9f\xb4J\xdd\xc5\x95\xc5z\x81\xb6\xd4\xc0\xafF;\xe6V\x0b\ ! \x99\xd2%K\x83\xe8=\xfeq\x1f\xed\xf5Oj\x17W\xedy\xba\xec~\xfax\xfb\xf3\xa2\ ! \xf5\xc1\x85\xf9\x0c\xe3\xf4M\xa0g\xd1I\xc4\xa6\x1fSE\x01\x0e\xcb\xbe@@\xfc\ ! \x92\xd1"h\xe3\xca\xeed\xbaY\xb0ai\x01\xbb\x10c\x13\x17\x01\x8e>\xf7>\x99L\ ! \xc2\xe8\xf9c\x85g\xd8\x99\x83\xf0\x16\xb3\xa7\x0eV\x1cb\xb8\x012\x82\x0bl\ ! \x1c\xc3e\xf4\x01\xf9\xe1\xf2\xf3S;w\xda\xdd\xb9f\xaf\x1c`\xd7\x9b\x18F\x97U\ ! Y\xe2\xf7\xc2\xff\x89Iw:\xb0\x1eo\x97\xed\x9a\x1b\x134\xb4\x9a\x14\xfc[\xe7\ ! \xed\xc3:\x99<\'\xa8W\xb4\xc00O\x97\xfe\xd8\xb16\xe4~L0\x1fY\xe8z\xa3\x96\ ! \xd5\x07\x12\x16\xd09\x83\xd8`\to\xbf\xd0\xb9\xa3\x0f,h\xf2[\xcd\xc4=\xc0\ ! \xd0\xd0\xdc\xb6\x02E\xc9\xf4\xc9\x0bT\x9fT\'\xaf^\xb4\xf8\x03b}6\xbc\xb1\ ! \xd1o\x98\x0f\x88\xd8\x12\xf0\x9b\xcf\xe77\x95J\xc46\xb7\x8d\xaa\x81\xc3\xa7\ ! \xce\x88\x00\x1bLw\xf6\x99\x06\x06\xe4oB$\xcb]\xf3\xad\xa1\x9d\x12\xbf.v\x9f\ ! \xc4\xc2u\x8d WB(\xac\xdb\x1d|\x94\xa0\xfa~\xd3\xce\xa6V{\x88\x08N\xf2r\xa8\ ! \x1b7\xf3j\xd5d\xf8+\xe5\xe6#\xe0\xd0\xcc\x8c\t+l\xa5\xb0&\xcc\xde\x9a\x8f\ ! \x88\xa9\x7f\x90\xef\x9dk\x03\x92\x96\n2p\x10}\x109lJL\xce&\xe4\xe4|:9\x9f\ ! 
\x9e\x9d\x92\xd3\xc9\xe4\xd4\xea~\xd4\x9a\xdb\xce\xa5T1\x1d\x8f\xb7\xdb\xedH\ ! \xba\tG\xa2\\\x8f=\x01\xf0\x1e\xa7\xef\x9c\xbb\xb6<\xb3\xb3\xcc\xd6&\xe9\x88\ ! \x8e\x01\xa6\xe5\xe6CX\x1a\xf5\xe1\xa8\xecA\x82\x85\xe9`\x7f\xfd\x9e\x84\x85\ ! \xde\xe8&\xe4\x15h\xe0\x828b,,\x8d\xb4\x16L4 \xc2\x03\xe2`)"\xd9\x99/\xce\ ! \xcd\xffv\xf2?\x98y\rX' ) ### end --- 5,168 ---- import zlib ! data = zlib.decompress('x\xda\xc5[{s\xe36\x92\xff?U\xf9\x0e\x08\xb7fh\xd7\xda\x92l\xcf\xccfe\x89\ ! \xb53\xb6\xb3N\xdd<|\xb6\xb2W\xa9Tj\n"!\x89k\x92\xe0\x91\xa0e\xed\xd4~\xf7\ ! \xebn\x00$\xf8\xb0\xa3\xd9\xd4\xeeyR6\x89G\xa3\xd1\xe8\xc7\xaf\x1b\xcc\xec\ ! \xbb\xcbO\x17\x8b\x9fo\xae\xd8\xf5\xe2\xc3{v\xf3\xd3\xbb\xf7?^0\xefx<\xfe\ ! \x9f\xb3\x8b\xf1\xf8rq\xa9;^\x8d&\'lQ\xf0\xac\x8cU,3\x9e\x8c\xc7W\x1f\xbd\ ! \xe0\xdbof\x1b\x95&\xf4W\xf0\x08\xff\xaaX%\x82\xc5\xd1\xdc\xa3\'/\xb8\xcby\ ! \xba\xe4;Q\xb2\x9fJQ\xb0\x1f3%\x8a\x15\x0f\xc5lL\x03pN\xa9v\xf4\xb0\x94\xd1\ ! \x8e}a+\x99\xa9)\xfb\xf3\xe4\x05\xe3E\xcc\x93#Vn\xe3\xb2}\x88a\xd7\ ! "z\xa2w#\x1f`[M\xdf\x9b\xc9\xab\xc9j\x85}\xa3\x10\x18\x12\x99b\xdc0w\xbc\x15\ ! \xf1z\x03<.e\x12\xd1\x88\x87Xli\xe5\xde\x90\x0c\xf8\xe4\x89\xe6n\xb4\xe4YF\ ! \x8b,yx\xbf.d\x95E\xb0R8\x11\xb0\xd29\xcby\x14\xc5\xd9z\xfe\xba~\xff\xf6\x1bV\xff,e\x11\x89\xe2X\xc9\x1c\xba\xf2GV\xca$\x8e\xd82\ ! \x01\x9aC\xe3\x96R)\x99\xf6\x86\x12\xdfx\xda\xc4\x10\xf1\\\xc6\xff\x100\xee\ ! \xec\xecEk\xdb\x8d\x94O^SG*\xca\x92\xaf\x85>/\x85\x02\xed\xf0+\x1e\x9b-\x14Z\ ! \x0e\xd0FsK\x11\xa2\xe6\xd9\xb3\xd6<\xba\xcc\xfd\xe1\xfb\t\xfe;g\xdb8R\x1b\ ! \xd0\x83\xd7/\xdc\x89\xc81\x90\xed\xc8p\xb5Zq19\x1f\xe0\xc3\x15\xc8o\n\xc6\ ! \xae=4i\xf8\xe0\rWF\xe3\xcd\xf2@r\x92?\x9a3/\x04*\x87\x16t\xc9\x06\x15tB*\ ! \xd8\x1b9\xac\xaffp\xa3q\xcfPt\x06=G\x0c\xf7\xa1\x8a8\x17\x9fe\xa6\xcf\xb3\ ! \xa5\x9f"\xc4\x7ff`\x15_\x83\xa7\xb8\x1e\xd2\x9b\xd3\xc9\x8b\xf3\x9a\xfa\xf7\ ! D\xfd|Xl\xb3\xb1u\x15\xb3\xb1q8\xf0HBD\x8f\x83\x0f\x1e\xb5\xa1\xe0gQ\xfc\xc0\ ! 
\xc2\x84\x97\xe5\xdc\xd7\x06\xe4\x07\xa6\xa7\xccyF3\xc05\xa4By\xc1,N\xd7\xac\ ! ,\xc2\xb9\xaf[F\xebx\xe53\x9e\xc4\xebl\xee\xf3e\x99\xc6Q\x94\x08\x7f\x1c\xbc\ ! \xcc\x96e~\x0e|\x00\x85\x165\xb3\x90>\x05\xbbP{\xb1\xbc\x90Q\x15\xaa\x8f\ ! \xbc\x0c\x98XFk\xbc\\\xabsV\xc5#\xf4\xd1m\x96\xdb/ (-I\x94\x18\x92Hy\x9c]h+\ ! \xf6\xcc\xce\x02O_\'\x1e@z\xe2\xc2?m~\x80.-1_\xe3\xccRqU\x95\x1ag\x9a\x89\xaf\ ! &>\x841D\xbc\xf4l\xc1\xa6\x83\xf1\x90O\xe7\xd5l\x1av\xd0\x01k-\xbe\x9c\x19\ ! \x04\xcb0H2G\xf0`\x05\x1fd\xa9\x861Wi\x91\x14hOC\x07\x8c\xbf\x84\xe5@z`Ih\nV\ ! \'\r\x14k\xe6\x805\xb7G;\xaa\xabS\xaaG\x18\xa8\x83w6\x88\x15\xd1\x05\xf0\x07\ ! \t\xa1.\xaa\xf2\x04\xe2#n\xccb\x91\x91uA\xf6\x00\xe1\x17\xea\x8a\xd9\xb3A\ ! \xeb\xcbb\xdc\x05\xbe{h\xae>\xa1\x96\xdaj\xddD\x99\xeb\xce\x05\xbe{&/\x9c{\ ! \xe0B]\x15\xb5\xef6?\x87\xf7\xb6\x82\r\xc2wM\xd9{*\x83x\xdc]B\x02\x1d\'\xad\ ! \x117\x9fn\xcet\x1e\x81\xf8)CO\x022\x9a-\x83\xce\xd4\x1b\x88\xdd\xe5\x1d9O/8\ ! 99\x99\x18y\xcc\xc6\xcb\xe0\xc8=^\x18K\xceL\xb6\x89\x10\x96-j\n&\x8a@\xc2\ ! \x9f:\x84F\xae\xc2\xea\xd6\xe6\x00\xf0\xe7-\xe8\xe6\x83\xd0L\x83A E:\xd4r\ ! \xda1\x1bwiN\x93\xee\x04\x80S\x18\xe9\x05.\xeb\xa3\xce\x02}\xca\xdai\x97z\ ! \xf6s\xcb(\xa9x\xb2\xdf*W)\x1e\x83\xd6\x9dx\x15c\xb0\xdas\x91\xacJ1W\xe8\xd0\ ! g\xf0\x94\x1e=?\xed\xba?k\xf3\x9b\x93~\xca\xca\xaa\x10\x9di\x155v\xf7\xb4\ ! \xc0\xed3\xa1w\x06\xa1\x0b\x93\xa6\xceN\x90\xf3ig\r\xe4\xbcM\xbf=\xe7\xba?e\ ! \xd3\x9d\xd1\xe1\xa4\x93\xa2\xd6\xc6\x82N\xcf\xe0[S\xe8y#\x1e}\xe3*\xdd\xdf\ ! \x9d9\xae\xf7v\x97\xa9\x13\x7f\xd0\x97U\xbc~\xce!\xdbz\x00\x16\x89|\x97\xd7\ ! \xfe\xe2\xfd\xdf?\x03\xf4\x0f\xb1N@\xcb\x80\xe81\x19(\x9a\x9c\xb1\xb3\xfd}H\ ! \x96\xbbR\x89\x14\x1c\xb6\xc1\x1d\x1d-\xe0\xa6b\xa0W\xf4\x83\x0b\xb3\xb2v\ ! \xa0\x08\xb8\xb0\x820r\xe5\xe4\x84\xc0\xda\x8f\xfe\xa6\x9f\xd4\xe5\xa7\x05\ ! \xa0\xc66\xec\xa0sn:\xad\xbfZ\xb42x\xed\xb4J%\x8b:\x07\x85|R\xd7\t!T)0\'Q\ ! \x1a\x8fbEHzI\x03k\xdb\x03\xf4\xc7K\xb0@\xacCmd\xd9\x90\xd03k\x115R\xd1\x8c\ ! 
\xf9\xc1-\xfd\xad\xc7\xa3HH6ud\xd9\x13\xe7\x98\x8d\xa2\xd0\x86\x91\x8e\x1e@R\ ! \xad\xc3\x00\x16J \x1c\x124\x83\xbcW\xdb\x9b1\xbf#\xb6\xdd\xc4\x80Uwf\xd7\ ! \x10\x97\x99\x85\x8fC\x12\x80\xd0{\xb1\x11\xe1=5C\x96_H\xc8\x070\xcb\\VJ\x81\ ! \\V\xd2`9\x81\x19=\xadq\x84c\xb1t\x15\xc34\x7f\x81D}\xc8\xcd\x01,\x021\xffR\ ! \xacD\xe1C\xe2\xca\x1ft\xa1FO7\x92\xa2\x98}d\n\x00\x96sX\x06\x13[\xe2\x85\ ! \x886\xd8@O\xbe\x8c\xcb\x90\x17\x11\x1b3\xa2\x0e\x7f\xc15\xc0oT\x08f+\x98@3\ ! \xa4\x9d4hW\xcf\xd6;)5\x8c\xe0\xa8\x1b\x84r,\x08Y\xcb6\xf0\xa5R?\xa7!\xf5y\ ! \x03\xfb\xa0 \x00\xc5\xfez\xb5p\xfc\x80I\xf6)\x01\xf27q\x14\x01paXD\x9b\xfb \ ! E\tb@\xf8f\x1euj\xe4O\\\x1f\xf0\x0c\x81\x0c\x94_\xcf\xd7O\x83\xd35\xb20@\x02\ ! \xfaZ\xc0\xc1\xbe\xf7\xb1\xaf\x83%Z\x1c\x94\xd52\x8d\x95\xe5`-\x1d\xfe\xdf\ ! \x91\x10\xfd\xa1R5\xfe\x18\xeenP^\xb2\x82\x94\x8c\x03\xb2\x8d\xe2\x12\xf9\ ! \x03\x12\'N\r\xb4\x05`\xb4\x9b\xdd\x87\x0b\x94\xc2~L|\x84\x91_\xcb\xc0S\xce\ ! \xb2\x8b\x9d\xfb`\x11\xd3\xe8"m!\xc7\xc6\x8a\xc3"\xce\xed\xb60A\x1e\xff\x9d?\ ! p\xdd\xea\x07\xe3\xf1\xec\xbb_..\xdf.\xde\xfeb\x92\xcd*\xd3\x9a)3\xed\x1f\ ! \x0ep&]HA\xfa\xb9\x90\xa6H\xf1\xa5\xe1\'^\xb1\x03\xc8\xdc\xab\x14\x80\xf2\ ! \x08\xf9(\xd9\xcb\x97\xac\xdd2JD\xb6\x86\xbc,\x98\xb3\xd3\xc3f\xea\x97\xb6\ ! \x14I\xeb\xe7\x9d\xa9\xbf\x9c\xfcz\xde\x1b\xc6\x0eb\x1889g1\x9b\xd14\xb3\x00\ ! 4\xfc\xf1\x8f\x87\xed\xe1_\xfaGU\x02\x1a\xa7b6\x10\xc1\xd9\xbf\xc4\xbf\x8e\ ! \xf0\xa4G\xd4q\xe0O\xfd\xc3\x81\x0b\x11\xdch=\xd3\xeeh>gg\xb8\xdf\xba\x03\ ! \xf8\xc5F\x94\x1a\xb4\x0f\xab\x89]\xd3\x94K\xe6\xb5tGJ\xbe\x97[Q\\@H88<\xec\ ! \xcf\xfe\xf2\xe0\xcd`\xfd\xa2\x12\x03{\xf8g\xbb\xc9y5\x8f\xe6\xcfx\xfc\ ! \xeb\xafM\xe2A\xca\xb2\xa7w\xba\xf9t\xb7\xf0\x83n\xd6\xe1\xdb\xbc\x01\xd0\xf5[V\xc8\xf0\xfe;\xc6\xbe?>\xec\xdcf\ ! \rW:\xb0\x88\xf7\xd9\x0bn\x01\x17\xc5\x82]\xc7Y\x04\x86\x81\x95\xcd\x82Z\xfe\ ! \x02\xfe\x0e0O\x86\t\xa9\xaeW\xf6\xa8\x18\x08\x1fb=\xa2\xf0{\x15\x96^(/x\x14\ ! K\x1b\xc1\x8c%\xec\xa6d\x04\xffu\xf5\xf3p\x00CN#\xad\x86\xb6\x8e\xea\x9b\xf7\ ! 
:x\xf5\x7f\xff;\x98@\xfd\xf5\x9axJ\xef\xffY\x160\xcdsC:\xbc\xffg\x19\xa0\xd4\ ! \xd4a\x00\xdf\xfd\xf1\xde\x16\xb3Lxv\x8f\x06\x134\xe0BW\x9b\x9e\x7f\xedS"\ ! \x8c\xbc\x07\xa5=\xf4\xb5\x93+\xf6deq\x97\xd9\xb2\x06\xf6{\xec\xf9i8\xf45u\ ! \xb3*O$\x8f\xfaH\x1c\x85\xa0\xfb\xbc:\xee\x91PZ\x87\xd7\x8a\x80L\x00\x8e\xa2\ ! -\xa5U\xa2\xe2\x9c\x17\x8a\xb8:\x8e\xb8\xe2\xd6%]\xc5TL\xd7\xa4\x19\xaf\x13\ ! \x94&\xf5\x92\xc5\xe7\x14\xaf:\x02\xc0<)\xd5{i7T\xbbo\xdf\xf6\xe9\xc5\xb0\ ! \xdd\xaa\x9a~6\x92\x04!:\xb2\xffT\xd0\x85\x84\xa0\xe4c\xbb\x91I\x9dv\x02\xb2\ ! \xca\xc2\x8a*\xfa&\xce\x1dR\xbe4uK\x91\x08#!\xf1\xe3f!E\xc9A!\xb7\xe0u\xcft\ ! \xa5z\xee\xe3\xc5\x1e\x1c\x8b\x19\xe9.\xee\\O\xea\xe3\xfe\xac5}\x08yS"\xd9V\ ! \x07\x0c\xe5w\xae)\xf4\xe9m\xbe\x92\xdc\xf5\xb3\xd4\xac\xc9\xeeC\xf2\xc2\x8c\ ! m\xc8Y]\xac\xf3\xe6\x03,#\x98#\'\xedZ\xe3\xfdj\x855\x00D\xb6K\x89\xf7\x17\ ! \xc8\x19\xdd\xaad\x91E1T\xd6:\x19\x89\xaf/\xf2\xfc\xb3I\xf7\x14\x86\x9d\x02\xc0CPWs\ ! \xb1fo\xd2\xa3!\x91\xef\xb5\xdb;\xc5U9\\\xf9\xa8\xbb-4\xf8X\xa5K\xb0XH\xedQg\ ! \xeb\xdaK\xa7D\x88}\xa1\xac\xf0\x93\x93\x93\xd3\xb3\'\n\xb1\r\xa9\xcd\xd3\ ! \x9465\xa1W\xaf\xdf\xf1J\x94\xce5\x1d\xfe\x90\r\xea\xe4\x04\xe4O#\xd8Z\x9a\xeb\x9c#\ ! \x96\xc4\xf7B\x0bl\x89z\x0c>\x82\xd5\x97N\x87\xee\x87N\xcd\xad\n\xd1X\xb8%\ ! \xb2fP\xd1\x8c\xa0(\xda\x8f\xf8\x16%\xe2\x18\xcf\x16\x89\xbb\x9f\x13\x82C\ ! \xbd2_\x91\xe09\xf6\x8b\x08-RC\x02\xeaMpB\xe8P\x18u\xeb\xd8yp+TUdM\x1d\xb2\ ! \xfd=\x17\xdec\xdbs\x04\xb7E\xd7\xc4\xd3\xe1\xf3\xd0\x83\xde\xea1\x9ev\x88\ ! \xd6q\x1a\x97h\xcf\xe2p\xf8\xe3\xaf}\x14\x8fj\xc6?\x00\xb5\'t\xae\xee\xaf\ ! \xd5\xed\x87O\xb7\x1f\xacW\xf3\xe8Z\\\xc8\x1c\xd4\xd1Fv\x0f#\xbb\xd7\xfbX\ ! \x0ck\xaeX\xf0\x83\xf8G%O*\x02\xe2dHd\x0b4?\x16V\xa5\x92i\xfc\x0f\xadp\xb9\ ! \xbd\xd7\xe1\xca\xde\xd57\x1f=m\xf9\x0e\x15N\x17P[\xc5f\xf0\xee\xa5(u\xed\ ! \x9dj\xa0\xa3\xdf\xda\xd8E\xf3\xdd\x1b\rY\xbc}\xf7\xfe\xca\x193|\x078p\xe7\ ! \xb7\xb8uf\x81\n\x9f\xb4\x92\xee\xc5\xa5\xc1z\x1ei\xaa\xe7\xe6\xc5\x1du\xab\ ! 
\x99L\xf8R$^\xf0\x1e\xff\xd8\x0f\x08\xfbw\xc6\x8b\xcb\xf6:]r?~\xbc\xf9i\xd1\ ! \xfa\xf6C\x7f\x11r\xfa\xc6\xa3U(\x88\x98\xf0\xa330\xc0a\xe9gp\x88\x9fS\x9e{m\ ! \\\xd9]\x8c\xca\x16\x1b\x91\xe4p\n!\x96\x93\x11\xe0\xd0\r\xfc\xc9d\xe2\x07\ ! \xcf_p\x16\xba\xe3\x0f\xc2k\ ! \xe2[M\xc46\xa0kh^[\x8e\xa2\x10t\x07\x04\x19*\xa7\xe0\xd5\xf3\x16\xbf\x83\ ! \xad[M\x1b\xaf\x1c4\xf1\x01\x16[\x0c~\xf5\x97\x02\x9bJEr\x9b\x99\x92\xd9\xc0\ ! 5Xg\x84\x87\xa5\xae;\xd3F\xc0\x80\xfdU\xcah\xb9k\xbe{4K\xe2\x97\xce\xf6\xf3*\ ! x\xae\x11\xe4JJ\x85\xb9\xbd\x85\x8f%\x88\xbe_>4\xa1\xd5\\g\x82\x91\xbc\x18\ ! \xaa\x0b\xce\x9c\\5\x1a\xfeb\xba)\xf9\xf8ze\x0cX~+\x845n\xf6F\x7f\xd0\xcc\ ! \xddO\n\x9c\x1bv@\xd2\xa5\x82\x08\xec\x05\x1fd\x06\x87\x12\xb2\xb3\t;y5\x9d\ ! \xbc\x9a\x9e\x9d\xb2\xd3\xc9\xe4\xd4\xc8~\xd4Z\xdb\xac\xa5T>\x1d\x8f\xb7\xdb\ ! \xed\xa8\xb4\x0b\x8ed\xb1\x1e;\x0c\xe0;.\xdf\xb9\x01nYfg\x9b\xadC"\x8f\x8e\ ! \x0e\xa6e\xe6CX\x1a\xe5ag\x99+\r\x03\xd3A\xff\xfa5\t\x03\xbd\xd1L\xd8K\x90\ ! \xc09\xb3\x931\xb1\xd4\xdc\x1a0\xd1\x80\x08\x07\x88\x83\xa6\xc8h\xa7\xbf~\ ! \xd7\xff\xdb\xcd\xff\x01\x85\xbcX&' ) ### end From richiehindle at users.sourceforge.net Fri Jan 24 15:56:13 2003 From: richiehindle at users.sourceforge.net (Richie Hindle) Date: Fri Jan 24 18:56:19 2003 Subject: [Spambayes-checkins] spambayes/spambayes/resources ui.html,1.2,1.3 Message-ID: Update of /cvsroot/spambayes/spambayes/spambayes/resources In directory sc8-pr-cvs1:/tmp/cvs-serv31854 Modified Files: ui.html Log Message: The configuration page now tells you where your ini file is. Index: ui.html =================================================================== RCS file: /cvsroot/spambayes/spambayes/spambayes/resources/ui.html,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** ui.html 24 Jan 2003 22:45:49 -0000 1.2 --- ui.html 24 Jan 2003 23:56:11 -0000 1.3 *************** *** 299,304 ****
    !

    This page allows you to change certain customizable options that control ! the way in which Spambayes processes your email.

    --- 299,305 ---- !

    This page allows you to change the options that control how ! Spambayes processes your email. Your options are stored in ! /example/pathname.

    From richiehindle at users.sourceforge.net Fri Jan 24 15:56:29 2003 From: richiehindle at users.sourceforge.net (Richie Hindle) Date: Fri Jan 24 18:56:34 2003 Subject: [Spambayes-checkins] spambayes/spambayes/resources ui_html.py,1.2,1.3 Message-ID: Update of /cvsroot/spambayes/spambayes/spambayes/resources In directory sc8-pr-cvs1:/tmp/cvs-serv31977 Modified Files: ui_html.py Log Message: The configuration page now tells you where your ini file is. Index: ui_html.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/spambayes/resources/ui_html.py,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** ui_html.py 24 Jan 2003 22:45:49 -0000 1.2 --- ui_html.py 24 Jan 2003 23:56:27 -0000 1.3 *************** *** 5,168 **** import zlib ! data = zlib.decompress('x\xda\xc5[{s\xe36\x92\xff?U\xf9\x0e\x08\xb7fh\xd7\xda\x92l\xcf\xccfe\x89\ ! \xb53\xb6\xb3N\xdd<|\xb6\xb2W\xa9Tj\n"!\x89k\x92\xe0\x91\xa0e\xed\xd4~\xf7\ ! \xebn\x00$\xf8\xb0\xa3\xd9\xd4\xeeyR6\x89G\xa3\xd1\xe8\xc7\xaf\x1b\xcc\xec\ \xbb\xcbO\x17\x8b\x9fo\xae\xd8\xf5\xe2\xc3{v\xf3\xd3\xbb\xf7?^0\xefx<\xfe\ \x9f\xb3\x8b\xf1\xf8rq\xa9;^\x8d&\'lQ\xf0\xac\x8cU,3\x9e\x8c\xc7W\x1f\xbd\ \xe0\xdbof\x1b\x95&\xf4W\xf0\x08\xff\xaaX%\x82\xc5\xd1\xdc\xa3\'/\xb8\xcby\ ! \xba\xe4;Q\xb2\x9fJQ\xb0\x1f3%\x8a\x15\x0f\xc5lL\x03pN\xa9v\xf4\xb0\x94\xd1\ ! \x8e}a+\x99\xa9)\xfb\xf3\xe4\x05\xe3E\xcc\x93#Vn\xe3\xb2}\x88a\xd7\ ! "z\xa2w#\x1f`[M\xdf\x9b\xc9\xab\xc9j\x85}\xa3\x10\x18\x12\x99b\xdc0w\xbc\x15\ ! \xf1z\x03<.e\x12\xd1\x88\x87Xli\xe5\xde\x90\x0c\xf8\xe4\x89\xe6n\xb4\xe4YF\ ! \x8b,yx\xbf.d\x95E\xb0R8\x11\xb0\xd29\xcby\x14\xc5\xd9z\xfe\xba~\xff\xf6\x1bV\xff,e\x11\x89\xe2X\xc9\x1c\xba\xf2GV\xca$\x8e\xd82\ ! \x01\x9aC\xe3\x96R)\x99\xf6\x86\x12\xdfx\xda\xc4\x10\xf1\\\xc6\xff\x100\xee\ ! \xec\xecEk\xdb\x8d\x94O^SG*\xca\x92\xaf\x85>/\x85\x02\xed\xf0+\x1e\x9b-\x14Z\ ! \x0e\xd0FsK\x11\xa2\xe6\xd9\xb3\xd6<\xba\xcc\xfd\xe1\xfb\t\xfe;g\xdb8R\x1b\ ! 
\xd0\x83\xd7/\xdc\x89\xc81\x90\xed\xc8p\xb5Zq19\x1f\xe0\xc3\x15\xc8o\n\xc6\ ! \xae=4i\xf8\xe0\rWF\xe3\xcd\xf2@r\x92?\x9a3/\x04*\x87\x16t\xc9\x06\x15tB*\ ! \xd8\x1b9\xac\xaffp\xa3q\xcfPt\x06=G\x0c\xf7\xa1\x8a8\x17\x9fe\xa6\xcf\xb3\ ! \xa5\x9f"\xc4\x7ff`\x15_\x83\xa7\xb8\x1e\xd2\x9b\xd3\xc9\x8b\xf3\x9a\xfa\xf7\ ! D\xfd|Xl\xb3\xb1u\x15\xb3\xb1q8\xf0HBD\x8f\x83\x0f\x1e\xb5\xa1\xe0gQ\xfc\xc0\ ! \xc2\x84\x97\xe5\xdc\xd7\x06\xe4\x07\xa6\xa7\xccyF3\xc05\xa4By\xc1,N\xd7\xac\ ! ,\xc2\xb9\xaf[F\xebx\xe53\x9e\xc4\xebl\xee\xf3e\x99\xc6Q\x94\x08\x7f\x1c\xbc\ ! \xcc\x96e~\x0e|\x00\x85\x165\xb3\x90>\x05\xbbP{\xb1\xbc\x90Q\x15\xaa\x8f\ ! \xbc\x0c\x98XFk\xbc\\\xabsV\xc5#\xf4\xd1m\x96\xdb/ (-I\x94\x18\x92Hy\x9c]h+\ ! \xf6\xcc\xce\x02O_\'\x1e@z\xe2\xc2?m~\x80.-1_\xe3\xccRqU\x95\x1ag\x9a\x89\xaf\ ! &>\x841D\xbc\xf4l\xc1\xa6\x83\xf1\x90O\xe7\xd5l\x1av\xd0\x01k-\xbe\x9c\x19\ ! \x04\xcb0H2G\xf0`\x05\x1fd\xa9\x861Wi\x91\x14hOC\x07\x8c\xbf\x84\xe5@z`Ih\nV\ ! \'\r\x14k\xe6\x805\xb7G;\xaa\xabS\xaaG\x18\xa8\x83w6\x88\x15\xd1\x05\xf0\x07\ ! \t\xa1.\xaa\xf2\x04\xe2#n\xccb\x91\x91uA\xf6\x00\xe1\x17\xea\x8a\xd9\xb3A\ ! \xeb\xcbb\xdc\x05\xbe{h\xae>\xa1\x96\xdaj\xddD\x99\xeb\xce\x05\xbe{&/\x9c{\ ! \xe0B]\x15\xb5\xef6?\x87\xf7\xb6\x82\r\xc2wM\xd9{*\x83x\xdc]B\x02\x1d\'\xad\ ! \x117\x9fn\xcet\x1e\x81\xf8)CO\x022\x9a-\x83\xce\xd4\x1b\x88\xdd\xe5\x1d9O/8\ ! 99\x99\x18y\xcc\xc6\xcb\xe0\xc8=^\x18K\xceL\xb6\x89\x10\x96-j\n&\x8a@\xc2\ ! \x9f:\x84F\xae\xc2\xea\xd6\xe6\x00\xf0\xe7-\xe8\xe6\x83\xd0L\x83A E:\xd4r\ ! \xda1\x1bwiN\x93\xee\x04\x80S\x18\xe9\x05.\xeb\xa3\xce\x02}\xca\xdai\x97z\ ! \xf6s\xcb(\xa9x\xb2\xdf*W)\x1e\x83\xd6\x9dx\x15c\xb0\xdas\x91\xacJ1W\xe8\xd0\ ! g\xf0\x94\x1e=?\xed\xba?k\xf3\x9b\x93~\xca\xca\xaa\x10\x9di\x155v\xf7\xb4\ ! \xc0\xed3\xa1w\x06\xa1\x0b\x93\xa6\xceN\x90\xf3ig\r\xe4\xbcM\xbf=\xe7\xba?e\ ! \xd3\x9d\xd1\xe1\xa4\x93\xa2\xd6\xc6\x82N\xcf\xe0[S\xe8y#\x1e}\xe3*\xdd\xdf\ ! \x9d9\xae\xf7v\x97\xa9\x13\x7f\xd0\x97U\xbc~\xce!\xdbz\x00\x16\x89|\x97\xd7\ ! 
\xfe\xe2\xfd\xdf?\x03\xf4\x0f\xb1N@\xcb\x80\xe81\x19(\x9a\x9c\xb1\xb3\xfd}H\ ! \x96\xbbR\x89\x14\x1c\xb6\xc1\x1d\x1d-\xe0\xa6b\xa0W\xf4\x83\x0b\xb3\xb2v\ ! \xa0\x08\xb8\xb0\x820r\xe5\xe4\x84\xc0\xda\x8f\xfe\xa6\x9f\xd4\xe5\xa7\x05\ ! \xa0\xc66\xec\xa0sn:\xad\xbfZ\xb42x\xed\xb4J%\x8b:\x07\x85|R\xd7\t!T)0\'Q\ ! \x1a\x8fbEHzI\x03k\xdb\x03\xf4\xc7K\xb0@\xacCmd\xd9\x90\xd03k\x115R\xd1\x8c\ ! \xf9\xc1-\xfd\xad\xc7\xa3HH6ud\xd9\x13\xe7\x98\x8d\xa2\xd0\x86\x91\x8e\x1e@R\ ! \xad\xc3\x00\x16J \x1c\x124\x83\xbcW\xdb\x9b1\xbf#\xb6\xdd\xc4\x80Uwf\xd7\ ! \x10\x97\x99\x85\x8fC\x12\x80\xd0{\xb1\x11\xe1=5C\x96_H\xc8\x070\xcb\\VJ\x81\ ! \\V\xd2`9\x81\x19=\xadq\x84c\xb1t\x15\xc34\x7f\x81D}\xc8\xcd\x01,\x021\xffR\ ! \xacD\xe1C\xe2\xca\x1ft\xa1FO7\x92\xa2\x98}d\n\x00\x96sX\x06\x13[\xe2\x85\ ! \x886\xd8@O\xbe\x8c\xcb\x90\x17\x11\x1b3\xa2\x0e\x7f\xc15\xc0oT\x08f+\x98@3\ ! \xa4\x9d4hW\xcf\xd6;)5\x8c\xe0\xa8\x1b\x84r,\x08Y\xcb6\xf0\xa5R?\xa7!\xf5y\ ! \x03\xfb\xa0 \x00\xc5\xfez\xb5p\xfc\x80I\xf6)\x01\xf27q\x14\x01paXD\x9b\xfb \ ! E\tb@\xf8f\x1euj\xe4O\\\x1f\xf0\x0c\x81\x0c\x94_\xcf\xd7O\x83\xd35\xb20@\x02\ ! \xfaZ\xc0\xc1\xbe\xf7\xb1\xaf\x83%Z\x1c\x94\xd52\x8d\x95\xe5`-\x1d\xfe\xdf\ ! \x91\x10\xfd\xa1R5\xfe\x18\xeenP^\xb2\x82\x94\x8c\x03\xb2\x8d\xe2\x12\xf9\ ! \x03\x12\'N\r\xb4\x05`\xb4\x9b\xdd\x87\x0b\x94\xc2~L|\x84\x91_\xcb\xc0S\xce\ ! \xb2\x8b\x9d\xfb`\x11\xd3\xe8"m!\xc7\xc6\x8a\xc3"\xce\xed\xb60A\x1e\xff\x9d?\ ! p\xdd\xea\x07\xe3\xf1\xec\xbb_..\xdf.\xde\xfeb\x92\xcd*\xd3\x9a)3\xed\x1f\ ! \x0ep&]HA\xfa\xb9\x90\xa6H\xf1\xa5\xe1\'^\xb1\x03\xc8\xdc\xab\x14\x80\xf2\ ! \x08\xf9(\xd9\xcb\x97\xac\xdd2JD\xb6\x86\xbc,\x98\xb3\xd3\xc3f\xea\x97\xb6\ ! \x14I\xeb\xe7\x9d\xa9\xbf\x9c\xfcz\xde\x1b\xc6\x0eb\x1889g1\x9b\xd14\xb3\x00\ ! 4\xfc\xf1\x8f\x87\xed\xe1_\xfaGU\x02\x1a\xa7b6\x10\xc1\xd9\xbf\xc4\xbf\x8e\ ! \xf0\xa4G\xd4q\xe0O\xfd\xc3\x81\x0b\x11\xdch=\xd3\xeeh>gg\xb8\xdf\xba\x03\ ! \xf8\xc5F\x94\x1a\xb4\x0f\xab\x89]\xd3\x94K\xe6\xb5tGJ\xbe\x97[Q\\@H88<\xec\ ! 
\xcf\xfe\xf2\xe0\xcd`\xfd\xa2\x12\x03{\xf8g\xbb\xc9y5\x8f\xe6\xcfx\xfc\ ! \xeb\xafM\xe2A\xca\xb2\xa7w\xba\xf9t\xb7\xf0\x83n\xd6\xe1\xdb\xbc\x01\xd0\xf5[V\xc8\xf0\xfe;\xc6\xbe?>\xec\xdcf\ ! \rW:\xb0\x88\xf7\xd9\x0bn\x01\x17\xc5\x82]\xc7Y\x04\x86\x81\x95\xcd\x82Z\xfe\ ! \x02\xfe\x0e0O\x86\t\xa9\xaeW\xf6\xa8\x18\x08\x1fb=\xa2\xf0{\x15\x96^(/x\x14\ ! K\x1b\xc1\x8c%\xec\xa6d\x04\xffu\xf5\xf3p\x00CN#\xad\x86\xb6\x8e\xea\x9b\xf7\ ! :x\xf5\x7f\xff;\x98@\xfd\xf5\x9axJ\xef\xffY\x160\xcdsC:\xbc\xffg\x19\xa0\xd4\ ! \xd4a\x00\xdf\xfd\xf1\xde\x16\xb3Lxv\x8f\x06\x134\xe0BW\x9b\x9e\x7f\xedS"\ ! \x8c\xbc\x07\xa5=\xf4\xb5\x93+\xf6deq\x97\xd9\xb2\x06\xf6{\xec\xf9i8\xf45u\ ! \xb3*O$\x8f\xfaH\x1c\x85\xa0\xfb\xbc:\xee\x91PZ\x87\xd7\x8a\x80L\x00\x8e\xa2\ ! -\xa5U\xa2\xe2\x9c\x17\x8a\xb8:\x8e\xb8\xe2\xd6%]\xc5TL\xd7\xa4\x19\xaf\x13\ ! \x94&\xf5\x92\xc5\xe7\x14\xaf:\x02\xc0<)\xd5{i7T\xbbo\xdf\xf6\xe9\xc5\xb0\ ! \xdd\xaa\x9a~6\x92\x04!:\xb2\xffT\xd0\x85\x84\xa0\xe4c\xbb\x91I\x9dv\x02\xb2\ ! \xca\xc2\x8a*\xfa&\xce\x1dR\xbe4uK\x91\x08#!\xf1\xe3f!E\xc9A!\xb7\xe0u\xcft\ ! \xa5z\xee\xe3\xc5\x1e\x1c\x8b\x19\xe9.\xee\\O\xea\xe3\xfe\xac5}\x08yS"\xd9V\ ! \x07\x0c\xe5w\xae)\xf4\xe9m\xbe\x92\xdc\xf5\xb3\xd4\xac\xc9\xeeC\xf2\xc2\x8c\ ! m\xc8Y]\xac\xf3\xe6\x03,#\x98#\'\xedZ\xe3\xfdj\x855\x00D\xb6K\x89\xf7\x17\ ! \xc8\x19\xdd\xaad\x91E1T\xd6:\x19\x89\xaf/\xf2\xfc\xb3I\xf7\x14\x86\x9d\x02\xc0CPWs\ ! \xb1fo\xd2\xa3!\x91\xef\xb5\xdb;\xc5U9\\\xf9\xa8\xbb-4\xf8X\xa5K\xb0XH\xedQg\ ! \xeb\xdaK\xa7D\x88}\xa1\xac\xf0\x93\x93\x93\xd3\xb3\'\n\xb1\r\xa9\xcd\xd3\ ! \x9465\xa1W\xaf\xdf\xf1J\x94\xce5\x1d\xfe\x90\r\xea\xe4\x04\xe4O#\xd8Z\x9a\xeb\x9c#\ ! \x96\xc4\xf7B\x0bl\x89z\x0c>\x82\xd5\x97N\x87\xee\x87N\xcd\xad\n\xd1X\xb8%\ ! \xb2fP\xd1\x8c\xa0(\xda\x8f\xf8\x16%\xe2\x18\xcf\x16\x89\xbb\x9f\x13\x82C\ ! \xbd2_\x91\xe09\xf6\x8b\x08-RC\x02\xeaMpB\xe8P\x18u\xeb\xd8yp+TUdM\x1d\xb2\ ! \xfd=\x17\xdec\xdbs\x04\xb7E\xd7\xc4\xd3\xe1\xf3\xd0\x83\xde\xea1\x9ev\x88\ ! 
\xd6q\x1a\x97h\xcf\xe2p\xf8\xe3\xaf}\x14\x8fj\xc6?\x00\xb5\'t\xae\xee\xaf\ ! \xd5\xed\x87O\xb7\x1f\xacW\xf3\xe8Z\\\xc8\x1c\xd4\xd1Fv\x0f#\xbb\xd7\xfbX\ ! \x0ck\xaeX\xf0\x83\xf8G%O*\x02\xe2dHd\x0b4?\x16V\xa5\x92i\xfc\x0f\xadp\xb9\ ! \xbd\xd7\xe1\xca\xde\xd57\x1f=m\xf9\x0e\x15N\x17P[\xc5f\xf0\xee\xa5(u\xed\ ! \x9dj\xa0\xa3\xdf\xda\xd8E\xf3\xdd\x1b\rY\xbc}\xf7\xfe\xca\x193|\x078p\xe7\ ! \xb7\xb8uf\x81\n\x9f\xb4\x92\xee\xc5\xa5\xc1z\x1ei\xaa\xe7\xe6\xc5\x1du\xab\ ! \x99L\xf8R$^\xf0\x1e\xff\xd8\x0f\x08\xfbw\xc6\x8b\xcb\xf6:]r?~\xbc\xf9i\xd1\ ! \xfa\xf6C\x7f\x11r\xfa\xc6\xa3U(\x88\x98\xf0\xa330\xc0a\xe9gp\x88\x9fS\x9e{m\ ! \\\xd9]\x8c\xca\x16\x1b\x91\xe4p\n!\x96\x93\x11\xe0\xd0\r\xfc\xc9d\xe2\x07\ ! \xcf_p\x16\xba\xe3\x0f\xc2k\ ! \xe2[M\xc46\xa0kh^[\x8e\xa2\x10t\x07\x04\x19*\xa7\xe0\xd5\xf3\x16\xbf\x83\ ! \xad[M\x1b\xaf\x1c4\xf1\x01\x16[\x0c~\xf5\x97\x02\x9bJEr\x9b\x99\x92\xd9\xc0\ ! 5Xg\x84\x87\xa5\xae;\xd3F\xc0\x80\xfdU\xcah\xb9k\xbe{4K\xe2\x97\xce\xf6\xf3*\ ! x\xae\x11\xe4JJ\x85\xb9\xbd\x85\x8f%\x88\xbe_>4\xa1\xd5\\g\x82\x91\xbc\x18\ ! \xaa\x0b\xce\x9c\\5\x1a\xfeb\xba)\xf9\xf8ze\x0cX~+\x845n\xf6F\x7f\xd0\xcc\ ! \xddO\n\x9c\x1bv@\xd2\xa5\x82\x08\xec\x05\x1fd\x06\x87\x12\xb2\xb3\t;y5\x9d\ ! \xbc\x9a\x9e\x9d\xb2\xd3\xc9\xe4\xd4\xc8~\xd4Z\xdb\xac\xa5T>\x1d\x8f\xb7\xdb\ ! \xed\xa8\xb4\x0b\x8ed\xb1\x1e;\x0c\xe0;.\xdf\xb9\x01nYfg\x9b\xadC"\x8f\x8e\ ! \x0e\xa6e\xe6CX\x1a\xe5ag\x99+\r\x03\xd3A\xff\xfa5\t\x03\xbd\xd1L\xd8K\x90\ ! \xc09\xb3\x931\xb1\xd4\xdc\x1a0\xd1\x80\x08\x07\x88\x83\xa6\xc8h\xa7\xbf~\ ! \xd7\xff\xdb\xcd\xff\x01\x85\xbcX&' ) ### end --- 5,171 ---- import zlib ! data = zlib.decompress('x\xda\xc5[{s\xe36\x92\xff?U\xf9\x0e\x08\xb7fh\xd7\xda\x92l\xcf\xccee\x89u3\ ! \xb6\xb3N\xed<\xbc\xb6\xb2W\xa9Tj\n\x14!\x89k\x92\xe0\x91\xa0e\xed\xd4~\xf7\ ! 
\xedn\x00$\xf8\xb0\xa3\xb9\xd4\xeeyR6\x89G\xa3\xd1\xe8\xfe\xf5\x03\xcc\xec\ \xbb\xcbO\x17\x8b\x9fo\xae\xd8\xf5\xe2\xc3{v\xf3\xd3\xbb\xf7?^0\xefx<\xfe\ \x9f\xb3\x8b\xf1\xf8rq\xa9;^\x8d&\'lQ\xf0\xac\x8cU,3\x9e\x8c\xc7W\x1f\xbd\ \xe0\xdbof\x1b\x95&\xf4W\xf0\x08\xff\xaaX%\x82\xc5\xd1\xdc\xa3\'/\xb8\xcby\ ! \x1a\xf2\x9d(\xd9O\xa5(\xd8\x8f\x99\x12\xc5\x8a/\xc5lL\x03pN\xa9v\xf4\x10\ ! \xcah\xc7\xbe\xb0\x95\xcc\xd4\x94\xfdi\xf2\x82\xf1"\xe6\xc9\x11+\xb7qY\x1e\ ! \xb1\x8dH\x1e\x84\x8a\x97\xfc\x9c\xa5\xbcX\xc7\xd9\x94M\xd8?\xbf\xfdF\xf1\ ! \x10\x96\xdcc\x1e\x0e^\xc9"\x85\xb1-\x02\xdf~\xc3\xa7I\x9c\xddC\xfbR&\xb2\ ! \x98\xb2?L\xe0gI\x9d|\xfa\x10\xc3\xaeE\xf4D\xefF>\xc0\xb6\x9a\xbe7\x93W\x93\ ! \xd5\n\xfbFK`Hd\x8aq\xc3\xdc\xf1V\xc4\xeb\r\xf0\x18\xca$\xa2\x11\x0f\xb1\xd8\ ! \xd2\xca\xbd!\x19\xf0\xc9\x13\xcd\xdd(\xe4YF\x8b\x84|y\xbf.d\x95E\xb0\xd2r"`\ ! \xa5s\x96\xf3(\x8a\xb3\xf5\xfcu\xfdx\x9c\x88\x15\xd08y}\xfe\xed7\xac\xfe\te\ ! \x11\x89\xe2X\xc9\x1c\xba\xf2GV\xca$\x8eX\x98\x00\xcd\xa1q\xa1TJ\xa6\xbd\xa1\ ! \xc47\x9e61D<\x97\xf1?\x04\x8c;;{\xd1\xdav#\xe5\x93\xd7\xd4\x91\x8a\xb2\xe4k\ ! \xa1\xcfK\xa1@;\xfc\x8a\xc7f\x0b\x85\x96\x03\xb4\xd1\xdcR,Q\xf3\xecYk\x1e]\ ! \xe6\xfe\xf0\xfd\x04\xff\x9d\xb3m\x1c\xa9\r\xe8\xc1\xeb\x17\xeeD\xe4\x18\xc8\ ! vd\xb8Z\xad\xb8\x98\x9c\x0f\xf0\xe1\n\xe47\x05c\xd7\x1e\x9a4|\xf0\x86+\xa3\ ! \xf1fy 9\xc9\x1f\xcd\x99\x17\x02\x95C\x0b\xbad\x83\n:!\x15\xec\x8d\x1c\xd6W3\ ! \xb8\xd1\xb8g(:\x83\x9e#\x86\xfbPE\x9c\x8b\xcf2\xd3\xe7\xd9\xd2O\xb1\xc4\x7f\ ! f`\x15_\x03R\\\x0f\xe9\xcd\xe9\xe4\xc5yM\xfd{\xa2~>,\xb6\xd9\xd8B\xc5ll\x00\ ! \x07\x1eI\x88\x888\xf8\xe0Q\x1b\n~\x16\xc5\x0fl\x99\xf0\xb2\x9c\xfb\xda\x80\ ! \xfc\xc0\xf4\x949\xcfh\x06@C*\x94\x17\xcc\xe2t\xcd\xcab9\xf7u\xcbh\x1d\xaf|\ ! \xc6\x93x\x9d\xcd}\x1e\x96i\x1cE\x89\xf0\xc7\xc1\xcb,,\xf3s\xe0\x03(\xb4\xa8\ ! \x99\x85\xf4)\xd8\x85\xda\x8b\xe5\x85\x8c\xaa\xa5\xfa\xc8\xd3\x16,B\xf3\xe3\ ! \xceP\x9c:\xf3\xb8\xe6P\xa6\x02\x8f\xc2c\x9bB\xac`\x01x\xf7\x83k\xf8=\x1b\ ! 
\xf3\xe1e\xc0\xc42Z\xe3\xe5Z\x9d\xb3*\x1e!F\xb7Yn\xbf\x80\xa0\xb4$QbH"\xe5qv\ ! \xa1\xad\xd83;\xf3\x8cUk\xf9\xce6\xaf\x02@r\xbd!\xd0c8\x8eW\x8d\xe0\xf3`\xb1\ ! \x89K\xb6\x8a\x13q\xc4fK\x19\x89\xa0\xe6\x82\xde\x8eX$Vq\x06\x9bW\x1b\xc1\ ! \x12)\xef\x8fy\x16\x1d\xaf\x84H4\t\xb9\xa2\x9e\n=Fl=\x86mmDw\'\n\x00\xdf\x11\ ! c\x0bh~\x00\xd4\x97U\xa9\t\xe41\xe8^\x893\xc8\x7f\xe9\xe5"p\x05\x85\x00\xef \ ! \x98xT\x05_\xa2b\xc3\xc2\x80SY\x9c\'U\xc2\xa9Ai\x12E\x95\xa98\x05\xa0\x92,\ ! \xda\x81@\xc1\x81$\xc9\x8e\xe9S\x14\xc4\t\xd1V\x1b\xae\xda|\xe9\xf9\x9a9V\ ! \xe2\x9f\x92U9;\x86QF,\x0c\xd6\x06S\x85\xff\xc0\xf6\x13\xa1\x94\xb0\xac\xea\ ! \xb9K\x99\xe62\x03q\x97\xb09-\xc1\x9b\xdd\x07\x91D\xef\xc1\x18\x8d\x10Y\xac\ ! \xc5\x97\x02?\x88\xa7\xc8\x060\xf7\x10G\x96\x03\xcb"@\xfe\xb1\x0c\xff\x0e\ ! \x90\x03;\xcds\x84\xc1\x04\xf8,\xf4\x9c\x94\xdf\x03\x7f\xb07\xcd^.\xcb2\x06\ ! \x90\x1di\x1aW|\xb9\xd1\xd2\xac\x85I\xb32!\xa2\x12e\x13\n->#\xbd\rn*3\x8a@|\ ! \xc6\x91eX\xf1\xf5\x11\xc9;\x14\xb0A\\\x94\xdd\xec\xd4\x06\xc0\xc3p\x07t\x8d\ ! \xd4G\xb3q\xee*\xd4[\xa4\n\xa7\xc6\xd3[\x8a$\x81\x99KLv\xfc\x06\xb3T\x11\xcc \xfa\xb4\xe4\xa0\xd7\xeb\xcc7\ ! \xcd>Z\x0f.=\xf7O!\x86\xa2\xa5\x19\x01\xbf\x8a\xc0\x8a\x80N\x8f\xa2\x1f\x83"\ ! \\\xc0\xba>Z\x04\x85\x83\x90?\xf9\x1d\xfa\x18x\xfa:\xf1\x00\xd2\x137\xfc\xd3\ ! \xe6\x07\xd1\xa5%\xe6\xeb8\xb3T\\U\xa5\x8e3\xcd\xc4W\x13\x1f\xdc\x18F\xbc\ ! \xf4l\x83M\'\xc6C>\x9dW\xb3i\xd8A\'Xk\xf1\xe5\xcc\xa0\xb0\x0c\x9d$s\x04\x0fV\ ! \xf0A\x96j8\xe6*m$\x05\xda\xd3\xd0\x01\xe3/a9\x90\x1eX\x12\x9a\x82\xd5I\x13\ ! \x8a5s\xc0\x9a\xdb\xa3\x1d\xd5\xd5)\xd5#\x0c\xd4\xce;\x1b\x8c\x15\x11\x02\ ! \xf8\x83\x04W\x17Uy\x02\xfe\x117fc\x91\x91\x85 {\x80\xf0\x0bu\xc5\xec\xd9D\ ! \xeba1\xee\x06\xbe{h\xae>\xa1\x96\xdaj\xddD\x99\xeb\xce\x05\xbe{&/\x9c{\x00\ ! \xa1\xae\x8a\xdaw\x9b\x9f\xc3{[\xc1\x06\xc3wM\xd9{*\x83x\xdc]B\x02\x1d\'\xad\ ! \x117\x9fn\xcet\x1e\x81\xf1S\x86H\x022\x9a\x85Ag\xea\r\xf8\xee\xf2\x8e\xc0\ ! \xd3\x0bNNN&F\x1e\xb3q\x18\x1c\xb9\xc7\x0bc\t\xccd\x9b\x08\xc5\xb2EM\xc1x\ ! 
\x11H\xf8S\x87\xd0\xc8UX\xdd\xda\x1c\x00\xfe\xbc\x05\xdd|\x10\x9ai0\x08\xa4H\ ! \x87ZN;f\xe3.\xcdi\xd2\x9d\x80\xe0\x14Fz\x81\xcb\xfa\xa8\xb3@\x9f\xb2\x06\ ! \xedR\xcf~n\x19%\x15O\xf6[\xe5*\xc5c\xd0\xba\x13\xafbtV{.\x92U)\xe6\n\x1d\ ! \xfa\x0c\x9e\xd2\xa3\xe7\xa7]\xf7gm~s\xd2OYY\x15\xa23\xad\xa2\xc6\xee\x9e\ ! \x16\xb8}&\xf4\xce\xc0ua\xd2\xd4\xd9\tr>\xed\xac\x81\x9c\xb7\xe9\xb7\xe7\\\ ! \xf7\xa7l\xba3:\x9ctR\xd4\xdaX\x10\xf4L|k\n=o\xc4\xa3o\xa0\xd2\xfd\xdd\x99\ ! \xe3\xa2\xb7\xbbL\x9d\xf8\x83\xbe\xac\xe2\xf5s\x80l\xeb\x01X$\xf2]^\xfb\x8b\ ! \xf7\x7f\xff\x0c\xa1\xff\x12\xeb\x04\xb4\x0c\x88\x1e\x93\x81\xa2\xc9\x19;\ ! \xdb\xdf\x87d\xb9+\x95H\x01\xb0M\xdc\xd1\xd1\x02n*\x06zE?\xb80+k\x00\xc5\x80\ ! \x0b+\x08#WN\x8e\x0b\xacq\xf47qR\x97\x9f\x16\x105\xb6\xc3\x0e:\xe7\xa6\xd3\ ! \xe2\xd5\xa2\x95\xc1k\xd0*\x95,\xea\x1c\x14\xf2I]\'\x04W\xa5\xc0\x9cDi\x10\ ! \xc5\x8a\x90\xf4\x92\x06\xd6\xb6\x07\xd1\x1f/\xc1\x02\xb1\x0e\xb5\x91eCB\xcf\ ! \xacE\xd4HE3\xe6\x07\xb7\xf4\xb7\x1e\x8f"!\xd9\xd4\x9ee\xcf8\xc7l\x14\x856\ ! \x1c\xe9\xe8\x01$\xd5\xda\r`\xa1\x04\xdc!\x85f\x90\xf7j{3\xe6w\xc4\xb6\x9b\ ! \x18b\xd5\x9d\xd95\xf8ef\xc3\xc7!\t\x80\xeb\xbd\xd8\x88\xe5=5C\x96_H\xc8\x07\ ! 0\xcb\x0c+\xa5@.+ib9\x81\x19=\xadq\x84c\xb1t\x15\xc34\x7f\x81D}\xc8\xcd!X\ ! \x04b\xfe\xa5X\x89\xc2\x87\xc4\x95?\xe8B\x8d\x9en$E>\xfb\xc8\x14\x00,\xe7\ ! \xb0\x0c&\xb6\xc4\x0b\x11mb\x03=\xf92.\x97\xbc\x88\xd8\x98\x11u\xf8\x0b\xd0\ ! \x00\xbfQ!\x98\xad`\x02\xcd%\xed\xa4\x89v\xf5l\xbd\x93R\x87\x11\x1cu\x83\xa2\ ! \x1c\x1b\x84\xace;\xf0\xa5R?\xa7!\xf5y\x03\xfb\xa0 \x10\x8a\xfd\xf9j\xe1\xe0\ ! \x80I\xf6)\x01\xf27q\x14A\xe0\xc2\xb0\x886\xf7A\x8a\x12\xc4\x80\xe1\x9by\xd4\ ! \xa9\x91?q1\xe0\x19\x02\x19(\xbf\x9e\xaf\x9f\x06\xa7\xeb\xc8\xc2\x04\x12\xd0\ ! \xd7\n\x1c\xec{?\xf6ub\x89\x16\x07e\x15\xa6\xb1\xb2\x1c\xac\xa5\xc3\xff;\x12\ ! \xa2?T\xaa\xc6\x1f\xc3\xdd\r\xcaKV\x90\x92q\x88l\xa3\xb8D\xfe\x80\xc4\x89S\ ! \x03m\x050\x1af\xf7\xe1\x02\xa5\xb0\x1f\x13\x1fa\xe4\xd72\xf0\x14Xvc\xe7~\ ! 
\xb0\x88it\x91\xb6"\xc7\xc6\x8a\x97E\x9c\xdbma\x82<\xfe;\x7f\xe0\xba\xd5\x0f\ ! \xc6\xe3\xd9w\xbf\\\\\xbe]\xbc\xfd\xc5$\x9bU\xa65Sf\x1a\x1f\x0ep&]HA\xfa\xb9\ ! \x90\xa6H\xf1\xa5\xe1\'^\xb1\x03\xc8\xdc\xab\x14\x02\xe5\x11\xf2Q\xb2\x97/Y\ ! \xbbe\x94\x88l\ryY0g\xa7\x87\xcd\xd4/m)\x92\xd6\xcf;S\x7f9\xf9\xf5\xbc7\x8c\ ! \x1d\xc40pr\xceb6\xa3if\x01h\xf8\xe3\x1f\x0f\xdb\xc3\xbf\xf4\x8f\xaa\x84h\ ! \x9c\x8a\xd9@\x04g\xff\x12\xff:\xc2\x93\x1eQ\xc7\x81?\xf5\x0f\x07.Dp\xa3\xf5\ ! L\xbb\xa3\xf9\x9c\x9d\xe1~\xeb\x0e\xe0\x17\x1bQj\xd0>\xac&vMS.\x99\xd7\xd2\ ! \x1d)\xf9^nEq\x01.\xe1\xe0\xf0\xb0?\xfb\xcb\xf3\x04\t|\x00\xcd`\xfd\xa2\x12\ ! \x03{\xf8g\xbb\xc9y5\x8f\xe6\xcfx\xfc\xeb\xafM\xe2A\xca\xb2\':\xdd|\xba[\xf8\ ! A7\xeb\xf0m\x1e\xac\x93<\xf7:m_\xc0\x00\xb4p\xbc\xd1]\x15Z\xff\xd5J+!\x18\ ! \x0b>X\'\xec\x04\xb8\xbcdxY<\xa5x\xad\x97\x8c\xe2\xac\x1f\n\x99>\xd1m\xd9n]W\ ! \x018\xc8m\xc1s4l\x9c?\x1c\xc3x\x8d\xb5M\x1b\x9b\xf2\x91\x17\xff\x08<\x95\ ! \xf6*\xa0o^`\x9e\xc9\x8f\x8f\xffo\xd4\xc8\xef\x11-|\xfa\x1d\x94\xc0\xb7\x11\ ! \x1d\xf8\xfb;\xa8\xa0o$2\xf8\xd0\xbe\xf4\xd1\x9a\x15\x06\x83%\x82vA\xc39\xf5\ ! [\xb9m\n\x05\xf6\xfe\xce\x1f>,{\x13\xe8\xf7O\x86\x12\xc2\x8a\n\xf7\xe6r\xca\ ! \xc3\xd1^\xd07\x98[1ub>\xbc\x01\xd0\xf5[V\xc8\xe5\xfdw\x8c}\x7f|\xd8\xb9\xcd\ ! \x1a\xaet`\x11\xef\xb3\x17\xdcB\\\x14\x0bv\x1dg\x11\x18\x06V6\x0bj\xf9o\xc0;\ ! \x88y2LHu\xbd\xb2G\xc5\x84\xf0K\xacG\x14~\xaf\xc2\xd2s\xe5\x05\x8fbi=\x98\ ! \xb1\x84\xdd\x94\x8c\xe0/W?\x0f;0\xe44\xd2jh\xeb\xa8\xbey\xaf\x9dW\xff\xf7\ ! \xbf\x83\t\xd4_\xaf\xf1\xa7\xf4\xfe\x9fe\x01\xd3<\xd7\xa5\xc3\xfb\x7f\x96\ ! \x01JM\x1d\x06\xf0\xdd\x1f\xefm1a\xc2\xb3{4\x98\xa0\t.t\xb5\xe9\xf9\xd7>%\ ! \x8a\x91\xf7\xa0\xb4\x87\xbevr\xc5\x9e\xacl\xdce\xb6\xac\x03\xfb=\xf6\xfct8\ ! \xf45u\xb3*O$\x8f\xfa\x918\nA\xf7y\xb5\xdf#\xa1\xb4\x0e\xaf\xe5\x01\x99\x808\ ! \x8a\xb6\x94V\x89\x8as^(\xe2\xea8\xe2\x8a[H\xba\x8a\xa9\x98\xaeI3^\'(M\xea%\ ! \x8b\xcf)^u\x04\x10\xf3\xa4T\xef\xa5\xddP\xed\xbe}\xdb\xa7\x17\xc3v\xabj\xfa\ ! 
\xd9H\x12\x84\xe8\xc8\xfeSA\x17\x12\x82\x92\x8f\xedF&u\xda\t\x91U\xb6\xac\ ! \xa8\xa2o\xfc\xdc!\xe5KS\xb7\x14\x89a$$~\xdc,\xa4(9(\xe4\x16P\xf7LW\xaa\xe7>\ ! ^\xec\xc1\xb1\x98\x91\xee\xe2\xce\xf5\xa4>\xee\xcfZ\xd3\x87"oJ$\xdb\xea\x80\ ! \xae\xfc\xce5\x85>\xbd\xcdW\x92\xbb~\x96\x9a5\xd9}H^\x98\xb1\r9\xab\x8bu\xde\ ! |\x80e\x04s\xe4\xa4]k\xbc_\xad\xb0\x06\x80\x91m(\xf1\xfe\x029\xa3[\x95,\xb2Q\ ! \x0c\x955\x8f\x9djI\x9c\xb9\xb9rS\xb8f\x85H\xe5\x03}\x0eP\xdfI\x1f\xee]\r\ ! \xd8B"\xf7\xd7J\x14\xbba3\xa8\xbb\x1bK\xc0\xa6\xff\xc5&\xbf/B\xec\xf3\xac\ ! \xa4\xe0\xb9\xd1G\'#\xf1\xf5E\x9e\x7f6\xe9\x9e\xc20(@x\x08\xeaj.\xd6\xecMz4$\ ! \xf2\xbdv{\xa7\xb8*\x87+\x1fu\xb7\r\r>Vi\x08\x16\x0b\xa9=\xeal]{\xe9\x94\x08\ ! \xb1o)+\xfc\xe4\xe4\xe4\xf4\xec\x89BlCj\xf34\xa5MM\xe8\xd5\xeb7O\x10\xba)d\ ! \xc8\xc3\x18\xf2\x8f\x9d..4@\xb2\xd4\x17\xcc\xba\x84d\xe4\x84\xb7(\xc8\xa0\ ! \x85\x8f.\xe3\xa0R\xa1\x17LF\xff\xf5\xfd\x9f\x9eX\xf0k \xd5*\xef\xad(\x01\ ! \x07\x9f\x10sg\x90Sd\xa2\xeaJ\xdel\x10\xc5c/\x05l\x1b2Kb\xc6\xd0=\x0f\xfa\ ! \xc4+Q:\xd7t\xf8C6\xa8\x93\x13\x90?\x8d`ki\xaes\x8eX\x12\xdf\x0b-\xb0\x10\ ! \xf5\x180\x82\xd5\x97N\x87\xee\x87N\xcd\xad\n\xd1X\xb8%\xb2fP\xd1\x8c /\xda\ ! \xf7\xf86J\xc41\x9e-\x12w?\'\x04@\xbd2_\x91\xe09\xf6\x8b\x08-RC\x02\xeaMp\\\ ! \xe8\x90\x1bu\xeb\xd8yp+TUdM\x1d\xb2\xfd=\x17\xdec\xdbs\x04\xd8\xa2k\xe2\xe9\ ! \xf0y\xe8Ao\xf5\x18O\x03\xa2\x05N\x03\x89\xf6,\x0e\x87?\xfe\xdaG\xf1\xa8f\ ! \xfc\x03P{B\xe7\xea\xfeZ\xdd~\xf8t\xfb\xc1\xa2\x9aG\xd7\xe2B\xe6\xa0\x8e\xd6\ ! \xb3{\xe8\xd9\xbd\xde\xc7bXs\xc5\x82\x1f\xf8?*yR\x11\x10\'S=\x13(\x98k\x1c\ ! \xae\xec\xd5<~\x02d\xbe\xb4r\xcb\xc9\x80\xdf\xa5(uu\x9d\xaa\x9c#\xaa\x1c\x17\ ! 5\rN_\xc8\xc8\x82\xee#\xfb\xe6\xeb\x9bq7\\m\x10r\xfd`l\xee\xb9\xc6\xb9ijY\ ! \xf4o\xc8\xe5\xa2\xf9l\x8e\x86,\xde\xbe{\x7f\xe5\x8c\x19\xbeB\x1c\xb82\\\xdc\ ! :\xb3\xc0\x02NZ9\xfb\xe2\xd2\x84\x8a\x1e)\xba\xe7\xa6\xd5\x1dm\xad\x99Lx(\ ! \x12/x\x8f\x7f\xec\xf7\x87\xfd+\xe7\xc5e{\x9d.\xb9\x1f?\xde\xfc\xb4h}:\xa2?(\ ! 
9}\xe3\xd1*\xe4\x83\x8c\xf7\xd2\t\x1c\x84q\xe9g\xc0\xd3\xcf)\xcf\xbdvX\xda]\ ! \x8c\x0ed#\x92\x1c\x8ex\x89\xd5h\x8c\x8f\xe8\x02\xffd2\xf1\x83\xe7\xefG\x9e!\ ! \xa7o\xf4[\xc4\x9e\xba!\xb2\x01\xc75Lc\xb8\xc1\xc6\xael@0\xc0?<\xde>ur\xa7\ ! \xdd\x93k\xce\xca\xc6\xfbt\x88~pQ\x15\x05~I\xfd7\xf4\xd9\xd3\x81\xfd8\xa7l\ ! \xf6\xdc\xa8\xa0\x9eKS\x01\x1e\xc8\xed\x1f\xd6\x9a\xfb\x1c\xa3N\xce\x03\xc3\ ! \x1cY\xbac\xc7\xa4\xc8}H\xd1_\x8bP\xbaR\xf3\xea\xc6!&\x1e\xb4\n\xb1\xc1Z\x84\ ! \xf9\xd6\xe8\x8e?\x08\xafq\x8f5\x11\xdb\x80\xc8\xd2\xbc\xb6p\xa6\x10\xda\xa2\ ! \xc5\x8a\x93\xef\xeb\x81\xcd\xef`\xebV\xd3\xc6\x1b\x0bM|\x80\xc5\x16\x83_\ ! \xfd\xa1\xc1\xa6R\x91\xdcf\xa6\xe26p\x8b\xd6\x19\xe1a\xa5\xec\xce\xb4\x11\n\ ! \xb1?K\x19\x85\xbb\xe6\xb3I\xb3$~(m\xbf\xce\x82\xe7:\x00]I\xa9\xb04`\xa3\xcf\ ! \x12D\xdf\xaf>\x1a\xcflnC\xc1H^\x0c\x95\x15gN\xaa\x1b\r\x7fp\xddT\x8c|\xbd2\ ! \xfa;\xbf\xe5\x01\x1b\x0c\xbf\xd1\xdfCs\xf7\x8b\x04\xe7\x82\x1e\x02\xf1R\x01\ ! \x1e{\xc1\x07\x99\xc1\xa1,\xd9\xd9\x84\x9d\xbc\x9aN^M\xcfN\xd9\xe9drjd?j\xad\ ! m\xd6R*\x9f\x8e\xc7\xdb\xedvT\xda\x05G\xb2X\x8f\x1d\x06\xf0\x1d\x97\xef\\ \ ! \xb7,\xb3\xb3\xcd\xd6!\x11\xa2#\xc0\xb4\xcc|(\x14Gy\xd8Y\xe6F\xc4D\xf9\xa0\ ! \x7f\xfd\x92\x86\x89\xdc\xd1L\xd8K\x90\xc09\xb3\x931/\xd5\xdc\x9aX\xa4\x89A\ ! \x9c8\x1e4EF;\xfd\xf1\xbc\xfe\xbfv\xfe\x05\xfb\x91lY' ) ### end From richiehindle at users.sourceforge.net Fri Jan 24 15:59:24 2003 From: richiehindle at users.sourceforge.net (Richie Hindle) Date: Fri Jan 24 18:59:28 2003 Subject: [Spambayes-checkins] spambayes/spambayes OptionConfig.py,1.1,1.2 Options.py,1.8,1.9 Message-ID: Update of /cvsroot/spambayes/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv852 Modified Files: OptionConfig.py Options.py Log Message: The configuration page now tells you where your ini file is. It also respects the rules in Options.py for finding the ini file. 
I've pulled out all the code for running OptionsConfig.py as a standalone script (this is part of moving it into the spambayes package from the scripts area, but I've done it in stages to minimise lossage through abusing CVS). Index: OptionConfig.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/spambayes/OptionConfig.py,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** OptionConfig.py 24 Jan 2003 20:51:31 -0000 1.1 --- OptionConfig.py 24 Jan 2003 23:59:22 -0000 1.2 *************** *** 6,21 **** This module implements a browser based Spambayes option file configuration ! utility. Users may use the pages in this application to customize the ! options in the bayescustomize.ini file. ! ! This does not support the BAYESCUSTOMIZE environment variable. Is this even ! used anywhere? ! ! By default, this module forms a part of the web user interface provided by ! pop3proxy.py. You can also run it standalone, but only for historical ! reasons. To do this, just invoke OptionConfig.py ! The port number is the port the http server will listen on, and defaults to ! 8000. Your web browser should launch automatically; if it doesn't, then ! point it to http://locahost:8000 (or whatever port you chose). To Do: --- 6,11 ---- This module implements a browser based Spambayes option file configuration ! utility. Users may use it (via pop3proxy.py) to customize the options in ! the bayescustomize.ini file. To Do: *************** *** 38,48 **** from spambayes import Dibbler, PyMeldLite ! from spambayes.Options import options ! import re ! import os, sys import ConfigParser - import copy - - IMAGES = ('helmet', 'config', 'status') # This control dictionary maps http request parameters and template fields --- 28,34 ---- from spambayes import Dibbler, PyMeldLite ! from spambayes.Options import options, optionsPathname ! 
import sys import ConfigParser # This control dictionary maps http request parameters and template fields *************** *** 160,184 **** class OptionsConfigurator(Dibbler.HTTPPlugin): ! def __init__(self, proxyUI=None): Dibbler.HTTPPlugin.__init__(self) - - # Store the proxy UI; this won't be given when we're standalone. self.proxyUI = proxyUI ! # Load up the necessary resources: ui.html and the GIFs. ! from pop3proxy import readUIResources ! htmlSource, self._images = readUIResources() ! self.html = PyMeldLite.Meld(htmlSource) ! ! # Adjust the HTML according to whether we're running standalone or as ! # a part of the proxy. ! if not self.proxyUI: ! self.html.productName = "Spambayes Options Configurator" ! self.html.footerHome = "Spambayes Options Configurator" ! self.html.shutdownButton.value = "Shutdown Configurator" ! else: ! # "Save and Shutdown" is confusing here - it means "Save database" ! # but that's not clear. ! self.html.shutdownTableCell = " " def onConfig(self): --- 146,157 ---- class OptionsConfigurator(Dibbler.HTTPPlugin): ! def __init__(self, proxyUI): Dibbler.HTTPPlugin.__init__(self) self.proxyUI = proxyUI + self.html = self.proxyUI.html.clone() ! # "Save and Shutdown" is confusing here - it means "Save database" ! # but that's not clear. ! self.html.shutdownTableCell = " " def onConfig(self): *************** *** 195,199 **** bcini.set(sect, opt, options._config.get(sect, opt)) ! bcini.read('bayescustomize.ini') # Start with an empty config form then add the sections. --- 168,172 ---- bcini.set(sect, opt, options._config.get(sect, opt)) ! bcini.read(optionsPathname) # Start with an empty config form then add the sections. *************** *** 201,204 **** --- 174,178 ---- html.mainContent = self.html.configForm.clone() html.mainContent.configFormContent = "" + html.mainContent.optionsPathname = optionsPathname # Loop though the sections in the `page_layout` structure above. *************** *** 253,272 **** html.configFormContent += section ! 
# Customise the page according to whether we're standalone or a proxy. ! if self.proxyUI: ! html.title = 'Home > Configure' ! html.pagename = '> Configure' ! else: ! html.title = 'Home' ! del html.homelink ! html.pagename = 'Home' ! self.writeOKHeaders('text/html') self.write(html) - # Implement `onHome` for the standalone version. In the POP3 proxy, the - # proxy UI's `onHome` will take precedence over this one. - onHome = onConfig - def onChangeopts(self, **parms): html = self.html.clone() --- 227,235 ---- html.configFormContent += section ! html.title = 'Home > Configure' ! html.pagename = '> Configure' self.writeOKHeaders('text/html') self.write(html) def onChangeopts(self, **parms): html = self.html.clone() *************** *** 283,298 **** updateIniFile(parms) html.mainContent.heading = "Options Changed" ! if self.proxyUI: ! html.mainContent.boxContent = OK_MESSAGE % "Options changed" ! self.proxyUI.reReadOptions() ! else: ! html.mainContent.boxContent = """The options changes you've made ! have been recorded. You will need to restart any Spambayes ! processes you have running, such as the pop3proxy, in order ! for your changes to take effect. When you return to the ! Options Configuration homepage, you may need to refresh the ! page to see the changes you have made.""" html.title = 'Home > Options Changed' html.pagename = '> Options Changed' --- 246,253 ---- updateIniFile(parms) + self.proxyUI.reReadOptions() html.mainContent.heading = "Options Changed" ! html.mainContent.boxContent = OK_MESSAGE % "Options changed" html.title = 'Home > Options Changed' html.pagename = '> Options Changed' *************** *** 302,318 **** def onRestoredefaults(self, how): restoreIniDefaults() html = self.html.clone() html.mainContent = self.html.headedBox.clone() html.mainContent.heading = "Option Defaults Restored" ! if self.proxyUI: ! html.mainContent.boxContent = OK_MESSAGE % "Defaults restored" ! self.proxyUI.reReadOptions() ! else: ! 
html.mainContent.boxContent = """All options have been reverted to ! their default values. You will need to restart any Spambayes ! processes you have running, such as the pop3proxy, in order for ! your changes to take effect. When you return to the Options ! Configuration homepage, you may need to refresh the page to see ! the changes you have made.""" html.title = 'Home > Defaults Restored' html.pagename = '> Defaults Restored' --- 257,266 ---- def onRestoredefaults(self, how): restoreIniDefaults() + self.proxyUI.reReadOptions() + html = self.html.clone() html.mainContent = self.html.headedBox.clone() html.mainContent.heading = "Option Defaults Restored" ! html.mainContent.boxContent = OK_MESSAGE % "Defaults restored" html.title = 'Home > Defaults Restored' html.pagename = '> Defaults Restored' *************** *** 320,348 **** self.write(html) - def onSave(self, how): - # Really 'shutdown'; this is the button in the footer, not on the - # form. Again, the proxy UI's `onSave` will override this one when - # we're running as part of the proxy. - html = self.html.clone() - del html.helmet - del html.homelink - html.shutdownTableCell = " " - html.mainContent = self.html.shutdownMessage - html.title = 'Home > Shutdown' - html.pagename = 'Shutdown' - self.writeOKHeaders('text/html') - self.write(html) - self.close() - sys.exit() - - def _writeImage(self, image): - self.writeOKHeaders('image/gif') - self.write(self._images[image]) - - # If you are easily offended, look away now... - for imageName in IMAGES: - exec "def %s(self): self._writeImage('%s')" % \ - ("on%sGif" % imageName.capitalize(), imageName) - def editInput(parms): --- 268,271 ---- *************** *** 370,375 **** sco = float(sco) except ValueError: ! errmsg += '
  • Spam cutoff must be a number, \ ! between 0 and 1
  • \n' # edit 0 <= hamcutoff < spamcutoff <= 1 --- 293,297 ---- sco = float(sco) except ValueError: ! errmsg += '
  • Spam cutoff must be a number, between 0 and 1
  • \n' # edit 0 <= hamcutoff < spamcutoff <= 1 *************** *** 410,416 **** def updateIniFile(parms): ! # assumes bayescustomize.ini is in this process' working directory ! ! inipath = os.path.abspath('bayescustomize.ini') bcini = ConfigParser.ConfigParser() --- 332,337 ---- def updateIniFile(parms): ! # Get the pathname of the ini file as discovered by the Options module. ! inipath = optionsPathname bcini = ConfigParser.ConfigParser() *************** *** 440,446 **** def restoreIniDefaults(): ! # assumes bayescustomize.ini is in this process' working directory ! ! inipath = os.path.abspath('bayescustomize.ini') bcini = ConfigParser.ConfigParser() --- 361,366 ---- def restoreIniDefaults(): ! # Get the pathname of the ini file as discovered by the Options module. ! inipath = optionsPathname bcini = ConfigParser.ConfigParser() *************** *** 458,476 **** bcini.write(o) o.close() - - # - # Running this standalone is no longer required, and doesn't work out of - # the box. The code's here for reference only. - # - - def run(port): - httpServer = Dibbler.HTTPServer(port) - httpServer.register(OptionsConfigurator()) - Dibbler.run(launchBrowser=True) - - if __name__ == '__main__': - if len(sys.argv) > 1: - port = int(sys.argv[1]) - else: - port = 8000 - run(port) --- 378,379 ---- Index: Options.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/spambayes/Options.py,v retrieving revision 1.8 retrieving revision 1.9 diff -C2 -d -r1.8 -r1.9 *** Options.py 22 Jan 2003 05:23:17 -0000 1.8 --- Options.py 24 Jan 2003 23:59:22 -0000 1.9 *************** *** 10,14 **** import StringIO import ConfigParser ! try: from sets import Set except ImportError: --- 10,14 ---- import StringIO import ConfigParser ! try: from sets import Set except ImportError: *************** *** 545,548 **** --- 545,555 ---- return output.getvalue() + + # `optionsPathname` is the pathname of the last ini file in the list. 
+ # This is where the web-based configuration page will write its changes. + # If no ini files are found, it defaults to bayescustomize.ini in the + # current working directory. + optionsPathname = None + options = OptionsClass() *************** *** 555,559 **** alternate = os.getenv('BAYESCUSTOMIZE') if alternate: ! options.mergefiles(alternate.split()) else: alts = [] --- 562,568 ---- alternate = os.getenv('BAYESCUSTOMIZE') if alternate: ! filenames = alternate.split() ! options.mergefiles(filenames) ! optionsPathname = os.path.abspath(filenames[-1]) else: alts = [] *************** *** 564,565 **** --- 573,578 ---- if alts: options.mergefiles(alts) + optionsPathname = os.path.abspath(alts[-1]) + + if not optionsPathname: + optionsPathname = os.path.abspath('bayescustomize.ini') From montanaro at users.sourceforge.net Fri Jan 24 20:35:42 2003 From: montanaro at users.sourceforge.net (Skip Montanaro) Date: Fri Jan 24 23:35:46 2003 Subject: [Spambayes-checkins] spambayes pop3proxy.py,1.41,1.42 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv28358 Modified Files: pop3proxy.py Log Message: * UserInterface: add onUpload method from proxytrainer for use by proxytee * onReview: make sure judgement is just the "spam" part of "spam; 1.00" Index: pop3proxy.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/pop3proxy.py,v retrieving revision 1.41 retrieving revision 1.42 diff -C2 -d -r1.41 -r1.42 *** pop3proxy.py 24 Jan 2003 22:45:09 -0000 1.41 --- pop3proxy.py 25 Jan 2003 04:35:40 -0000 1.42 *************** *** 651,654 **** --- 651,685 ---- self._writePostamble() + def onUpload(self, params): + """Save a message for later training.""" + # Upload or paste? Spam or ham? + content = params.get('file') or params.get('text') + + # Convert platform-specific line endings into unix-style. + content = content.replace('\r\n', '\n').replace('\r', '\n') + + # Single message or mbox? 
+ if content.startswith('From '): + # Get a list of raw messages from the mbox content. + class SimpleMessage: + def __init__(self, fp): + self.guts = fp.read() + contentFile = StringIO.StringIO(content) + mbox = mailbox.PortableUnixMailbox(contentFile, SimpleMessage) + messages = map(lambda m: m.guts, mbox) + else: + # Just the one message. + messages = [content] + + for m in messages: + message = state.unknownCorpus.makeMessage("%d"%self.messageName) + message.setSubstance(m) + state.unknownCorpus.addMessage(message) + self.messageName += 1 + + # Save the database and return a link Home and another training form. + self.doSave() + self.push("

    OK.

    ") + def onTrain(self, file, text, which): """Train on an uploaded or pasted message.""" *************** *** 884,889 **** cachedMessage = state.unknownCorpus[key] message = mboxutils.get_message(cachedMessage.getSubstance()) ! judgement = message[options.hammie_header_name] or \ ! options.header_unsure_string messageInfo = self._makeMessageInfo(message) keyedMessageInfo[judgement].append((key, messageInfo)) --- 915,923 ---- cachedMessage = state.unknownCorpus[key] message = mboxutils.get_message(cachedMessage.getSubstance()) ! judgement = message[options.hammie_header_name] ! if judgement is None: ! judgement = options.header_unsure_string ! else: ! judgement = judgement.split(';')[0].strip() messageInfo = self._makeMessageInfo(message) keyedMessageInfo[judgement].append((key, messageInfo)) From mhammond at users.sourceforge.net Fri Jan 24 22:51:04 2003 From: mhammond at users.sourceforge.net (Mark Hammond) Date: Sat Jan 25 02:35:59 2003 Subject: [Spambayes-checkins] spambayes/Outlook2000/dialogs FilterDialog.py,1.13,1.14 TrainingDialog.py,1.9,1.10 Message-ID: Update of /cvsroot/spambayes/spambayes/Outlook2000/dialogs In directory sc8-pr-cvs1:/tmp/cvs-serv21734 Modified Files: FilterDialog.py TrainingDialog.py Log Message: Ensure all default folders are "fully qualified". Index: FilterDialog.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/dialogs/FilterDialog.py,v retrieving revision 1.13 retrieving revision 1.14 diff -C2 -d -r1.13 -r1.14 *** FilterDialog.py 23 Jan 2003 12:17:35 -0000 1.13 --- FilterDialog.py 25 Jan 2003 06:51:01 -0000 1.14 *************** *** 87,91 **** if len(self.watch_folder_ids)==0 and mgr.outlook is not None: inbox = self.mgr.outlook.Session.GetDefaultFolder(constants.olFolderInbox) ! 
self.watch_folder_ids = [inbox.EntryID] self.spam_folder_id = mgr.config.filter.spam_folder_id --- 87,91 ---- if len(self.watch_folder_ids)==0 and mgr.outlook is not None: inbox = self.mgr.outlook.Session.GetDefaultFolder(constants.olFolderInbox) ! self.watch_folder_ids = [(inbox.StoreID, inbox.EntryID)] self.spam_folder_id = mgr.config.filter.spam_folder_id *************** *** 389,393 **** mgr.config = config = Config() config.filter = Config() ! config.filter.watch_folder_ids = [outlook.Session.GetDefaultFolder(constants.olFolderInbox).EntryID] config.filter.watch_folder_include_sub = True config.filter.spam_folder_id = "" --- 389,394 ---- mgr.config = config = Config() config.filter = Config() ! inbox = outlook.Session.GetDefaultFolder(constants.olFolderInbox) ! config.filter.watch_folder_ids = [(inbox.StoreID, inbox.EntryID)] config.filter.watch_folder_include_sub = True config.filter.spam_folder_id = "" *************** *** 398,402 **** config.filter.unsure_threshold = 20 config.filter_now=Config() ! config.filter_now.folder_ids = [outlook.Session.GetDefaultFolder(constants.olFolderInbox).EntryID] config.filter_now.include_sub = True config.filter_now.only_unread = False --- 399,404 ---- config.filter.unsure_threshold = 20 config.filter_now=Config() ! inbox = outlook.Session.GetDefaultFolder(constants.olFolderInbox) ! 
config.filter_now.folder_ids = [(inbox.StoreID, inbox.EntryID)] config.filter_now.include_sub = True config.filter_now.only_unread = False Index: TrainingDialog.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/dialogs/TrainingDialog.py,v retrieving revision 1.9 retrieving revision 1.10 diff -C2 -d -r1.9 -r1.10 *** TrainingDialog.py 23 Jan 2003 12:17:35 -0000 1.9 --- TrainingDialog.py 25 Jan 2003 06:51:02 -0000 1.10 *************** *** 71,75 **** if len(self.config.ham_folder_ids)==0 and self.mgr.outlook is not None: inbox = self.mgr.outlook.Session.GetDefaultFolder(constants.olFolderInbox) ! self.config.ham_folder_ids = [inbox.EntryID] # If we have no known spam folders, but do have a spam folder # defined in the filters, use it. --- 71,75 ---- if len(self.config.ham_folder_ids)==0 and self.mgr.outlook is not None: inbox = self.mgr.outlook.Session.GetDefaultFolder(constants.olFolderInbox) ! self.config.ham_folder_ids = [(inbox.StoreID, inbox.EntryID)] # If we have no known spam folders, but do have a spam folder # defined in the filters, use it. From richiehindle at users.sourceforge.net Sat Jan 25 02:16:20 2003 From: richiehindle at users.sourceforge.net (Richie Hindle) Date: Sat Jan 25 05:16:24 2003 Subject: [Spambayes-checkins] spambayes pop3proxy.py,1.42,1.43 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv13465 Modified Files: pop3proxy.py Log Message: Made onUpload work, so you should now be able to use Skip's proxytee.py with pop3proxy.py. Index: pop3proxy.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/pop3proxy.py,v retrieving revision 1.42 retrieving revision 1.43 diff -C2 -d -r1.42 -r1.43 *** pop3proxy.py 25 Jan 2003 04:35:40 -0000 1.42 --- pop3proxy.py 25 Jan 2003 10:16:16 -0000 1.43 *************** *** 483,498 **** # Cache the message; don't pollute the cache with test messages. 
if command == 'RETR' and not state.isTest: - # The message name is the time it arrived, with a uniquifier - # appended if two arrive within one clock tick of each other. - messageName = "%10.10d" % long(time.time()) - if messageName == state.lastBaseMessageName: - state.lastBaseMessageName = messageName - messageName = "%s-%d" % (messageName, state.uniquifier) - state.uniquifier += 1 - else: - state.lastBaseMessageName = messageName - state.uniquifier = 2 - # Write the message into the Unknown cache. message = state.unknownCorpus.makeMessage(messageName) message.setSubstance(messageText) --- 483,488 ---- # Cache the message; don't pollute the cache with test messages. if command == 'RETR' and not state.isTest: # Write the message into the Unknown cache. + messageName = state.getNewMessageName() message = state.unknownCorpus.makeMessage(messageName) message.setSubstance(messageText) *************** *** 651,663 **** self._writePostamble() ! def onUpload(self, params): ! """Save a message for later training.""" ! # Upload or paste? Spam or ham? ! content = params.get('file') or params.get('text') ! ! # Convert platform-specific line endings into unix-style. ! content = content.replace('\r\n', '\n').replace('\r', '\n') ! ! # Single message or mbox? if content.startswith('From '): # Get a list of raw messages from the mbox content. --- 641,647 ---- self._writePostamble() ! def _convertUploadToMessageList(self, content): ! """Returns a list of raw messages extracted from uploaded content. ! You can upload either a single message or an mbox file.""" if content.startswith('From '): # Get a list of raw messages from the mbox content. *************** *** 667,684 **** contentFile = StringIO.StringIO(content) mbox = mailbox.PortableUnixMailbox(contentFile, SimpleMessage) ! messages = map(lambda m: m.guts, mbox) else: # Just the one message. ! messages = [content] for m in messages: ! 
message = state.unknownCorpus.makeMessage("%d"%self.messageName) message.setSubstance(m) state.unknownCorpus.addMessage(message) - self.messageName += 1 ! # Save the database and return a link Home and another training form. ! self.doSave() ! self.push("

    OK.

    ") def onTrain(self, file, text, which): --- 651,674 ---- contentFile = StringIO.StringIO(content) mbox = mailbox.PortableUnixMailbox(contentFile, SimpleMessage) ! return map(lambda m: m.guts, mbox) else: # Just the one message. ! return [content] + def onUpload(self, file): + """Save a message for later training - used by Skip's proxytee.py.""" + # Convert platform-specific line endings into unix-style. + file = file.replace('\r\n', '\n').replace('\r', '\n') + + # Get a message list from the upload and write it into the cache. + messages = self._convertUploadToMessageList(file) for m in messages: ! messageName = state.getNewMessageName() ! message = state.unknownCorpus.makeMessage(messageName) message.setSubstance(m) state.unknownCorpus.addMessage(message) ! # Return a link Home. ! self.write("

    OK. Return Home.

    ") def onTrain(self, file, text, which): *************** *** 693,708 **** content = content.replace('\r\n', '\n').replace('\r', '\n') ! # Single message or mbox? ! if content.startswith('From '): ! # Get a list of raw messages from the mbox content. ! class SimpleMessage: ! def __init__(self, fp): ! self.guts = fp.read() ! contentFile = StringIO.StringIO(content) ! mbox = mailbox.PortableUnixMailbox(contentFile, SimpleMessage) ! messages = map(lambda m: m.guts, mbox) ! else: ! # Just the one message. ! messages = [content] # Append the message(s) to a file, to make it easier to rebuild --- 683,688 ---- content = content.replace('\r\n', '\n').replace('\r', '\n') ! # The upload might be a single message or am mbox file. ! messages = self._convertUploadToMessageList(content) # Append the message(s) to a file, to make it easier to rebuild *************** *** 1136,1140 **** self.numUnsure = 0 ! # Unique names for cached messages - see BayesProxy.onRetr self.lastBaseMessageName = '' self.uniquifier = 2 --- 1116,1120 ---- self.numUnsure = 0 ! # Unique names for cached messages - see `getNewMessageName()` below. self.lastBaseMessageName = '' self.uniquifier = 2 *************** *** 1203,1206 **** --- 1183,1198 ---- self.spamCorpus.addObserver(self.spamTrainer) self.hamCorpus.addObserver(self.hamTrainer) + + def getNewMessageName(self): + # The message name is the time it arrived, with a uniquifier + # appended if two arrive within one clock tick of each other. 
+ messageName = "%10.10d" % long(time.time()) + if messageName == self.lastBaseMessageName: + messageName = "%s-%d" % (messageName, self.uniquifier) + self.uniquifier += 1 + else: + self.lastBaseMessageName = messageName + self.uniquifier = 2 + return messageName From richiehindle at users.sourceforge.net Sat Jan 25 04:56:00 2003 From: richiehindle at users.sourceforge.net (Richie Hindle) Date: Sat Jan 25 07:56:04 2003 Subject: [Spambayes-checkins] spambayes pop3proxy.py,1.43,1.44 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv7658 Modified Files: pop3proxy.py Log Message: The POP3 proxy doesn't support pipelining, so it now filters out that capability from the CAPA response. This should fix François' strange errors whereby the classification headers were appearing in the middle of message bodies, and messages were being split apart when they were stored in the cache. Index: pop3proxy.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/pop3proxy.py,v retrieving revision 1.43 retrieving revision 1.44 diff -C2 -d -r1.43 -r1.44 *** pop3proxy.py 25 Jan 2003 10:16:16 -0000 1.43 --- pop3proxy.py 25 Jan 2003 12:55:58 -0000 1.44 *************** *** 261,265 **** 'STAT', 'DELE', 'NOOP', 'RSET', 'KILL']: return False ! elif self.command in ['RETR', 'TOP']: return True elif self.command in ['LIST', 'UIDL']: --- 261,265 ---- 'STAT', 'DELE', 'NOOP', 'RSET', 'KILL']: return False ! elif self.command in ['RETR', 'TOP', 'CAPA']: return True elif self.command in ['LIST', 'UIDL']: *************** *** 316,319 **** --- 316,325 ---- def onResponse(self): + # We don't support pipelining, so if the command is CAPA and the + # response includes PIPELINING, hack out that line of the response. 
+ if self.command == 'CAPA': + pipelineRE = r'(?im)^PIPELINING[^\n]*\n' + self.response = re.sub(pipelineRE, '', self.response) + # Pass the request and the raw response to the subclass and # send back the cooked response. *************** *** 1296,1300 **** self.okCommands = ['USER', 'PASS', 'APOP', 'NOOP', 'DELE', 'RSET', 'QUIT', 'KILL'] ! self.handlers = {'STAT': self.onStat, 'LIST': self.onList, 'RETR': self.onRetr, --- 1302,1307 ---- self.okCommands = ['USER', 'PASS', 'APOP', 'NOOP', 'DELE', 'RSET', 'QUIT', 'KILL'] ! self.handlers = {'CAPA': self.onCapa, ! 'STAT': self.onStat, 'LIST': self.onList, 'RETR': self.onRetr, *************** *** 1333,1336 **** --- 1340,1355 ---- time.sleep(0.02) + def onCapa(self, command, args): + """POP3 CAPA command. This lies about supporting pipelining for + test purposes - the POP3 proxy *doesn't* support pipelining, and + we test that it correctly filters out that capability from the + proxied capability list.""" + lines = ["+OK Capability list follows", + "PIPELINING", + "TOP", + ".", + ""] + return '\r\n'.join(lines) + def onStat(self, command, args): """POP3 STAT command.""" *************** *** 1421,1429 **** proxyReady.wait() ! # Connect to the proxy. proxy = socket.socket(socket.AF_INET, socket.SOCK_STREAM) proxy.connect(('localhost', 8111)) response = proxy.recv(100) assert response == "+OK ready\r\n" # Stat the mailbox to get the number of messages. --- 1440,1463 ---- proxyReady.wait() ! # Connect to the proxy and the test server. proxy = socket.socket(socket.AF_INET, socket.SOCK_STREAM) proxy.connect(('localhost', 8111)) response = proxy.recv(100) assert response == "+OK ready\r\n" + pop3Server = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + pop3Server.connect(('localhost', 8110)) + response = pop3Server.recv(100) + assert response == "+OK ready\r\n" + + # Verify that the test server claims to support pipelining. 
+ pop3Server.send("capa\r\n") + response = pop3Server.recv(1000) + assert response.find("PIPELINING") >= 0 + + # Ask for the capabilities via the proxy, and verify that the proxy + # is filtering out the PIPELINING capability. + proxy.send("capa\r\n") + response = proxy.recv(1000) + assert response.find("PIPELINING") == -1 # Stat the mailbox to get the number of messages. *************** *** 1456,1461 **** proxy.sendall("kill\r\n") proxy.recv(100) - pop3Server = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - pop3Server.connect(('localhost', 8110)) pop3Server.sendall("kill\r\n") pop3Server.recv(100) --- 1490,1493 ---- From montanaro at users.sourceforge.net Sat Jan 25 08:17:25 2003 From: montanaro at users.sourceforge.net (Skip Montanaro) Date: Sat Jan 25 11:17:28 2003 Subject: [Spambayes-checkins] spambayes proxytrainer.py,1.2,NONE Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv6109 Removed Files: proxytrainer.py Log Message: deleting - pop3proxy.py has subsumed all this functionality --- proxytrainer.py DELETED --- From montanaro at users.sourceforge.net Sat Jan 25 08:18:43 2003 From: montanaro at users.sourceforge.net (Skip Montanaro) Date: Sat Jan 25 11:18:45 2003 Subject: [Spambayes-checkins] spambayes setup.py,1.14,1.15 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv7145 Modified Files: setup.py Log Message: proxytrainer.py is dead! long live pop3proxy.py! 
Index: setup.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/setup.py,v retrieving revision 1.14 retrieving revision 1.15 diff -C2 -d -r1.14 -r1.15 *** setup.py 24 Jan 2003 20:43:05 -0000 1.14 --- setup.py 25 Jan 2003 16:18:41 -0000 1.15 *************** *** 38,42 **** 'hammiefilter.py', 'pop3proxy.py', - 'proxytrainer.py', 'proxytee.py', ], --- 38,41 ---- From richiehindle at users.sourceforge.net Sat Jan 25 09:42:21 2003 From: richiehindle at users.sourceforge.net (Richie Hindle) Date: Sat Jan 25 12:42:24 2003 Subject: [Spambayes-checkins] spambayes helmet.gif,1.1,NONE Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv32313 Removed Files: helmet.gif Log Message: Removed helmet.gif. proxytrainer.py, which Skip has just removed, was the last thing that used it. Things that need it should now import it from spambayes.resources.helmet_gif. --- helmet.gif DELETED --- From richiehindle at users.sourceforge.net Sat Jan 25 11:48:57 2003 From: richiehindle at users.sourceforge.net (Richie Hindle) Date: Sat Jan 25 14:49:01 2003 Subject: [Spambayes-checkins] spambayes MANIFEST.in,1.3,1.4 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv24925 Modified Files: MANIFEST.in Log Message: Don't try to include *.gif - there aren't any any more. Index: MANIFEST.in =================================================================== RCS file: /cvsroot/spambayes/spambayes/MANIFEST.in,v retrieving revision 1.3 retrieving revision 1.4 diff -C2 -d -r1.3 -r1.4 *** MANIFEST.in 24 Jan 2003 21:39:36 -0000 1.3 --- MANIFEST.in 25 Jan 2003 19:48:55 -0000 1.4 *************** *** 6,9 **** recursive-include utilities *.py *.txt recursive-include testtools *.py *.txt ! include *.txt *.py *.gif --- 6,9 ---- recursive-include utilities *.py *.txt recursive-include testtools *.py *.txt ! 
include *.txt *.py From mhammond at users.sourceforge.net Sat Jan 25 17:17:29 2003 From: mhammond at users.sourceforge.net (Mark Hammond) Date: Sat Jan 25 20:17:33 2003 Subject: [Spambayes-checkins] spambayes/Outlook2000/dialogs FolderSelector.py,1.11,1.12 Message-ID: Update of /cvsroot/spambayes/spambayes/Outlook2000/dialogs In directory sc8-pr-cvs1:/tmp/cvs-serv11159 Modified Files: FolderSelector.py Log Message: Remove 'None' from our list of folders (I'm not even sure how it gets there!) Index: FolderSelector.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/dialogs/FolderSelector.py,v retrieving revision 1.11 retrieving revision 1.12 diff -C2 -d -r1.11 -r1.12 *** FolderSelector.py 23 Jan 2003 12:17:35 -0000 1.11 --- FolderSelector.py 26 Jan 2003 01:17:27 -0000 1.12 *************** *** 157,161 **** self.select_desc_noun = desc_noun self.select_desc_noun_suffix = desc_noun_suffix ! self.selected_ids = selected_ids self.manager = manager self.checkbox_state = checkbox_state --- 157,161 ---- self.select_desc_noun = desc_noun self.select_desc_noun_suffix = desc_noun_suffix ! self.selected_ids = [sid for sid in selected_ids if sid is not None] self.manager = manager self.checkbox_state = checkbox_state From mhammond at users.sourceforge.net Mon Jan 27 06:12:23 2003 From: mhammond at users.sourceforge.net (Mark Hammond) Date: Mon Jan 27 09:12:28 2003 Subject: [Spambayes-checkins] spambayes/Outlook2000 addin.py,1.45,1.46 Message-ID: Update of /cvsroot/spambayes/spambayes/Outlook2000 In directory sc8-pr-cvs1:/tmp/cvs-serv21556 Modified Files: addin.py Log Message: Prevent exception when Outlook Today is the main view, and hide the spam buttons when a non-mail view is selected. 
Index: addin.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/addin.py,v retrieving revision 1.45 retrieving revision 1.46 diff -C2 -d -r1.45 -r1.46 *** addin.py 24 Jan 2003 12:43:43 -0000 1.45 --- addin.py 27 Jan 2003 14:12:20 -0000 1.46 *************** *** 407,412 **** ButtonRecoverFromSpamEvent, (self.manager, self), Tag = "SpamBayes.RecoverFromSpam") - # Prime our event handler. - self.OnFolderSwitch() # The main tool-bar dropdown with all our entries. --- 407,410 ---- *************** *** 512,515 **** --- 510,515 ---- if not self.have_setup_ui: self.SetupUI() + # Prime the button views. + self.OnFolderSwitch() def OnClose(self): *************** *** 524,533 **** def OnFolderSwitch(self): # Work out what folder we are in. outlook_folder = self.CurrentFolder ! show_delete_as = True ! show_recover_as = False ! try: ! if outlook_folder is not None: mapi_folder = self.manager.message_store.GetFolder(outlook_folder) look_id = self.manager.config.filter.spam_folder_id --- 524,542 ---- def OnFolderSwitch(self): + # Yet another worm-around for our event timing woes. This may + # be the first event ever seen for this explorer if, eg, + # "Outlook Today" is the initial Outlook view. + if not self.have_setup_ui: + self.SetupUI() # Work out what folder we are in. outlook_folder = self.CurrentFolder ! if outlook_folder is None or \ ! outlook_folder.DefaultItemType != constants.olMailItem: ! show_delete_as = False ! show_recover_as = False ! else: ! show_delete_as = True ! show_recover_as = False ! try: mapi_folder = self.manager.message_store.GetFolder(outlook_folder) look_id = self.manager.config.filter.spam_folder_id *************** *** 545,552 **** show_recover_as = True show_delete_as = True ! except: ! print "Error finding the MAPI folders for a folder switch event" ! import traceback ! 
traceback.print_exc() self.but_recover_as.Visible = show_recover_as self.but_delete_as.Visible = show_delete_as --- 554,561 ---- show_recover_as = True show_delete_as = True ! except: ! print "Error finding the MAPI folders for a folder switch event" ! import traceback ! traceback.print_exc() self.but_recover_as.Visible = show_recover_as self.but_delete_as.Visible = show_delete_as *************** *** 573,576 **** --- 582,590 ---- # etc elements. We hack around this by putting the logic in # the first OnActivate call of the explorer itself. + # Except that doesn't always work either - sometimes + # OnActivate will cause a crash when selecting "Open in New Window", + # so we tried OnSelectionChanges, which works OK until there is a + # view with no items (eg, Outlook Today) - so at the end of the + # day, we can never assume we have been initialized! self._DoNewExplorer(explorer, False) From timstone4 at users.sourceforge.net Mon Jan 27 10:07:15 2003 From: timstone4 at users.sourceforge.net (Tim Stone) Date: Mon Jan 27 13:08:18 2003 Subject: [Spambayes-checkins] spambayes pop3proxy.py,1.44,1.45 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv21811 Modified Files: pop3proxy.py Log Message: Changed -d and -p arguments to match hammie arguments of -d DBMfilename and -D PICKLEfilename. -p argument is no longer valid Index: pop3proxy.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/pop3proxy.py,v retrieving revision 1.44 retrieving revision 1.45 diff -C2 -d -r1.44 -r1.45 *** pop3proxy.py 25 Jan 2003 12:55:58 -0000 1.44 --- pop3proxy.py 27 Jan 2003 18:07:11 -0000 1.45 *************** *** 17,22 **** -h : Displays this help message. ! -p FILE : use the named database file ! 
-d : the database is a DBM file rather than a pickle -l port : proxy listens on this port number (default 110) -u port : User interface listens on this port number --- 17,22 ---- -h : Displays this help message. ! -d FILE : use the named DBM database file ! -D FILE : the the named Pickle database file -l port : proxy listens on this port number (default 110) -u port : User interface listens on this port number *************** *** 1501,1505 **** # Read the arguments. try: ! opts, args = getopt.getopt(sys.argv[1:], 'htdbzp:l:u:') except getopt.error, msg: print >>sys.stderr, str(msg) + '\n\n' + __doc__ --- 1501,1505 ---- # Read the arguments. try: ! opts, args = getopt.getopt(sys.argv[1:], 'htbzpd:D:l:u:') except getopt.error, msg: print >>sys.stderr, str(msg) + '\n\n' + __doc__ *************** *** 1516,1523 **** elif opt == '-b': state.launchUI = True ! elif opt == '-d': state.useDB = True - elif opt == '-p': options.pop3proxy_persistent_storage_file = arg elif opt == '-l': state.proxyPorts = [_addressAndPort(arg)] --- 1516,1529 ---- elif opt == '-b': state.launchUI = True ! 
elif opt == '-d': // dbm file state.useDB = True options.pop3proxy_persistent_storage_file = arg + elif opt == '-D': // pickle file + state.useDB = False + options.pop3proxy_persistent_storage_file = arg + elif opt == '-p': // dead option + print >>sys.stderr, "-p option is no longer supported, use -D\n" + print >>sys.stderr, __doc__ + sys.exit() elif opt == '-l': state.proxyPorts = [_addressAndPort(arg)] From npickett at users.sourceforge.net Mon Jan 27 13:12:51 2003 From: npickett at users.sourceforge.net (Neale Pickett) Date: Mon Jan 27 16:12:55 2003 Subject: [Spambayes-checkins] spambayes/contrib muttrc,NONE,1.1 spambayes.el,1.1,1.2 Message-ID: Update of /cvsroot/spambayes/spambayes/contrib In directory sc8-pr-cvs1:/tmp/cvs-serv27156 Modified Files: spambayes.el Added Files: muttrc Log Message: * Change comment on spambayes.el * added muttrc --- NEW FILE: muttrc --- ## ## Mutt keybindings for spambayes ## Author: Neale Pickett ## ## This muttrc assumes you are already filtering with a procmail recipie ## similar to: ## ## :0fw ## | hammiefilter.py -t ## ## ## This binds 'S' to refile as spam and move to a 'caughtspam' folder. ## 'H' will refile as spam and move to your inbox. You will want to use ## these on every misclassified message. ## ## As a special bonus, all tagged spam will be colored red on black. ## ## If you have any problems with this, and especially if you have any ## improvements, please mail them to me! Thanks to Adam Hupp for ## helping out with the muttisms. ## folder-hook . "macro index S '|hammiefilter.py -s\n =caughtspam\n'" folder-hook . "macro pager S '|hammiefilter.py -s\n =caughtspam\n'" folder-hook . "macro index H '|hammiefilter.py -g\r !\r'" folder-hook . 
"macro pager H '|hammiefilter.py -g\r !\r'" color index red black "~h 'X-Hammie-Disposition: spam' ~F" Index: spambayes.el =================================================================== RCS file: /cvsroot/spambayes/spambayes/contrib/spambayes.el,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** spambayes.el 23 Jan 2003 04:46:27 -0000 1.1 --- spambayes.el 27 Jan 2003 21:12:47 -0000 1.2 *************** *** 1,5 **** ;; spambayes.el -- integrate spambayes into Gnus ;; Copyright (C) 2003 Neale Pickett ! ;; Time-stamp: <2003-01-21 20:54:15 neale> ;; This is free software; you can redistribute it and/or modify it under --- 1,5 ---- ;; spambayes.el -- integrate spambayes into Gnus ;; Copyright (C) 2003 Neale Pickett ! ;; Time-stamp: <2003-01-27 13:12:14 neale> ;; This is free software; you can redistribute it and/or modify it under *************** *** 21,24 **** --- 21,33 ---- ;; Functions to put spambayes into Gnus. ;; + ;; This assumes you are already filtering with a procmail recipie + ;; similar to: + ;; + ;; :0fw + ;; | hammiefilter.py -t + ;; + ;; If you can't run procmail on all your incoming messages, you shold + ;; have a look at spam.el, which is included with Oort Gnus. + ;; ;; This binds "B s" to "refile as spam", and "B h" to "refile as ham". ;; After refiling, the message is rescored and respooled. I haven't yet *************** *** 46,75 **** "Path to the hammiefilter program") ! (defun spambayes-retrain (args) "Retrain on all processable articles, or the one under the cursor. ! This will replace the buffer contents with command output." ! (labels ((do-exec (n g args) (with-temp-buffer ! (gnus-request-article-this-buffer n g) ! (shell-command-on-region (point-min) (point-max) ! (concat spambayes-hammiefilter " " args) ! (current-buffer) ! t) ! (gnus-request-replace-article n g (current-buffer))))) ! (let ((g gnus-newsgroup-name) (list gnus-newsgroup-processable)) (if (>= (length list) 1) (while list (let ((n (car list))) ! 
(do-exec n g args)) (setq list (cdr list))) (let ((n (gnus-summary-article-number))) ! (do-exec n g args)))))) (defun spambayes-refile-as-spam () "Retrain and refilter all process-marked messages as spam, then respool them" (interactive) ! (spambayes-retrain "-s -f") (gnus-summary-respool-article nil (gnus-group-method gnus-newsgroup-name))) --- 55,96 ---- "Path to the hammiefilter program") ! (defun spambayes-retrain (is-spam) "Retrain on all processable articles, or the one under the cursor. ! This will replace the buffer contents with command output. You can then ! respool the article. ! ! is-spam is a boolean--true if you want to retrain the message as spam, ! false if you want to retrain as ham. ! " ! (labels ((do-exec (n group is-spam) ! (message "Retraining...") (with-temp-buffer ! (gnus-request-article-this-buffer n group) ! (shell-command-on-region ! (point-min) ! (point-max) ! (concat ! spambayes-hammiefilter ! (if is-spam " -s" " -g") ! " -f") ! (current-buffer) ! t) ! (gnus-request-replace-article n group (current-buffer))) ! (message "Retrained article."))) ! (let ((group gnus-newsgroup-name) (list gnus-newsgroup-processable)) (if (>= (length list) 1) (while list (let ((n (car list))) ! (do-exec n group is-spam)) (setq list (cdr list))) (let ((n (gnus-summary-article-number))) ! (do-exec n group is-spam)))))) (defun spambayes-refile-as-spam () "Retrain and refilter all process-marked messages as spam, then respool them" (interactive) ! (spambayes-retrain 't) (gnus-summary-respool-article nil (gnus-group-method gnus-newsgroup-name))) *************** *** 77,81 **** "Retrain and refilter all process-marked messages as ham, then respool them" (interactive) ! (spambayes-retrain "-g -f") (gnus-summary-respool-article nil (gnus-group-method gnus-newsgroup-name))) --- 98,102 ---- "Retrain and refilter all process-marked messages as ham, then respool them" (interactive) ! 
(spambayes-retrain nil) (gnus-summary-respool-article nil (gnus-group-method gnus-newsgroup-name))) From popiel at users.sourceforge.net Mon Jan 27 14:03:50 2003 From: popiel at users.sourceforge.net (T. Alexander Popiel) Date: Mon Jan 27 17:03:53 2003 Subject: [Spambayes-checkins] spambayes/contrib bulkgraph.py,1.1,1.2 Message-ID: Update of /cvsroot/spambayes/spambayes/contrib In directory sc8-pr-cvs1:/tmp/cvs-serv28696/spambayes/contrib Modified Files: bulkgraph.py Log Message: Some minor fixes to bulkgraph.py: Lop off the first column of the graph (which wasn't getting good data, anyway) Correct the text describing the arguments Add a scale to the bottom of the graph marked in months Index: bulkgraph.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/contrib/bulkgraph.py,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** bulkgraph.py 23 Jan 2003 04:46:27 -0000 1.1 --- bulkgraph.py 27 Jan 2003 22:03:46 -0000 1.2 *************** *** 20,29 **** but much slower to load. Recommended for use with pop3proxy and hammiesrv. ! -g PATH ! mbox or directory of known good messages (non-spam) to train on. ! Can be specified more than once. -s PATH ! mbox or directory of known spam messages to train on. ! Can be specified more than once. -f force training, ignoring the trained header. Use this if you --- 20,29 ---- but much slower to load. Recommended for use with pop3proxy and hammiesrv. ! -e PATH ! directory of all messages (both ham and spam). -s PATH ! directory of known spam messages to train on. These should be ! duplicates of messages in the everything folder. Can be ! specified more than once. -f force training, ignoring the trained header. Use this if you *************** *** 59,63 **** def row(value, spamday, hamday, unsureday): line = "%5d|" % value ! 
for j in range((expire) // grouping, -1, -1): spamv = 0 hamv = 0 --- 59,63 ---- def row(value, spamday, hamday, unsureday): line = "%5d|" % value ! for j in range(((expire) // grouping) - 1, -1, -1): spamv = 0 hamv = 0 *************** *** 94,97 **** --- 94,122 ---- return line + def legend(): + line = " " * 60 + now = time.mktime(time.strptime(time.strftime("%d %b %Y"), "%d %b %Y")) + date = time.mktime(time.strptime(time.strftime("1 %b %Y"), "%d %b %Y")) + age = int(59 - ((now - date) // day // grouping)) + if age >= 55: + line = line[:age] + time.strftime("| %b") + else: + line = line[:(age)] + "|" + line[(age+1):] + center = int((age + 59) // 2) + line = line[:center] + time.strftime("%b") + line[center+3:] + date = time.mktime(time.strptime(time.strftime("1 %b %Y", time.localtime(date - day * 2)), "%d %b %Y")) + newage = int(59 - ((now - date) // day // grouping)) + while newage >= 0: + line = line[:newage] + "|" + line[newage+1:] + center = int((age + newage) // 2) + line = line[:center] + time.strftime("%b", time.localtime(date)) + line[center+3:] + age = newage + date = time.mktime(time.strptime(time.strftime("1 %b %Y", time.localtime(date - day * 2)), "%d %b %Y")) + newage = int(59 - ((now - date) // day // grouping)) + if age >= 4: + center = int((age) // 2) + line = line[:center-2] + time.strftime("%b", time.localtime(date)) + line[center+1:] + return line + def main(): """Main program; parse options and go.""" *************** *** 230,233 **** --- 255,259 ---- print row(scale * j, spamday, hamday, unsureday) print " +" + ('-' * 60) + print " " + legend() print From mhammond at users.sourceforge.net Mon Jan 27 14:30:27 2003 From: mhammond at users.sourceforge.net (Mark Hammond) Date: Mon Jan 27 17:30:29 2003 Subject: [Spambayes-checkins] spambayes/Outlook2000 addin.py,1.46,1.47 Message-ID: Update of /cvsroot/spambayes/spambayes/Outlook2000 In directory sc8-pr-cvs1:/tmp/cvs-serv14893 Modified Files: addin.py Log Message: PRevent an exception if we *never* 
entered a folder with mail items. Index: addin.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/addin.py,v retrieving revision 1.46 retrieving revision 1.47 diff -C2 -d -r1.46 -r1.47 *** addin.py 27 Jan 2003 14:12:20 -0000 1.46 --- addin.py 27 Jan 2003 22:30:24 -0000 1.47 *************** *** 387,390 **** --- 387,391 ---- self.have_setup_ui = False self.explorer_list = explorer_list + self.buttons = [] def SetupUI(self): From anthonybaxter at users.sourceforge.net Mon Jan 27 23:19:32 2003 From: anthonybaxter at users.sourceforge.net (Anthony Baxter) Date: Tue Jan 28 02:19:36 2003 Subject: [Spambayes-checkins] spambayes pop3proxy.py,1.45,1.46 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv32092 Modified Files: pop3proxy.py Log Message: fixed C++-style comments. Index: pop3proxy.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/pop3proxy.py,v retrieving revision 1.45 retrieving revision 1.46 diff -C2 -d -r1.45 -r1.46 *** pop3proxy.py 27 Jan 2003 18:07:11 -0000 1.45 --- pop3proxy.py 28 Jan 2003 07:19:29 -0000 1.46 *************** *** 1516,1526 **** elif opt == '-b': state.launchUI = True ! elif opt == '-d': // dbm file state.useDB = True options.pop3proxy_persistent_storage_file = arg ! elif opt == '-D': // pickle file state.useDB = False options.pop3proxy_persistent_storage_file = arg ! elif opt == '-p': // dead option print >>sys.stderr, "-p option is no longer supported, use -D\n" print >>sys.stderr, __doc__ --- 1516,1526 ---- elif opt == '-b': state.launchUI = True ! elif opt == '-d': # dbm file state.useDB = True options.pop3proxy_persistent_storage_file = arg ! elif opt == '-D': # pickle file state.useDB = False options.pop3proxy_persistent_storage_file = arg ! 
elif opt == '-p': # dead option print >>sys.stderr, "-p option is no longer supported, use -D\n" print >>sys.stderr, __doc__ From anthonybaxter at users.sourceforge.net Mon Jan 27 23:27:48 2003 From: anthonybaxter at users.sourceforge.net (Anthony Baxter) Date: Tue Jan 28 02:27:51 2003 Subject: [Spambayes-checkins] spambayes pop3proxy.py,1.46,1.47 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv5514 Modified Files: pop3proxy.py Log Message: minor edits to make some (harmless) pychecker warnings go away. Index: pop3proxy.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/pop3proxy.py,v retrieving revision 1.46 retrieving revision 1.47 diff -C2 -d -r1.46 -r1.47 *** pop3proxy.py 28 Jan 2003 07:19:29 -0000 1.46 --- pop3proxy.py 28 Jan 2003 07:27:45 -0000 1.47 *************** *** 139,150 **** import StringIO ! import os, sys, re, operator, errno, getopt, string, time, bisect ! import socket, asyncore, asynchat, cgi, urlparse, webbrowser import mailbox, email.Header import spambayes from spambayes import storage, tokenizer, mboxutils, PyMeldLite, Dibbler from spambayes.FileCorpus import FileCorpus, ExpiryFileCorpus from spambayes.FileCorpus import FileMessageFactory, GzipFileMessageFactory - from email.Iterators import typed_subpart_iterator from spambayes.OptionConfig import OptionsConfigurator from spambayes.Options import options --- 139,150 ---- import StringIO ! import os, sys, re, operator, errno, getopt, time, bisect ! 
import socket, asyncore, asynchat, cgi import mailbox, email.Header + from email.Iterators import typed_subpart_iterator import spambayes from spambayes import storage, tokenizer, mboxutils, PyMeldLite, Dibbler from spambayes.FileCorpus import FileCorpus, ExpiryFileCorpus from spambayes.FileCorpus import FileMessageFactory, GzipFileMessageFactory from spambayes.OptionConfig import OptionsConfigurator from spambayes.Options import options *************** *** 163,166 **** --- 163,168 ---- can't connect to the real POP3 server and talk to it synchronously, because that would block the process.""" + + lineCallback = None def __init__(self, serverName, serverPort, lineCallback): From anthonybaxter at users.sourceforge.net Mon Jan 27 23:38:18 2003 From: anthonybaxter at users.sourceforge.net (Anthony Baxter) Date: Tue Jan 28 02:38:22 2003 Subject: [Spambayes-checkins] spambayes .cvsignore,1.3,1.4 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv14724 Modified Files: .cvsignore Log Message: ignore some distutils artifacts. Index: .cvsignore =================================================================== RCS file: /cvsroot/spambayes/spambayes/.cvsignore,v retrieving revision 1.3 retrieving revision 1.4 diff -C2 -d -r1.3 -r1.4 *** .cvsignore 20 Sep 2002 15:24:54 -0000 1.3 --- .cvsignore 28 Jan 2003 07:38:15 -0000 1.4 *************** *** 4,7 **** --- 4,9 ---- *.pik *.zip + dist build + MANIFEST Data From anthonybaxter at users.sourceforge.net Mon Jan 27 23:39:36 2003 From: anthonybaxter at users.sourceforge.net (Anthony Baxter) Date: Tue Jan 28 02:39:40 2003 Subject: [Spambayes-checkins] spambayes/spambayes Corpus.py,1.4,1.5 PyMeldLite.py,1.5,1.6 hammiebulk.py,1.3,1.4 storage.py,1.2,1.3 Message-ID: Update of /cvsroot/spambayes/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv15055/spambayes Modified Files: Corpus.py PyMeldLite.py hammiebulk.py storage.py Log Message: a few cleanups from pychecker. 
still bazillions left, but while I'm browsing code, may as well make some small cleanups. Index: Corpus.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/spambayes/Corpus.py,v retrieving revision 1.4 retrieving revision 1.5 diff -C2 -d -r1.4 -r1.5 *** Corpus.py 18 Jan 2003 16:30:57 -0000 1.4 --- Corpus.py 28 Jan 2003 07:39:31 -0000 1.5 *************** *** 252,256 **** class ExpiryCorpus: ! '''Corpus of "young" file system artifacts''' def __init__(self, expireBefore): --- 252,256 ---- class ExpiryCorpus: ! '''Mixin Class - Corpus of "young" file system artifacts''' def __init__(self, expireBefore): Index: PyMeldLite.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/spambayes/PyMeldLite.py,v retrieving revision 1.5 retrieving revision 1.6 diff -C2 -d -r1.5 -r1.6 *** PyMeldLite.py 23 Jan 2003 18:28:15 -0000 1.5 --- PyMeldLite.py 28 Jan 2003 07:39:34 -0000 1.6 *************** *** 708,713 **** if attribute is not _fail: return self._unquoteAttribute(attribute) ! else: ! raise AttributeError, "No element or attribute named %r" % name def __setattr__(self, name, value): --- 708,712 ---- if attribute is not _fail: return self._unquoteAttribute(attribute) ! 
raise AttributeError, "No element or attribute named %r" % name def __setattr__(self, name, value): Index: hammiebulk.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/spambayes/hammiebulk.py,v retrieving revision 1.3 retrieving revision 1.4 diff -C2 -d -r1.3 -r1.4 *** hammiebulk.py 14 Jan 2003 17:12:53 -0000 1.3 --- hammiebulk.py 28 Jan 2003 07:39:34 -0000 1.4 *************** *** 50,60 **** import sys import os - import types import getopt - import mailbox - import glob - import email - import errno - import cPickle as pickle from spambayes.Options import options --- 50,54 ---- Index: storage.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/spambayes/storage.py,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** storage.py 14 Jan 2003 05:38:20 -0000 1.2 --- storage.py 28 Jan 2003 07:39:34 -0000 1.3 *************** *** 276,278 **** if __name__ == '__main__': ! print >>sys.stderr, __doc__ --- 276,279 ---- if __name__ == '__main__': ! import sys ! 
print >> sys.stderr, __doc__ From anthonybaxter at users.sourceforge.net Tue Jan 28 19:23:36 2003 From: anthonybaxter at users.sourceforge.net (Anthony Baxter) Date: Tue Jan 28 22:23:41 2003 Subject: [Spambayes-checkins] spambayes/pspam/pspam options.py,1.4,1.5 Message-ID: Update of /cvsroot/spambayes/spambayes/pspam/pspam In directory sc8-pr-cvs1:/tmp/cvs-serv9465/pspam/pspam Modified Files: options.py Log Message: Whitespace n11n (if localization can be l10n, normalization can be n11n :) Index: options.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/pspam/pspam/options.py,v retrieving revision 1.4 retrieving revision 1.5 diff -C2 -d -r1.4 -r1.5 *** options.py 20 Jan 2003 15:05:45 -0000 1.4 --- options.py 29 Jan 2003 03:23:34 -0000 1.5 *************** *** 1,5 **** from spambayes.Options import options, all_options, \ boolean_cracker, float_cracker, int_cracker, string_cracker ! try: from sets import Set except ImportError: --- 1,5 ---- from spambayes.Options import options, all_options, \ boolean_cracker, float_cracker, int_cracker, string_cracker ! 
try: from sets import Set except ImportError: From anthonybaxter at users.sourceforge.net Tue Jan 28 19:23:36 2003 From: anthonybaxter at users.sourceforge.net (Anthony Baxter) Date: Tue Jan 28 22:23:47 2003 Subject: [Spambayes-checkins] spambayes/contrib SmarterHTTPServer.py,1.1,1.2 bulkgraph.py,1.2,1.3 Message-ID: Update of /cvsroot/spambayes/spambayes/contrib In directory sc8-pr-cvs1:/tmp/cvs-serv9465/contrib Modified Files: SmarterHTTPServer.py bulkgraph.py Log Message: Whitespace n11n (if localization can be l10n, normalization can be n11n :) Index: SmarterHTTPServer.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/contrib/SmarterHTTPServer.py,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** SmarterHTTPServer.py 24 Jan 2003 20:34:05 -0000 1.1 --- SmarterHTTPServer.py 29 Jan 2003 03:23:34 -0000 1.2 *************** *** 37,41 **** server_version = "SmarterHTTP/" + __version__ ! def send_head(self): --- 37,41 ---- server_version = "SmarterHTTP/" + __version__ ! def send_head(self): *************** *** 63,67 **** else: return self.list_directory(path) ! ctype = self.guess_type(path) --- 63,67 ---- else: return self.list_directory(path) ! ctype = self.guess_type(path) *************** *** 87,91 **** if parms: pdict = cgi.parse_qs(parms, False) ! # ctype application/method methlets (invented here) may # send whatever headers they like. However, the server has --- 87,91 ---- if parms: pdict = cgi.parse_qs(parms, False) ! # ctype application/method methlets (invented here) may # send whatever headers they like. However, the server has *************** *** 95,99 **** # anything incompatible with text/html type. Methlets should # not invoke end_headers(). ! if hasattr(self, methname): self.send_response(200) --- 95,99 ---- # anything incompatible with text/html type. Methlets should # not invoke end_headers(). ! 
if hasattr(self, methname): self.send_response(200) Index: bulkgraph.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/contrib/bulkgraph.py,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** bulkgraph.py 27 Jan 2003 22:03:46 -0000 1.2 --- bulkgraph.py 29 Jan 2003 03:23:34 -0000 1.3 *************** *** 123,127 **** global loud ! try: opts, args = getopt.getopt(sys.argv[1:], 'hfqd:D:s:e:') --- 123,127 ---- global loud ! try: opts, args = getopt.getopt(sys.argv[1:], 'hfqd:D:s:e:') *************** *** 196,210 **** age = 2 * expire try: ! received = (msg.get_all("Received"))[0] ! received = date_re.search(received).group(1) ! # if loud: print " %s" % received ! date = time.mktime(time.strptime(received, "%d %b %Y")) ! # if loud: print " %d" % date ! age = (now - date) // day ! # Can't just continue here... we're in a try ! if age < 0: ! age = 2 * expire except: ! pass # Skip anything that has no date or is too old or from the future # if loud: print "%s: %d" % (name, age) --- 196,210 ---- age = 2 * expire try: ! received = (msg.get_all("Received"))[0] ! received = date_re.search(received).group(1) ! # if loud: print " %s" % received ! date = time.mktime(time.strptime(received, "%d %b %Y")) ! # if loud: print " %d" % date ! age = (now - date) // day ! # Can't just continue here... we're in a try ! if age < 0: ! age = 2 * expire except: ! pass # Skip anything that has no date or is too old or from the future # if loud: print "%s: %d" % (name, age) *************** *** 243,247 **** sys.stdout.write("h") sys.stdout.flush() ! h.train(msg, isspam) --- 243,247 ---- sys.stdout.write("h") sys.stdout.flush() ! 
h.train(msg, isspam) From anthonybaxter at users.sourceforge.net Tue Jan 28 19:23:36 2003 From: anthonybaxter at users.sourceforge.net (Anthony Baxter) Date: Tue Jan 28 22:23:52 2003 Subject: [Spambayes-checkins] spambayes hammiefilter.py,1.13,1.14 mailsort.py,1.4,1.5 mboxtrain.py,1.3,1.4 pop3proxy.py,1.47,1.48 unheader.py,1.9,1.10 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv9465 Modified Files: hammiefilter.py mailsort.py mboxtrain.py pop3proxy.py unheader.py Log Message: Whitespace n11n (if localization can be l10n, normalization can be n11n :) Index: hammiefilter.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/hammiefilter.py,v retrieving revision 1.13 retrieving revision 1.14 diff -C2 -d -r1.13 -r1.14 *** hammiefilter.py 23 Jan 2003 20:15:31 -0000 1.13 --- hammiefilter.py 29 Jan 2003 03:23:34 -0000 1.14 *************** *** 8,12 **** ## :0 fw ## | hammiefilter.py ! ## ## Then, you can set up your MUA to pipe ham and spam to it, one at a ## time, by calling it with either the -g or -s options, respectively. --- 8,12 ---- ## :0 fw ## | hammiefilter.py ! ## ## Then, you can set up your MUA to pipe ham and spam to it, one at a ## time, by calling it with either the -g or -s options, respectively. *************** *** 177,179 **** if __name__ == "__main__": main() - --- 177,178 ---- Index: mailsort.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/mailsort.py,v retrieving revision 1.4 retrieving revision 1.5 diff -C2 -d -r1.4 -r1.5 *** mailsort.py 20 Jan 2003 03:39:52 -0000 1.4 --- mailsort.py 29 Jan 2003 03:23:34 -0000 1.5 *************** *** 165,170 **** else: raise RuntimeError # shouldn't get here ! ! if __name__ == "__main__": main() --- 165,170 ---- else: raise RuntimeError # shouldn't get here ! ! 
if __name__ == "__main__": main() Index: mboxtrain.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/mboxtrain.py,v retrieving revision 1.3 retrieving revision 1.4 diff -C2 -d -r1.3 -r1.4 *** mboxtrain.py 14 Jan 2003 05:38:19 -0000 1.3 --- mboxtrain.py 29 Jan 2003 03:23:34 -0000 1.4 *************** *** 52,56 **** # We'll be unable to represent this as text :( return False ! if is_spam: spamtxt = "spam" --- 52,56 ---- # We'll be unable to represent this as text :( return False ! if is_spam: spamtxt = "spam" *************** *** 90,94 **** counter += 1 cfn = os.path.join(path, "cur", fn) ! tfn = os.path.join(path, "tmp", "%d.%d_%d.%s" % (time.time(), pid, counter, host)) --- 90,94 ---- counter += 1 cfn = os.path.join(path, "cur", fn) ! tfn = os.path.join(path, "tmp", "%d.%d_%d.%s" % (time.time(), pid, counter, host)) *************** *** 127,131 **** fcntl.flock(f, fcntl.LOCK_EX) mbox = mailbox.PortableUnixMailbox(f, mboxutils.get_message) ! outf = os.tmpfile() counter = 0 --- 127,131 ---- fcntl.flock(f, fcntl.LOCK_EX) mbox = mailbox.PortableUnixMailbox(f, mboxutils.get_message) ! outf = os.tmpfile() counter = 0 *************** *** 173,180 **** counter = 0 trained = 0 ! for fn in glob.glob(os.path.join(path, "[0-9]*")): counter += 1 ! cfn = fn tfn = os.path.join(path, "spambayes.tmp") --- 173,180 ---- counter = 0 trained = 0 ! for fn in glob.glob(os.path.join(path, "[0-9]*")): counter += 1 ! cfn = fn tfn = os.path.join(path, "spambayes.tmp") *************** *** 190,194 **** f.write(msg.as_string()) f.close() ! # XXX: This will raise an exception on Windows. Do any Windows # people actually use MH directories? --- 190,194 ---- f.write(msg.as_string()) f.close() ! # XXX: This will raise an exception on Windows. Do any Windows # people actually use MH directories? *************** *** 208,212 **** else: raise ValueError("Unable to determine mailbox type: " + path) ! 
def usage(code, msg=''): --- 208,212 ---- else: raise ValueError("Unable to determine mailbox type: " + path) ! def usage(code, msg=''): *************** *** 222,226 **** global loud ! try: opts, args = getopt.getopt(sys.argv[1:], 'hfqd:D:g:s:') --- 222,226 ---- global loud ! try: opts, args = getopt.getopt(sys.argv[1:], 'hfqd:D:g:s:') Index: pop3proxy.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/pop3proxy.py,v retrieving revision 1.47 retrieving revision 1.48 diff -C2 -d -r1.47 -r1.48 *** pop3proxy.py 28 Jan 2003 07:27:45 -0000 1.47 --- pop3proxy.py 29 Jan 2003 03:23:34 -0000 1.48 *************** *** 1207,1224 **** # Option-parsing helper functions def _addressAndPort(s): ! """Decode a string representing a port to bind to, with optional address.""" ! s = s.strip() ! if ':' in s: ! addr, port = s.split(':') ! return addr, int(port) ! else: ! return '', int(s) def _addressPortStr((addr, port)): ! """Encode a string representing a port to bind to, with optional address.""" ! if not addr: ! return str(port) ! else: ! return '%s:%d' % (addr, port) --- 1207,1224 ---- # Option-parsing helper functions def _addressAndPort(s): ! """Decode a string representing a port to bind to, with optional address.""" ! s = s.strip() ! if ':' in s: ! addr, port = s.split(':') ! return addr, int(port) ! else: ! return '', int(s) def _addressPortStr((addr, port)): ! """Encode a string representing a port to bind to, with optional address.""" ! if not addr: ! return str(port) ! else: ! return '%s:%d' % (addr, port) Index: unheader.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/unheader.py,v retrieving revision 1.9 retrieving revision 1.10 diff -C2 -d -r1.9 -r1.10 *** unheader.py 14 Jan 2003 05:38:19 -0000 1.9 --- unheader.py 29 Jan 2003 03:23:34 -0000 1.10 *************** *** 1,9 **** #!/usr/bin/env python """ ! unheader.py: cleans headers from email messages. 
By default, this removes SpamAssassin headers, specify a pattern with -p to supply new headers to remove. ! This is often needed because existing spamassassin headers can provide killer spam clues, for all the wrong reasons. """ --- 1,9 ---- #!/usr/bin/env python """ ! unheader.py: cleans headers from email messages. By default, this removes SpamAssassin headers, specify a pattern with -p to supply new headers to remove. ! This is often needed because existing spamassassin headers can provide killer spam clues, for all the wrong reasons. """ From anthonybaxter at users.sourceforge.net Tue Jan 28 19:23:37 2003 From: anthonybaxter at users.sourceforge.net (Anthony Baxter) Date: Tue Jan 28 22:23:57 2003 Subject: [Spambayes-checkins] spambayes/utilities pop3graph.py,1.1,1.2 Message-ID: Update of /cvsroot/spambayes/spambayes/utilities In directory sc8-pr-cvs1:/tmp/cvs-serv9465/utilities Modified Files: pop3graph.py Log Message: Whitespace n11n (if localization can be l10n, normalization can be n11n :) Index: pop3graph.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/utilities/pop3graph.py,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** pop3graph.py 17 Jan 2003 06:45:37 -0000 1.1 --- pop3graph.py 29 Jan 2003 03:23:35 -0000 1.2 *************** *** 15,88 **** def usage(): ! print __doc__ def main(argv): ! opts, args = getopt.getopt(argv, "h", ["help"]) ! for opt, arg in opts: ! if opt in ("-h", "--help"): ! usage() ! return ! # Create the corpuses and the factory that reads the messages. ! if options.pop3proxy_cache_use_gzip: ! messageFactory = GzipFileMessageFactory() ! else: ! messageFactory = FileMessageFactory() ! spamCorpus = FileCorpus(messageFactory, options.pop3proxy_spam_cache) ! hamCorpus = FileCorpus(messageFactory, options.pop3proxy_ham_cache) ! # Read in all the trained messages. ! allTrained = {} ! for corpus, disposition in [(spamCorpus, 'Yes'), (hamCorpus, 'No')]: ! 
for m in corpus: ! message = mboxutils.get_message(m.getSubstance()) ! message._pop3CacheDisposition = disposition ! allTrained[m.key()] = message ! # Sort the messages into the order they arrived, then work out a scaling ! # factor for the graph - 'limit' is the widest it can be in characters. ! keys = allTrained.keys() ! keys.sort() ! limit = 70 ! if len(keys) < limit: ! scale = 1 ! else: ! scale = len(keys) // (limit//2) ! # Build the data - an array of cumulative success indexed by count. ! count = successful = 0 ! successByCount = [] ! for key in keys: ! message = allTrained[key] ! disposition = message[options.hammie_header_name] ! if (message._pop3CacheDisposition == disposition): ! successful += 1 ! count += 1 ! if count % scale == (scale-1): ! successByCount.append(successful // scale) ! # Build the graph, as a list of rows of characters. ! size = count // scale ! graph = [[" " for i in range(size+3)] for j in range(size)] ! for c in range(size): ! graph[c][1] = "|" ! graph[c][c+3] = "." ! graph[successByCount[c]][c+3] = "*" ! graph.reverse() ! # Print the graph. ! print "\n Success of the classifier over time:\n" ! print " . - Number of messages over time" ! print " * - Number of correctly classified messages over time\n\n" ! for row in range(size): ! line = ''.join(graph[row]) ! if row == 0: ! print line + " %d" % count ! elif row == (count - successful) // scale: ! print line + " %d" % successful ! else: ! print line ! print " " + "_" * (size+2) if __name__ == '__main__': ! main(sys.argv[1:]) --- 15,88 ---- def usage(): ! print __doc__ def main(argv): ! opts, args = getopt.getopt(argv, "h", ["help"]) ! for opt, arg in opts: ! if opt in ("-h", "--help"): ! usage() ! return ! # Create the corpuses and the factory that reads the messages. ! if options.pop3proxy_cache_use_gzip: ! messageFactory = GzipFileMessageFactory() ! else: ! messageFactory = FileMessageFactory() ! spamCorpus = FileCorpus(messageFactory, options.pop3proxy_spam_cache) ! 
hamCorpus = FileCorpus(messageFactory, options.pop3proxy_ham_cache) ! # Read in all the trained messages. ! allTrained = {} ! for corpus, disposition in [(spamCorpus, 'Yes'), (hamCorpus, 'No')]: ! for m in corpus: ! message = mboxutils.get_message(m.getSubstance()) ! message._pop3CacheDisposition = disposition ! allTrained[m.key()] = message ! # Sort the messages into the order they arrived, then work out a scaling ! # factor for the graph - 'limit' is the widest it can be in characters. ! keys = allTrained.keys() ! keys.sort() ! limit = 70 ! if len(keys) < limit: ! scale = 1 ! else: ! scale = len(keys) // (limit//2) ! # Build the data - an array of cumulative success indexed by count. ! count = successful = 0 ! successByCount = [] ! for key in keys: ! message = allTrained[key] ! disposition = message[options.hammie_header_name] ! if (message._pop3CacheDisposition == disposition): ! successful += 1 ! count += 1 ! if count % scale == (scale-1): ! successByCount.append(successful // scale) ! # Build the graph, as a list of rows of characters. ! size = count // scale ! graph = [[" " for i in range(size+3)] for j in range(size)] ! for c in range(size): ! graph[c][1] = "|" ! graph[c][c+3] = "." ! graph[successByCount[c]][c+3] = "*" ! graph.reverse() ! # Print the graph. ! print "\n Success of the classifier over time:\n" ! print " . - Number of messages over time" ! print " * - Number of correctly classified messages over time\n\n" ! for row in range(size): ! line = ''.join(graph[row]) ! if row == 0: ! print line + " %d" % count ! elif row == (count - successful) // scale: ! print line + " %d" % successful ! else: ! print line ! print " " + "_" * (size+2) if __name__ == '__main__': ! 
main(sys.argv[1:]) From anthonybaxter at users.sourceforge.net Tue Jan 28 19:23:37 2003 From: anthonybaxter at users.sourceforge.net (Anthony Baxter) Date: Tue Jan 28 22:24:02 2003 Subject: [Spambayes-checkins] spambayes/testtools mboxtest.py,1.2,1.3 simplexloop.py,1.2,1.3 weaktest.py,1.2,1.3 Message-ID: Update of /cvsroot/spambayes/spambayes/testtools In directory sc8-pr-cvs1:/tmp/cvs-serv9465/testtools Modified Files: mboxtest.py simplexloop.py weaktest.py Log Message: Whitespace n11n (if localization can be l10n, normalization can be n11n :) Index: mboxtest.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/testtools/mboxtest.py,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** mboxtest.py 17 Jan 2003 06:42:54 -0000 1.2 --- mboxtest.py 29 Jan 2003 03:23:35 -0000 1.3 *************** *** 25,29 **** import random import re ! try: from sets import Set except ImportError: --- 25,29 ---- import random import re ! try: from sets import Set except ImportError: Index: simplexloop.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/testtools/simplexloop.py,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** simplexloop.py 17 Jan 2003 06:42:54 -0000 1.2 --- simplexloop.py 29 Jan 2003 03:23:35 -0000 1.3 *************** *** 8,12 **** Show usage and exit. -c command ! The command to be run, with all its options. The last line of output from this program should match 'YYYYYYY cost: $xxxx.xx' --- 8,12 ---- Show usage and exit. -c command ! The command to be run, with all its options. 
The last line of output from this program should match 'YYYYYYY cost: $xxxx.xx' Index: weaktest.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/testtools/weaktest.py,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** weaktest.py 17 Jan 2003 06:42:54 -0000 1.2 --- weaktest.py 29 Jan 2003 03:23:35 -0000 1.3 *************** *** 21,25 **** Number of Set directories (Data/Spam/Set1, ... and Data/Ham/Set1, ...). This is required. ! -d decider Name of the decider. One of %(decisionkeys)s -m min --- 21,25 ---- Number of Set directories (Data/Spam/Set1, ... and Data/Ham/Set1, ...). This is required. ! -d decider Name of the decider. One of %(decisionkeys)s -m min *************** *** 63,80 **** def spamtrain(self,scr): if scr < options.spam_cutoff: ! return TRAIN_AS_SPAM def hamtrain(self,scr): if scr > options.ham_cutoff: ! return TRAIN_AS_HAM class UnsureOnly(TrainDecision): def spamtrain(self,scr): if options.ham_cutoff < scr < options.spam_cutoff: ! return TRAIN_AS_SPAM def hamtrain(self,scr): if options.ham_cutoff < scr < options.spam_cutoff: ! return TRAIN_AS_HAM class All(TrainDecision): --- 63,80 ---- def spamtrain(self,scr): if scr < options.spam_cutoff: ! return TRAIN_AS_SPAM def hamtrain(self,scr): if scr > options.ham_cutoff: ! return TRAIN_AS_HAM class UnsureOnly(TrainDecision): def spamtrain(self,scr): if options.ham_cutoff < scr < options.spam_cutoff: ! return TRAIN_AS_SPAM def hamtrain(self,scr): if options.ham_cutoff < scr < options.spam_cutoff: ! return TRAIN_AS_HAM class All(TrainDecision): *************** *** 88,92 **** def spamtrain(self,scr): if scr < 0.995: ! return TRAIN_AS_SPAM def hamtrain(self,scr): --- 88,92 ---- def spamtrain(self,scr): if scr < 0.995: ! return TRAIN_AS_SPAM def hamtrain(self,scr): *************** *** 97,103 **** def hamtrain(self,scr): if scr < options.ham_cutoff: ! return TRAIN_AS_HAM elif scr > options.spam_cutoff: ! 
return TRAIN_AS_SPAM spamtrain = hamtrain --- 97,103 ---- def hamtrain(self,scr): if scr < options.ham_cutoff: ! return TRAIN_AS_HAM elif scr > options.spam_cutoff: ! return TRAIN_AS_SPAM spamtrain = hamtrain *************** *** 127,136 **** if self.tooearly(): if is_spam: ! return TRAIN_AS_SPAM else: ! return TRAIN_AS_HAM else: return self.client(scr,is_spam) ! def tooearly(self): return self.x < self.n --- 127,136 ---- if self.tooearly(): if is_spam: ! return TRAIN_AS_SPAM else: ! return TRAIN_AS_HAM else: return self.client(scr,is_spam) ! def tooearly(self): return self.x < self.n *************** *** 182,187 **** print "Ham with score %.2f"%scr cc.ham(scr) ! de = decision(scr,is_spam) ! if de == TRAIN_AS_SPAM: d.train_spam(m) spamtrain += 1 --- 182,187 ---- print "Ham with score %.2f"%scr cc.ham(scr) ! de = decision(scr,is_spam) ! if de == TRAIN_AS_SPAM: d.train_spam(m) spamtrain += 1 From anthonybaxter at users.sourceforge.net Tue Jan 28 19:23:37 2003 From: anthonybaxter at users.sourceforge.net (Anthony Baxter) Date: Tue Jan 28 22:24:07 2003 Subject: [Spambayes-checkins] spambayes/spambayes/resources classify_gif.py,1.1,1.2 config_gif.py,1.1,1.2 helmet_gif.py,1.1,1.2 message_gif.py,1.1,1.2 query_gif.py,1.1,1.2 scanning__init__.py,1.1,1.2 status_gif.py,1.1,1.2 train_gif.py,1.1,1.2 ui_psp.py,1.1,1.2 Message-ID: Update of /cvsroot/spambayes/spambayes/spambayes/resources In directory sc8-pr-cvs1:/tmp/cvs-serv9465/spambayes/resources Modified Files: classify_gif.py config_gif.py helmet_gif.py message_gif.py query_gif.py scanning__init__.py status_gif.py train_gif.py ui_psp.py Log Message: Whitespace n11n (if localization can be l10n, normalization can be n11n :) Index: classify_gif.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/spambayes/resources/classify_gif.py,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** classify_gif.py 17 Jan 2003 20:21:15 -0000 1.1 --- 
classify_gif.py 29 Jan 2003 03:23:35 -0000 1.2 *************** *** 58,61 **** \x00\x12\x94 \x01a\x01@@\xc2\x05\x02\xfc\x14\xd4D\'\xa5\xb4\x1cK.\xd9V\x91F\ \x1cy\x04\x92H$a\xa4\xe2\x8a,\xb6\xe8\xe2\x8b0\xc6(\xe3\x8c4\xd6h\xe3\x8d8\ ! \xe6\xa8\xe3\x8e+\x06\x04\x00;' ### end --- 58,61 ---- \x00\x12\x94 \x01a\x01@@\xc2\x05\x02\xfc\x14\xd4D\'\xa5\xb4\x1cK.\xd9V\x91F\ \x1cy\x04\x92H$a\xa4\xe2\x8a,\xb6\xe8\xe2\x8b0\xc6(\xe3\x8c4\xd6h\xe3\x8d8\ ! \xe6\xa8\xe3\x8e+\x06\x04\x00;' ### end Index: config_gif.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/spambayes/resources/config_gif.py,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** config_gif.py 17 Jan 2003 20:21:16 -0000 1.1 --- config_gif.py 29 Jan 2003 03:23:35 -0000 1.2 *************** *** 49,52 **** \xcf\xa5=\'\x02\x85\t\xf5&E\xa3\x1c\xaa\x12mj\xb4\xebV\xae]u~\xe5\x18V\xec\ \xc9\xaaLG\xa2\x1d{\x11*\xdb\x8d-\xdf~,)Wm\xd4\x95$\xeb\xe2\xdd\xcb\xb7\xaf\ ! \xdf\xbf\x80\x03\x0b\x1eL\xb8\xb0\xe1\x8b\x01\x01\x00;' ### end --- 49,52 ---- \xcf\xa5=\'\x02\x85\t\xf5&E\xa3\x1c\xaa\x12mj\xb4\xebV\xae]u~\xe5\x18V\xec\ \xc9\xaaLG\xa2\x1d{\x11*\xdb\x8d-\xdf~,)Wm\xd4\x95$\xeb\xe2\xdd\xcb\xb7\xaf\ ! \xdf\xbf\x80\x03\x0b\x1eL\xb8\xb0\xe1\x8b\x01\x01\x00;' ### end Index: helmet_gif.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/spambayes/resources/helmet_gif.py,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** helmet_gif.py 17 Jan 2003 20:21:18 -0000 1.1 --- helmet_gif.py 29 Jan 2003 03:23:35 -0000 1.2 *************** *** 62,65 **** \xe3\x08&\x84\x10e\x02\x15\xcc\xa0\x9dD1hI\xddn\x10@\xc0\xa4\x08\x15P\x00\ \x81\x03\x10\x1c\xf6S\x07\x1a\xe4\xa9A\x07|r\xc0\x01\x9f\x80v\x00Q@\x00\x00;\ ! 
' ### end --- 62,65 ---- \xe3\x08&\x84\x10e\x02\x15\xcc\xa0\x9dD1hI\xddn\x10@\xc0\xa4\x08\x15P\x00\ \x81\x03\x10\x1c\xf6S\x07\x1a\xe4\xa9A\x07|r\xc0\x01\x9f\x80v\x00Q@\x00\x00;\ ! ' ### end Index: message_gif.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/spambayes/resources/message_gif.py,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** message_gif.py 17 Jan 2003 20:21:21 -0000 1.1 --- message_gif.py 29 Jan 2003 03:23:35 -0000 1.2 *************** *** 49,52 **** \xa3\xc7\x83H\x97V\x8c\xeas \xd5\x91N5B\x95\x1a\xb4jW\x9e^\x8b\xe6\xdc\x9au\ \xeb\xd8\xaaW\xd1r\x15\xeb5\xed\xd0\xb5IO\xc2]\xab\xb4f\xd6\xbbO\xc3\xea\xdd\ ! \xcb\xb7\xaf\xdf\xbf\x80\x03\x0b\x1e\xbc0 \x00;' ### end --- 49,52 ---- \xa3\xc7\x83H\x97V\x8c\xeas \xd5\x91N5B\x95\x1a\xb4jW\x9e^\x8b\xe6\xdc\x9au\ \xeb\xd8\xaaW\xd1r\x15\xeb5\xed\xd0\xb5IO\xc2]\xab\xb4f\xd6\xbbO\xc3\xea\xdd\ ! \xcb\xb7\xaf\xdf\xbf\x80\x03\x0b\x1e\xbc0 \x00;' ### end Index: query_gif.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/spambayes/resources/query_gif.py,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** query_gif.py 17 Jan 2003 20:21:22 -0000 1.1 --- query_gif.py 29 Jan 2003 03:23:35 -0000 1.2 *************** *** 57,60 **** `\x8d\xf1\xc5\xf5\'\x90\x0f\xe3\xc1%\xa0@\x07\x8c\xd7\xd6\x81\x11\x8c\xa7\ \xd9\x81\x1a\x8c\x87\xc2\x81@\xb4F\x98\x07\x14\xb20\xdej\x07\xfeG\xd8\x02\ ! \x14\xeaE\xd8\x00\x14\x02aB\x01\x03tU\xe2\x8a,\xb6\xe8\xe2Q\x01\x01\x00;' ### end --- 57,60 ---- `\x8d\xf1\xc5\xf5\'\x90\x0f\xe3\xc1%\xa0@\x07\x8c\xd7\xd6\x81\x11\x8c\xa7\ \xd9\x81\x1a\x8c\x87\xc2\x81@\xb4F\x98\x07\x14\xb20\xdej\x07\xfeG\xd8\x02\ ! 
\x14\xeaE\xd8\x00\x14\x02aB\x01\x03tU\xe2\x8a,\xb6\xe8\xe2Q\x01\x01\x00;' ### end Index: scanning__init__.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/spambayes/resources/scanning__init__.py,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** scanning__init__.py 17 Jan 2003 20:21:22 -0000 1.1 --- scanning__init__.py 29 Jan 2003 03:23:35 -0000 1.2 *************** *** 12,44 **** """ try: ! __file__ except NameError: ! pass else: ! import os ! if os.path.splitext(os.path.basename( __file__ ))[0] == "__init__": ! try: ! from resourcepackage import package, defaultgenerators ! generators = defaultgenerators.generators.copy() ! ! ### CUSTOMISATION POINT ! ## import specialised generators here, such as for wxPython ! #from resourcepackage import wxgenerators ! #generators.update( wxgenerators.generators ) ! except ImportError: ! pass ! else: ! package = package.Package( ! packageName = __name__, ! directory = os.path.dirname( os.path.abspath(__file__) ), ! generators = generators, ! ) ! package.scan( ! ### CUSTOMISATION POINT ! ## force true -> always re-loads from external files, otherwise ! ## only reloads if the file is newer than the generated .py file. ! # force = 1, ! ) ! # ResourcePackage license added by Richie Hindle , --- 12,44 ---- """ try: ! __file__ except NameError: ! pass else: ! import os ! if os.path.splitext(os.path.basename( __file__ ))[0] == "__init__": ! try: ! from resourcepackage import package, defaultgenerators ! generators = defaultgenerators.generators.copy() ! ! ### CUSTOMISATION POINT ! ## import specialised generators here, such as for wxPython ! #from resourcepackage import wxgenerators ! #generators.update( wxgenerators.generators ) ! except ImportError: ! pass ! else: ! package = package.Package( ! packageName = __name__, ! directory = os.path.dirname( os.path.abspath(__file__) ), ! generators = generators, ! ) ! package.scan( ! 
### CUSTOMISATION POINT ! ## force true -> always re-loads from external files, otherwise ! ## only reloads if the file is newer than the generated .py file. ! # force = 1, ! ) ! # ResourcePackage license added by Richie Hindle , *************** *** 52,87 **** ResourcePackage License ! Copyright (c) 2003, Michael C. Fletcher, All rights reserved. ! ! Redistribution and use in source and binary forms, with or without ! modification, are permitted provided that the following conditions ! are met: ! ! Redistributions of source code must retain the above copyright ! notice, this list of conditions and the following disclaimer. ! ! Redistributions in binary form must reproduce the above ! copyright notice, this list of conditions and the following ! disclaimer in the documentation and/or other materials ! provided with the distribution. ! ! The name of Michael C. Fletcher, or the name of any Contributor, ! may not be used to endorse or promote products derived from this ! software without specific prior written permission. ! ! THIS SOFTWARE IS NOT FAULT TOLERANT AND SHOULD NOT BE USED IN ANY ! SITUATION ENDANGERING HUMAN LIFE OR PROPERTY. ! ! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ! ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ! LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS ! FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ! COPYRIGHT HOLDERS AND CONTRIBUTORS BE LIABLE FOR ANY DIRECT, ! INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ! (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ! SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) ! HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, ! STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ! ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED ! OF THE POSSIBILITY OF SUCH DAMAGE. """ --- 52,87 ---- ResourcePackage License ! 
Copyright (c) 2003, Michael C. Fletcher, All rights reserved. ! ! Redistribution and use in source and binary forms, with or without ! modification, are permitted provided that the following conditions ! are met: ! ! Redistributions of source code must retain the above copyright ! notice, this list of conditions and the following disclaimer. ! ! Redistributions in binary form must reproduce the above ! copyright notice, this list of conditions and the following ! disclaimer in the documentation and/or other materials ! provided with the distribution. ! ! The name of Michael C. Fletcher, or the name of any Contributor, ! may not be used to endorse or promote products derived from this ! software without specific prior written permission. ! ! THIS SOFTWARE IS NOT FAULT TOLERANT AND SHOULD NOT BE USED IN ANY ! SITUATION ENDANGERING HUMAN LIFE OR PROPERTY. ! ! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ! ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ! LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS ! FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ! COPYRIGHT HOLDERS AND CONTRIBUTORS BE LIABLE FOR ANY DIRECT, ! INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ! (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ! SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) ! HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, ! STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ! ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED ! OF THE POSSIBILITY OF SUCH DAMAGE. 
""" Index: status_gif.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/spambayes/resources/status_gif.py,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** status_gif.py 17 Jan 2003 20:21:23 -0000 1.1 --- status_gif.py 29 Jan 2003 03:23:35 -0000 1.2 *************** *** 58,61 **** \x94d\xba\x19\x85\x94R \xf5\xa0\x1fd1\xcdT\xd3M9\x91\x84\x01\x81\x12\x89t\ \xd2\x8a)\x9dF\x91E\x18i\xc4\x91G.\xd6h\xe3\x8d8\xe6\xa8\xe3\x8e<\xf6\x98P@\ ! \x00\x00;' ### end --- 58,61 ---- \x94d\xba\x19\x85\x94R \xf5\xa0\x1fd1\xcdT\xd3M9\x91\x84\x01\x81\x12\x89t\ \xd2\x8a)\x9dF\x91E\x18i\xc4\x91G.\xd6h\xe3\x8d8\xe6\xa8\xe3\x8e<\xf6\x98P@\ ! \x00\x00;' ### end Index: train_gif.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/spambayes/resources/train_gif.py,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** train_gif.py 17 Jan 2003 20:21:25 -0000 1.1 --- train_gif.py 29 Jan 2003 03:23:35 -0000 1.2 *************** *** 64,67 **** \x08\x14U\xe1K?\x85\x06R\x0c\x0c\xe4t@M#,\x00$\x04A\xb5T\x90\x003\x00A\xc3\ \x07 \x12x0\x12\x08U\xb6T\x82\x01\x00@\x00f\x00\x14\x84\xa8\xea\xaf\xc0\x06+\ ! \xec\xb0\xc4\x16k\xacB\x01\x01\x00;' ### end --- 64,67 ---- \x08\x14U\xe1K?\x85\x06R\x0c\x0c\xe4t@M#,\x00$\x04A\xb5T\x90\x003\x00A\xc3\ \x07 \x12x0\x12\x08U\xb6T\x82\x01\x00@\x00f\x00\x14\x84\xa8\xea\xaf\xc0\x06+\ ! 
\xec\xb0\xc4\x16k\xacB\x01\x01\x00;' ### end Index: ui_psp.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/spambayes/resources/ui_psp.py,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** ui_psp.py 17 Jan 2003 20:21:26 -0000 1.1 --- ui_psp.py 29 Jan 2003 03:23:35 -0000 1.2 *************** *** 361,364 **** \x00\x00\x00\x06\x00\x00\x00\x01\x00\x08\x00\x00\x00C\x00\x00\x00\x06\x00\ \x00\x00\x01\x00\x08\x00\x00\x00D\x00\x00\x00\x08\x00\x00\x00\x00\x00\x00\ ! \x00' ### end --- 361,364 ---- \x00\x00\x00\x06\x00\x00\x00\x01\x00\x08\x00\x00\x00C\x00\x00\x00\x06\x00\ \x00\x00\x01\x00\x08\x00\x00\x00D\x00\x00\x00\x08\x00\x00\x00\x00\x00\x00\ ! \x00' ### end From anthonybaxter at users.sourceforge.net Tue Jan 28 19:23:37 2003 From: anthonybaxter at users.sourceforge.net (Anthony Baxter) Date: Tue Jan 28 22:24:12 2003 Subject: [Spambayes-checkins] spambayes/spambayes CostCounter.py,1.2,1.3 Dibbler.py,1.2,1.3 FileCorpus.py,1.2,1.3 TestDriver.py,1.2,1.3 classifier.py,1.2,1.3 hammie.py,1.4,1.5 hammiebulk.py,1.4,1.5 tokenizer.py,1.3,1.4 Message-ID: Update of /cvsroot/spambayes/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv9465/spambayes Modified Files: CostCounter.py Dibbler.py FileCorpus.py TestDriver.py classifier.py hammie.py hammiebulk.py tokenizer.py Log Message: Whitespace n11n (if localization can be l10n, normalization can be n11n :) Index: CostCounter.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/spambayes/CostCounter.py,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** CostCounter.py 14 Jan 2003 05:38:20 -0000 1.2 --- CostCounter.py 29 Jan 2003 03:23:34 -0000 1.3 *************** *** 22,26 **** def spam(self, scr): for c in self.clients: ! c.spam(scr) def ham(self, scr): --- 22,26 ---- def spam(self, scr): for c in self.clients: ! 
c.spam(scr) def ham(self, scr): *************** *** 93,117 **** def __str__(self): ! return ("Total messages: %d; %d (%.1f%%) ham + %d (%.1f%%) spam\n"%( ! self._total, ! self._ham, zd(100.*self._ham,self._total), ! self._spam, zd(100.*self._spam,self._total))+ ! "Ham: %d (%.2f%%) ok, %d (%.2f%%) unsure, %d (%.2f%%) fp\n"%( ! self._correctham, zd(100.*self._correctham,self._ham), ! self._unsureham, zd(100.*self._unsureham,self._ham), ! self._fp, zd(100.*self._fp,self._ham))+ ! "Spam: %d (%.2f%%) ok, %d (%.2f%%) unsure, %d (%.2f%%) fn\n"%( ! self._correctspam, zd(100.*self._correctspam,self._spam), ! self._unsurespam, zd(100.*self._unsurespam,self._spam), ! self._fn, zd(100.*self._fn,self._spam))+ ! "Score False: %.2f%% Unsure %.2f%%"%( ! zd(100.*(self._fp+self._fn),self._total), ! zd(100.*self._unsure,self._total))) def zd(x,y): if y > 0: ! return x / y else: ! return 0 class StdCostCounter(CostCounter): --- 93,117 ---- def __str__(self): ! return ("Total messages: %d; %d (%.1f%%) ham + %d (%.1f%%) spam\n"%( ! self._total, ! self._ham, zd(100.*self._ham,self._total), ! self._spam, zd(100.*self._spam,self._total))+ ! "Ham: %d (%.2f%%) ok, %d (%.2f%%) unsure, %d (%.2f%%) fp\n"%( ! self._correctham, zd(100.*self._correctham,self._ham), ! self._unsureham, zd(100.*self._unsureham,self._ham), ! self._fp, zd(100.*self._fp,self._ham))+ ! "Spam: %d (%.2f%%) ok, %d (%.2f%%) unsure, %d (%.2f%%) fn\n"%( ! self._correctspam, zd(100.*self._correctspam,self._spam), ! self._unsurespam, zd(100.*self._unsurespam,self._spam), ! self._fn, zd(100.*self._fn,self._spam))+ ! "Score False: %.2f%% Unsure %.2f%%"%( ! zd(100.*(self._fp+self._fn),self._total), ! zd(100.*self._unsure,self._total))) def zd(x,y): if y > 0: ! return x / y else: ! return 0 class StdCostCounter(CostCounter): *************** *** 133,137 **** def _lambda(self, scr): if scr < options.ham_cutoff: ! return 0 elif scr > options.spam_cutoff: return 1 --- 133,137 ---- def _lambda(self, scr): if scr < options.ham_cutoff: ! 
return 0 elif scr > options.spam_cutoff: return 1 *************** *** 155,178 **** def default(): ! return CompositeCostCounter([ ! CountCostCounter(), ! StdCostCounter(), ! FlexCostCounter(), ! Flex2CostCounter(), ! DelayedCostCounter([ ! CountCostCounter(), ! StdCostCounter(), ! FlexCostCounter(), ! Flex2CostCounter(), ! ]) ! ]) def nodelay(): ! return CompositeCostCounter([ ! CountCostCounter(), ! StdCostCounter(), ! FlexCostCounter(), ! Flex2CostCounter(), ! ]) if __name__=="__main__": --- 155,178 ---- def default(): ! return CompositeCostCounter([ ! CountCostCounter(), ! StdCostCounter(), ! FlexCostCounter(), ! Flex2CostCounter(), ! DelayedCostCounter([ ! CountCostCounter(), ! StdCostCounter(), ! FlexCostCounter(), ! Flex2CostCounter(), ! ]) ! ]) def nodelay(): ! return CompositeCostCounter([ ! CountCostCounter(), ! StdCostCounter(), ! FlexCostCounter(), ! Flex2CostCounter(), ! ]) if __name__=="__main__": Index: Dibbler.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/spambayes/Dibbler.py,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** Dibbler.py 21 Jan 2003 18:23:43 -0000 1.2 --- Dibbler.py 29 Jan 2003 03:23:34 -0000 1.3 *************** *** 264,268 **** self.set_reuse_addr() if type(port) != type(()): ! port = ('', port) self.bind(port) self.listen(5) --- 264,268 ---- self.set_reuse_addr() if type(port) != type(()): ! port = ('', port) self.bind(port) self.listen(5) Index: FileCorpus.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/spambayes/FileCorpus.py,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** FileCorpus.py 14 Jan 2003 05:38:20 -0000 1.2 --- FileCorpus.py 29 Jan 2003 03:23:34 -0000 1.3 *************** *** 119,123 **** for filename in os.listdir(directory): if fnmatch.fnmatch(filename, filter): ! 
self.msgs[filename] = None def makeMessage(self, key): --- 119,123 ---- for filename in os.listdir(directory): if fnmatch.fnmatch(filename, filter): ! self.msgs[filename] = None def makeMessage(self, key): *************** *** 216,223 **** except IOError, e: if e.errno != errno.ENOENT: ! raise else: ! self.setSubstance(fp.read()) ! fp.close() def store(self): --- 216,223 ---- except IOError, e: if e.errno != errno.ENOENT: ! raise else: ! self.setSubstance(fp.read()) ! fp.close() def store(self): *************** *** 508,512 **** except OSError, e: if e.errno != 3: # errno. ! raise else: for filename in os.listdir(dirname): --- 508,512 ---- except OSError, e: if e.errno != 3: # errno. ! raise else: for filename in os.listdir(dirname): *************** *** 553,562 **** for x in range(11): ! time.sleep(1) # make sure MSG00003 has expired ! if 10-x == 1: ! s = '' ! else: ! s = 's' ! print 'wait',10-x,'more second%s' % (s) m4 = fmClass('MSG00004', 'fctestunsurecorpus') --- 553,562 ---- for x in range(11): ! time.sleep(1) # make sure MSG00003 has expired ! if 10-x == 1: ! s = '' ! else: ! s = 's' ! print 'wait',10-x,'more second%s' % (s) m4 = fmClass('MSG00004', 'fctestunsurecorpus') *************** *** 727,730 **** else: print >>sys.stderr, __doc__ - - --- 727,728 ---- Index: TestDriver.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/spambayes/TestDriver.py,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** TestDriver.py 14 Jan 2003 05:38:20 -0000 1.2 --- TestDriver.py 29 Jan 2003 03:23:34 -0000 1.3 *************** *** 22,26 **** # alldone() ! try: from sets import Set except ImportError: --- 22,26 ---- # alldone() ! try: from sets import Set except ImportError: *************** *** 29,33 **** import cPickle as pickle ! try: from heapq import heapreplace except ImportError: --- 29,33 ---- import cPickle as pickle ! 
try: from heapq import heapreplace except ImportError: *************** *** 200,205 **** def alldone(self): if options.show_histograms: ! besthamcut,bestspamcut = printhist("all runs:", ! self.global_ham_hist, self.global_spam_hist) else: --- 200,205 ---- def alldone(self): if options.show_histograms: ! besthamcut,bestspamcut = printhist("all runs:", ! self.global_ham_hist, self.global_spam_hist) else: Index: classifier.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/spambayes/classifier.py,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** classifier.py 14 Jan 2003 05:38:20 -0000 1.2 --- classifier.py 29 Jan 2003 03:23:34 -0000 1.3 *************** *** 289,293 **** spamcount = record.spamcount hamcount = record.hamcount ! # Try the cache first try: --- 289,293 ---- spamcount = record.spamcount hamcount = record.hamcount ! # Try the cache first try: *************** *** 458,462 **** def _wordinfodel(self, word): del self.wordinfo[word] ! --- 458,462 ---- def _wordinfodel(self, word): del self.wordinfo[word] ! Index: hammie.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/spambayes/hammie.py,v retrieving revision 1.4 retrieving revision 1.5 diff -C2 -d -r1.4 -r1.5 *** hammie.py 22 Jan 2003 05:23:17 -0000 1.4 --- hammie.py 29 Jan 2003 03:23:34 -0000 1.5 *************** *** 18,22 **** This implements the basic functionality needed to score, filter, or ! train. """ --- 18,22 ---- This implements the basic functionality needed to score, filter, or ! train. 
""" Index: hammiebulk.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/spambayes/hammiebulk.py,v retrieving revision 1.4 retrieving revision 1.5 diff -C2 -d -r1.4 -r1.5 *** hammiebulk.py 28 Jan 2003 07:39:34 -0000 1.4 --- hammiebulk.py 29 Jan 2003 03:23:34 -0000 1.5 *************** *** 200,204 **** untrain(h, s, True) save = True ! if save: h.store() --- 200,204 ---- untrain(h, s, True) save = True ! if save: h.store() Index: tokenizer.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/spambayes/tokenizer.py,v retrieving revision 1.3 retrieving revision 1.4 diff -C2 -d -r1.3 -r1.4 *** tokenizer.py 17 Jan 2003 21:45:18 -0000 1.3 --- tokenizer.py 29 Jan 2003 03:23:35 -0000 1.4 *************** *** 13,17 **** import time import os ! try: from sets import Set except ImportError: --- 13,17 ---- import time import os ! try: from sets import Set except ImportError: *************** *** 1198,1202 **** addr = "".join(addr) all_addrs.append(addr.lower()) ! if len(all_addrs) > 1: # don't be fooled by "os.path." - commonprefix --- 1198,1202 ---- addr = "".join(addr) all_addrs.append(addr.lower()) ! if len(all_addrs) > 1: # don't be fooled by "os.path." - commonprefix From gward at users.sourceforge.net Wed Jan 29 17:41:55 2003 From: gward at users.sourceforge.net (Greg Ward) Date: Wed Jan 29 20:42:01 2003 Subject: [Spambayes-checkins] spambayes INTEGRATION.txt,1.5,1.6 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv20074 Modified Files: INTEGRATION.txt Log Message: Typo fix. Wording tweak. 
Index: INTEGRATION.txt =================================================================== RCS file: /cvsroot/spambayes/spambayes/INTEGRATION.txt,v retrieving revision 1.5 retrieving revision 1.6 diff -C2 -d -r1.5 -r1.6 *** INTEGRATION.txt 17 Jan 2003 18:25:39 -0000 1.5 --- INTEGRATION.txt 30 Jan 2003 01:41:53 -0000 1.6 *************** *** 7,11 **** ------- ! Spambayes in a tool used to segregate unwanted (spam) mail from the mail you want (ham). Before Spambayes can be your spam filter of choice you need to train it on representative samples of email you receive. After it's been --- 7,11 ---- ------- ! Spambayes is a tool used to segregate unwanted mail (spam) from the mail you want (ham). Before Spambayes can be your spam filter of choice you need to train it on representative samples of email you receive. After it's been From montanaro at users.sourceforge.net Fri Jan 31 04:17:32 2003 From: montanaro at users.sourceforge.net (Skip Montanaro) Date: Fri Jan 31 07:17:35 2003 Subject: [Spambayes-checkins] website related.ht,1.7,1.8 Message-ID: Update of /cvsroot/spambayes/website In directory sc8-pr-cvs1:/tmp/cvs-serv17273 Modified Files: related.ht Log Message: added ref to Gary Robinson's Spam Wiki (I can't install, however. It asks me for my password on shell1.sf.net but never accepts my SF password.) Index: related.ht =================================================================== RCS file: /cvsroot/spambayes/website/related.ht,v retrieving revision 1.7 retrieving revision 1.8 diff -C2 -d -r1.7 -r1.8 *** related.ht 13 Jan 2003 08:08:58 -0000 1.7 --- related.ht 31 Jan 2003 12:17:29 -0000 1.8 *************** *** 7,10 **** --- 7,11 ----

      +
    • Gary Robinson has a well-organized Spam Wiki.
    • Gary Arnold's bayespam, a perl qmail filter.
    • As of version 1.3, Mozilla Mail now supports Graham-style Bayesian filtering, see the documentation on the mozilla website. From timstone4 at users.sourceforge.net Fri Jan 31 08:54:25 2003 From: timstone4 at users.sourceforge.net (Tim Stone) Date: Fri Jan 31 11:54:30 2003 Subject: [Spambayes-checkins] spambayes/spambayes dbExpImp.py,NONE,1.1 Message-ID: Update of /cvsroot/spambayes/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv26640 Added Files: dbExpImp.py Log Message: New script to export and import spambayes databases. My testing indicates that this works just fine. Please backup databases until we're all convinced that this script is docile. --- NEW FILE: dbExpImp.py --- #! /usr/bin/env python """dbExpImp.py - Bayes database export/import Classes: Abstract: This utility has the primary function of exporting and importing a spambayes database into/from a flat file. This is useful in a number of scenarios. Platform portability of database - flat files can be exported and imported across platforms (winduhs and linux, for example) Database implementation changes - databases can survive database implementation upgrades or new database implementations. For example, if a dbm implementation changes between python x.y and python x.y+1... Database reorganization - an export followed by an import reorgs an existing database, improving performance, at least in some database implementations Database sharing - it is possible to distribute particular databases for research purposes, database sharing purposes, or for new users to have a 'seed' database to start with. Database merging - multiple databases can be merged into one quite easily by simply specifying -m on an import. This will add the two database nham and nspams together (assuming the two databases do not share corpora) and for wordinfo conflicts, will add spamcount and hamcount together.
Spambayes software release migration - an export can be executed before a release upgrade, as part of the installation script. Then, after the new software is installed, an import can be executed, which will effectively preserve existing training. This eliminates the need for retraining every time a release is installed. Others? I'm sure I haven't thought of everything... Usage: dbExpImp [options] options: -e : export -i : import -f: FN : flat file to export to or import from -d: FN : name of pickled database file to use -D: FN : name of dbm database file to use -m : merge import into an existing database file. This is meaningful only for import. If omitted, a new database file will be created. If specified, the imported wordinfo will be merged into an existing database. Run dbExpImp -h for more information. -h : help Examples: dbExpImp -e -d mybayes.db -f mybayes.db.export Exports pickled mybayes.db into mybayes.db.export as a csv flat file dbExpImp -i -D mybayes.db -f mybayes.db.export Imports mybayes.db.export into a new DBM mybayes.db dbExpImp -e -i -d mybayes.db -f mybayes.db.export Exports then imports (reorganizes) new pickled mybayes.db dbExpImp -e -d abayes.db -f abayes.export dbExpImp -i -D abayes.db -f abayes.export Converts a bayes database from pickle to DBM dbExpImp -e -d abayes.db -f abayes.export dbExpImp -e -d bbayes.db -f bbayes.export dbExpImp -i -d newbayes.db -f abayes.export dbExpImp -i -m -d newbayes.db -f bbayes.export Creates a new database (newbayes.db) from two databases (abayes.db, bbayes.db) To Do: o Suggestions? """ # This module is part of the spambayes project, which is Copyright 2002 # The Python Software Foundation and is covered by the Python Software # Foundation license.
__author__ = "Tim Stone " from __future__ import generators import storage import sys, os, getopt, errno, re import urllib def runExport(dbFN, useDBM, outFN): print "running export on %s" % (dbFN) if useDBM: bayes = storage.DBDictClassifier(dbFN) else: bayes = storage.PickledClassifier(dbFN) try: fp = open(outFN, 'w') except IOError, e: if e.errno != errno.ENOENT: raise nham = bayes.nham; nspam = bayes.nspam; print "nham %s, nspam %s" % (nham, nspam) fp.write("%s,%s,\n" % (nham, nspam)) for word in bayes.wordinfo: hamcount = bayes.wordinfo[word].hamcount spamcount = bayes.wordinfo[word].spamcount word = urllib.quote(word) fp.write("%s`%s`%s`\n" % (word, hamcount, spamcount)) fp.close() def runImport(dbFN, useDBM, newDBM, inFN): if newDBM: try: os.unlink(dbFN) except OSError, e: if e.errno != 2: # errno. raise if useDBM: bayes = storage.DBDictClassifier(dbFN) else: bayes = storage.PickledClassifier(dbFN) try: fp = open(inFN, 'r') except IOError, e: if e.errno != errno.ENOENT: raise nline = fp.readline() print nline (nham, nspam, junk) = re.split(',', nline) if newDBM: bayes.nham = nham bayes.nspam = nspam else: bayes.nham += nham bayes.nspam += nspam lines = fp.readlines() for line in lines: (word, hamcount, spamcount, junk) = re.split('`', line) word = urllib.unquote(word) try: wi = bayes.wordinfo[word] except KeyError: wi = bayes.WordInfoClass() wi.hamcount += int(hamcount) wi.spamcount += int(spamcount) bayes._wordinfoset(word, wi) fp.close() bayes.store() if __name__ == '__main__': try: opts, args = getopt.getopt(sys.argv[1:], 'iehmd:D:f:') except getopt.error, msg: print >>sys.stderr, str(msg) + '\n\n' + __doc__ sys.exit() usePickle = False useDBM = False newDBM = True dbFN = None flatFN = None exp = False imp = False for opt, arg in opts: if opt == '-h': print >>sys.stderr, __doc__ sys.exit() elif opt == '-d': useDBM = False dbFN = arg elif opt == '-D': useDBM = True dbFN = arg elif opt == '-f': flatFN = arg elif opt == '-e': exp = True elif opt == '-i': imp = 
True elif opt == '-m': newDBM = False if (dbFN and flatFN): if exp: runExport(dbFN, useDBM, flatFN) if imp: runImport(dbFN, useDBM, newDBM, flatFN) else: print >>sys.stderr, __doc__ From richiehindle at users.sourceforge.net Fri Jan 31 10:29:51 2003 From: richiehindle at users.sourceforge.net (Richie Hindle) Date: Fri Jan 31 13:29:55 2003 Subject: [Spambayes-checkins] spambayes pop3proxy.py,1.48,1.49 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv15900 Modified Files: pop3proxy.py Log Message: Increase the stack size on MacOS X. Index: pop3proxy.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/pop3proxy.py,v retrieving revision 1.48 retrieving revision 1.49 diff -C2 -d -r1.48 -r1.49 *** pop3proxy.py 29 Jan 2003 03:23:34 -0000 1.48 --- pop3proxy.py 31 Jan 2003 18:29:48 -0000 1.49 *************** *** 150,153 **** --- 150,165 ---- from spambayes.Options import options + # Increase the stack size on MacOS X. Stolen from Lib/test/regrtest.py + if sys.platform == 'darwin': + try: + import resource + except ImportError: + pass + else: + soft, hard = resource.getrlimit(resource.RLIMIT_STACK) + newsoft = min(hard, max(soft, 1024*2048)) + resource.setrlimit(resource.RLIMIT_STACK, (newsoft, hard)) + + # HEADER_EXAMPLE is the longest possible header - the length of this one # is added to the size of each message. From richiehindle at users.sourceforge.net Fri Jan 31 10:32:30 2003 From: richiehindle at users.sourceforge.net (Richie Hindle) Date: Fri Jan 31 13:32:34 2003 Subject: [Spambayes-checkins] spambayes/spambayes PyMeldLite.py,1.6,1.7 Message-ID: Update of /cvsroot/spambayes/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv17740 Modified Files: PyMeldLite.py Log Message: Fix Unicode/ASCII conversion problems with high-bit-set characters. 
Index: PyMeldLite.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/spambayes/PyMeldLite.py,v retrieving revision 1.6 retrieving revision 1.7 diff -C2 -d -r1.6 -r1.7 *** PyMeldLite.py 28 Jan 2003 07:39:34 -0000 1.6 --- PyMeldLite.py 31 Jan 2003 18:32:27 -0000 1.7 *************** *** 489,496 **** # fool it here. def _mungeEntities(self, data): ! return re.sub(r'&(\w+);', r':PyMeldEntity:\1:', data) def _unmungeEntities(self, data): ! return re.sub(r':PyMeldEntity:(\w+):', r'&\1;', data) def feed(self, data): --- 489,496 ---- # fool it here. def _mungeEntities(self, data): ! return re.sub(r'&([A-Za-z0-9#]+);', r':PyMeldEntity:\1:', data) def _unmungeEntities(self, data): ! return re.sub(r':PyMeldEntity:([A-Za-z0-9#]+):', r'&\1;', data) def feed(self, data): From richiehindle at users.sourceforge.net Fri Jan 31 11:13:47 2003 From: richiehindle at users.sourceforge.net (Richie Hindle) Date: Fri Jan 31 14:13:50 2003 Subject: [Spambayes-checkins] spambayes pop3proxy.py,1.49,1.50 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv14036 Modified Files: pop3proxy.py Log Message: Correctly escape clues as they're written into the HTML, to avoid XML parser errors. Index: pop3proxy.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/pop3proxy.py,v retrieving revision 1.49 retrieving revision 1.50 diff -C2 -d -r1.49 -r1.50 *** pop3proxy.py 31 Jan 2003 18:29:48 -0000 1.49 --- pop3proxy.py 31 Jan 2003 19:13:41 -0000 1.50 *************** *** 979,983 **** del cluesTable.cluesRow # Delete dummy row to make way for real ones for word, wordProb in clues: ! cluesTable += cluesRow % (word, wordProb) results = self.html.classifyResults.clone() --- 979,983 ---- del cluesTable.cluesRow # Delete dummy row to make way for real ones for word, wordProb in clues: ! 
cluesTable += cluesRow % (cgi.escape(word), wordProb) results = self.html.classifyResults.clone() *************** *** 998,1006 **** stats.spamprob = state.bayes.probability(wordinfo) else: ! stats = "%r does not exist in the database." % word query = self.html.wordQuery.clone() query.word.value = word ! statsBox = self._buildBox("Statistics for %r" % word, 'status.gif', stats) queryBox = self._buildBox("Word query", 'query.gif', query) --- 998,1006 ---- stats.spamprob = state.bayes.probability(wordinfo) else: ! stats = "%r does not exist in the database." % cgi.escape(word) query = self.html.wordQuery.clone() query.word.value = word ! statsBox = self._buildBox("Statistics for %r" % cgi.escape(word), 'status.gif', stats) queryBox = self._buildBox("Word query", 'query.gif', query) From richiehindle at users.sourceforge.net Fri Jan 31 11:59:54 2003 From: richiehindle at users.sourceforge.net (Richie Hindle) Date: Fri Jan 31 14:59:58 2003 Subject: [Spambayes-checkins] spambayes/spambayes __init__.py,1.3,1.4 Message-ID: Update of /cvsroot/spambayes/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv16019 Modified Files: __init__.py Log Message: Uprevved to 1.0a2 Index: __init__.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/spambayes/__init__.py,v retrieving revision 1.3 retrieving revision 1.4 diff -C2 -d -r1.3 -r1.4 *** __init__.py 17 Jan 2003 06:47:35 -0000 1.3 --- __init__.py 31 Jan 2003 19:59:52 -0000 1.4 *************** *** 1,3 **** # package marker. ! __version__ = '1.0a1' --- 1,3 ---- # package marker. ! 
__version__ = '1.0a2' From richiehindle at users.sourceforge.net Fri Jan 31 12:01:54 2003 From: richiehindle at users.sourceforge.net (Richie Hindle) Date: Fri Jan 31 15:01:57 2003 Subject: [Spambayes-checkins] spambayes dbExpImp.py,NONE,1.1 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv17084 Added Files: dbExpImp.py Log Message: Moved this from the spambayes package to the scripts area. --- NEW FILE: dbExpImp.py --- #! /usr/bin/env python """dbExpImp.py - Bayes database export/import Classes: Abstract: This utility has the primary function of exporting and importing a spambayes database into/from a flat file. This is useful in a number of scenarios. Platform portability of database - flat files can be exported and imported across platforms (winduhs and linux, for example) Database implementation changes - databases can survive database implementation upgrades or new database implementations. For example, if a dbm implementation changes between python x.y and python x.y+1... Database reorganization - an export followed by an import reorgs an existing database, improving performance, at least in some database implementations Database sharing - it is possible to distribute particular databases for research purposes, database sharing purposes, or for new users to have a 'seed' database to start with. Database merging - multiple databases can be merged into one quite easily by simply specifying -m on an import. This will add the two database nham and nspams together (assuming the two databases do not share corpora) and for wordinfo conflicts, will add spamcount and hamcount together. Spambayes software release migration - an export can be executed before a release upgrade, as part of the installation script. Then, after the new software is installed, an import can be executed, which will effectively preserve existing training. This eliminates the need for retraining every time a release is installed. Others?
I'm sure I haven't thought of everything... Usage: dbExpImp [options] options: -e : export -i : import -f: FN : flat file to export to or import from -d: FN : name of pickled database file to use -D: FN : name of dbm database file to use -m : merge import into an existing database file. This is meaningful only for import. If omitted, a new database file will be created. If specified, the imported wordinfo will be merged into an existing database. Run dbExpImp -h for more information. -h : help Examples: dbExpImp -e -d mybayes.db -f mybayes.db.export Exports pickled mybayes.db into mybayes.db.export as a csv flat file dbExpImp -i -D mybayes.db -f mybayes.db.export Imports mybayes.db.export into a new DBM mybayes.db dbExpImp -e -i -d mybayes.db -f mybayes.db.export Exports then imports (reorganizes) new pickled mybayes.db dbExpImp -e -d abayes.db -f abayes.export dbExpImp -i -D abayes.db -f abayes.export Converts a bayes database from pickle to DBM dbExpImp -e -d abayes.db -f abayes.export dbExpImp -e -d bbayes.db -f bbayes.export dbExpImp -i -d newbayes.db -f abayes.export dbExpImp -i -m -d newbayes.db -f bbayes.export Creates a new database (newbayes.db) from two databases (abayes.db, bbayes.db) To Do: o Suggestions? """ # This module is part of the spambayes project, which is Copyright 2002 # The Python Software Foundation and is covered by the Python Software # Foundation license.
__author__ = "Tim Stone " from __future__ import generators import storage import sys, os, getopt, errno, re import urllib def runExport(dbFN, useDBM, outFN): print "running export on %s" % (dbFN) if useDBM: bayes = storage.DBDictClassifier(dbFN) else: bayes = storage.PickledClassifier(dbFN) try: fp = open(outFN, 'w') except IOError, e: if e.errno != errno.ENOENT: raise nham = bayes.nham; nspam = bayes.nspam; print "nham %s, nspam %s" % (nham, nspam) fp.write("%s,%s,\n" % (nham, nspam)) for word in bayes.wordinfo: hamcount = bayes.wordinfo[word].hamcount spamcount = bayes.wordinfo[word].spamcount word = urllib.quote(word) fp.write("%s`%s`%s`\n" % (word, hamcount, spamcount)) fp.close() def runImport(dbFN, useDBM, newDBM, inFN): if newDBM: try: os.unlink(dbFN) except OSError, e: if e.errno != 2: # errno. raise if useDBM: bayes = storage.DBDictClassifier(dbFN) else: bayes = storage.PickledClassifier(dbFN) try: fp = open(inFN, 'r') except IOError, e: if e.errno != errno.ENOENT: raise nline = fp.readline() print nline (nham, nspam, junk) = re.split(',', nline) if newDBM: bayes.nham = nham bayes.nspam = nspam else: bayes.nham += nham bayes.nspam += nspam lines = fp.readlines() for line in lines: (word, hamcount, spamcount, junk) = re.split('`', line) word = urllib.unquote(word) try: wi = bayes.wordinfo[word] except KeyError: wi = bayes.WordInfoClass() wi.hamcount += int(hamcount) wi.spamcount += int(spamcount) bayes._wordinfoset(word, wi) fp.close() bayes.store() if __name__ == '__main__': try: opts, args = getopt.getopt(sys.argv[1:], 'iehmd:D:f:') except getopt.error, msg: print >>sys.stderr, str(msg) + '\n\n' + __doc__ sys.exit() usePickle = False useDBM = False newDBM = True dbFN = None flatFN = None exp = False imp = False for opt, arg in opts: if opt == '-h': print >>sys.stderr, __doc__ sys.exit() elif opt == '-d': useDBM = False dbFN = arg elif opt == '-D': useDBM = True dbFN = arg elif opt == '-f': flatFN = arg elif opt == '-e': exp = True elif opt == '-i': imp = 
True elif opt == '-m': newDBM = False if (dbFN and flatFN): if exp: runExport(dbFN, useDBM, flatFN) if imp: runImport(dbFN, useDBM, newDBM, flatFN) else: print >>sys.stderr, __doc__ From richiehindle at users.sourceforge.net Fri Jan 31 12:03:00 2003 From: richiehindle at users.sourceforge.net (Richie Hindle) Date: Fri Jan 31 15:28:50 2003 Subject: [Spambayes-checkins] spambayes/spambayes dbExpImp.py,1.1,NONE Message-ID: Update of /cvsroot/spambayes/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv17595 Removed Files: dbExpImp.py Log Message: Moved this from the spambayes package to the scripts area. --- dbExpImp.py DELETED ---