From montanaro@users.sourceforge.net Tue Oct 1 01:55:39 2002 From: montanaro@users.sourceforge.net (Skip Montanaro) Date: Mon, 30 Sep 2002 17:55:39 -0700 Subject: [Spambayes-checkins] spambayes Options.py,1.40,1.41 Message-ID: Update of /cvsroot/spambayes/spambayes In directory usw-pr-cvs1:/tmp/cvs-serv14518 Modified Files: Options.py Log Message: adjust comment for generate_long_skips option Index: Options.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Options.py,v retrieving revision 1.40 retrieving revision 1.41 diff -C2 -d -r1.40 -r1.41 *** Options.py 30 Sep 2002 21:56:27 -0000 1.40 --- Options.py 1 Oct 2002 00:55:37 -0000 1.41 *************** *** 93,100 **** mine_received_headers: False ! # If your ham corpus is generated from sources which contain few, if any ! # attachments you probably want to leave this alone. If you have many ! # legitimate correspondents who send you attachments (Excel spreadsheets, ! # etc), you might want to set this to False. generate_long_skips: True --- 93,101 ---- mine_received_headers: False ! # If legitimate mail contains things that look like text to the tokenizer ! # and turning turning off this option helps (perhaps binary attachments get ! # 'defanged' by something upstream from this operation and thus look like ! # text), this may help, and should be an alert that perhaps the tokenizer is ! # broken. generate_long_skips: True From nascheme@users.sourceforge.net Tue Oct 1 02:31:42 2002 From: nascheme@users.sourceforge.net (Neil Schemenauer) Date: Mon, 30 Sep 2002 18:31:42 -0700 Subject: [Spambayes-checkins] spambayes classifier.py,1.25,1.26 Message-ID: Update of /cvsroot/spambayes/spambayes In directory usw-pr-cvs1:/tmp/cvs-serv27379 Modified Files: classifier.py Log Message: Allow spamprob to be passed to the WordInfo constructor. 
Index: classifier.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/classifier.py,v retrieving revision 1.25 retrieving revision 1.26 diff -C2 -d -r1.25 -r1.26 *** classifier.py 29 Sep 2002 18:03:39 -0000 1.25 --- classifier.py 1 Oct 2002 01:31:40 -0000 1.26 *************** *** 57,64 **** # a word is no longer being used, it's just wasting space. ! def __init__(self, atime): self.atime = atime self.spamcount = self.hamcount = self.killcount = 0 ! self.spamprob = None def __repr__(self): --- 57,64 ---- # a word is no longer being used, it's just wasting space. ! def __init__(self, atime, spamprob=None): self.atime = atime self.spamcount = self.hamcount = self.killcount = 0 ! self.spamprob = spamprob def __repr__(self): From nascheme@users.sourceforge.net Tue Oct 1 02:33:46 2002 From: nascheme@users.sourceforge.net (Neil Schemenauer) Date: Mon, 30 Sep 2002 18:33:46 -0700 Subject: [Spambayes-checkins] spambayes neilfilter.py,1.2,1.3 Message-ID: Update of /cvsroot/spambayes/spambayes In directory usw-pr-cvs1:/tmp/cvs-serv28286 Modified Files: neilfilter.py Log Message: Update to work with the "Robinson" classifier. Reuse the code from the classifier module by using a Cdb wrapper class that creates the WordInfo objects. Index: neilfilter.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/neilfilter.py,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** neilfilter.py 20 Sep 2002 03:14:42 -0000 1.2 --- neilfilter.py 1 Oct 2002 01:33:44 -0000 1.3 *************** *** 10,19 **** import socket import email - from heapq import heapreplace - from sets import Set import cdb from tokenizer import tokenize ! from classifier import MIN_SPAMPROB, MAX_SPAMPROB, UNKNOWN_SPAMPROB, \ ! 
MAX_DISCRIMINATORS program = sys.argv[0] # For usage(); referenced by docstring above --- 10,16 ---- import socket import email import cdb from tokenizer import tokenize ! import classifier program = sys.argv[0] # For usage(); referenced by docstring above *************** *** 21,99 **** BLOCK_SIZE = 10000 SIZE_LIMIT = 5000000 # messages larger are not analyzed ! SPAM_THRESHOLD = 0.9 ! ! def spamprob(wordprobs, wordstream): ! """Return best-guess probability that wordstream is spam. ! ! wordprobs is a CDB of word probabilities ! ! wordstream is an iterable object producing words. ! The return value is a float in [0.0, 1.0]. ! """ ! ! # A priority queue to remember the MAX_DISCRIMINATORS best ! # probabilities, where "best" means largest distance from 0.5. ! # The tuples are (distance, prob, word). ! nbest = [(-1.0, None, None)] * MAX_DISCRIMINATORS ! smallest_best = -1.0 ! ! mins = [] # all words w/ prob MIN_SPAMPROB ! maxs = [] # all words w/ prob MAX_SPAMPROB ! # Counting a unique word multiple times hurts, although counting one ! # at most two times had some benefit whan UNKNOWN_SPAMPROB was 0.2. ! # When that got boosted to 0.5, counting more than once became ! # counterproductive. ! for word in Set(wordstream): ! prob = float(wordprobs.get(word, UNKNOWN_SPAMPROB)) ! distance = abs(prob - 0.5) ! if prob == MIN_SPAMPROB: ! mins.append((distance, prob, word)) ! elif prob == MAX_SPAMPROB: ! maxs.append((distance, prob, word)) ! elif distance > smallest_best: ! # Subtle: we didn't use ">" instead of ">=" just to save ! # calls to heapreplace(). The real intent is that if ! # there are many equally strong indicators throughout the ! # message, we want to favor the ones that appear earliest: ! # it's expected that spam headers will often have smoking ! # guns, and, even when not, spam has to grab your attention ! # early (& note that when spammers generate large blocks of ! # random gibberish to throw off exact-match filters, it's ! 
# always at the end of the msg -- if they put it at the ! # start, *nobody* would read the msg). ! heapreplace(nbest, (distance, prob, word)) ! smallest_best = nbest[0][0] ! ! # Compute the probability. Note: This is what Graham's code did, ! # but it's dubious for reasons explained in great detail on Python- ! # Dev: it's missing P(spam) and P(not-spam) adjustments that ! # straightforward Bayesian analysis says should be here. It's ! # unclear how much it matters, though, as the omissions here seem ! # to tend in part to cancel out distortions introduced earlier by ! # HAMBIAS. Experiments will decide the issue. ! # First cancel out competing extreme clues (see comment block at ! # MAX_DISCRIMINATORS declaration -- this is a twist on Graham). ! if mins or maxs: ! if len(mins) < len(maxs): ! shorter, longer = mins, maxs else: ! shorter, longer = maxs, mins ! tokeep = min(len(longer) - len(shorter), MAX_DISCRIMINATORS) ! # They're all good clues, but we're only going to feed the tokeep ! # initial clues from the longer list into the probability ! # computation. ! for x in longer[:tokeep]: ! heapreplace(nbest, x) ! ! prob_product = inverse_prob_product = 1.0 ! for distance, prob, word in nbest: ! if prob is None: # it's one of the dummies nbest started with ! continue ! prob_product *= prob ! inverse_prob_product *= 1.0 - prob ! ! prob = prob_product / (prob_product + inverse_prob_product) ! return prob def maketmp(dir): --- 18,37 ---- BLOCK_SIZE = 10000 SIZE_LIMIT = 5000000 # messages larger are not analyzed ! SPAM_CUTOFF = 0.57 ! class CdbWrapper(cdb.Cdb): ! def get(self, key, default=None, ! cdb_get=cdb.Cdb.get, ! WordInfo=classifier.WordInfo): ! prob = cdb_get(self, key, default) ! if prob is None: ! return None else: ! return WordInfo(0, float(prob)) ! ! class CdbBayes(classifier.Bayes): ! def __init__(self, cdbfile): ! classifier.Bayes.__init__(self) ! 
self.wordinfo = CdbWrapper(cdbfile) def maketmp(dir): *************** *** 156,165 **** msg = email.message_from_string(msgdata) del msgdata ! wordprobs = cdb.Cdb(open(wordprobfilename, 'rb')) ! prob = spamprob(wordprobs, tokenize(msg)) else: prob = 0.0 ! if prob > SPAM_THRESHOLD: os.rename(pathname, "%s/new/%s" % (spamdir, filename)) else: --- 94,103 ---- msg = email.message_from_string(msgdata) del msgdata ! bayes = CdbBayes(open(wordprobfilename, 'rb')) ! prob = bayes.spamprob(tokenize(msg)) else: prob = 0.0 ! if prob > SPAM_CUTOFF: os.rename(pathname, "%s/new/%s" % (spamdir, filename)) else: From npickett@users.sourceforge.net Tue Oct 1 16:07:48 2002 From: npickett@users.sourceforge.net (Neale Pickett) Date: Tue, 01 Oct 2002 08:07:48 -0700 Subject: [Spambayes-checkins] spambayes hammie.py,1.26,1.27 Message-ID: Update of /cvsroot/spambayes/spambayes In directory usw-pr-cvs1:/tmp/cvs-serv29544 Modified Files: hammie.py Log Message: * Uses options.spam_cutoff now, instead of hard-coded 0.9. (Thanks to Richie Hindle for the heads up while I was dealing with a RL intrusion) * Grammar fix :) Index: hammie.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/hammie.py,v retrieving revision 1.26 retrieving revision 1.27 diff -C2 -d -r1.26 -r1.27 *** hammie.py 27 Sep 2002 22:38:53 -0000 1.26 --- hammie.py 1 Oct 2002 15:07:45 -0000 1.27 *************** *** 44,47 **** --- 44,48 ---- import mboxutils import classifier + from Options import options program = sys.argv[0] # For usage(); referenced by docstring above *************** *** 54,58 **** # Probability at which a message is considered spam ! SPAM_THRESHOLD = 0.9 # Tim's tokenizer kicks far more booty than anything I would have --- 55,59 ---- # Probability at which a message is considered spam ! SPAM_THRESHOLD = options.spam_cutoff # Tim's tokenizer kicks far more booty than anything I would have *************** *** 140,149 **** """A persistent Bayes classifier. ! 
This is just like classifier.Bayes, except that the dictionary ! is a database. You take less disk this way, I think, and you can ! pretend it's persistent. It's much slower training, but much faster ! checking, and takes less memory all around. ! On destruction, an instantiation of this class will write it's state to a special key. When you instantiate a new one, it will attempt to read these values out of that key again, so you can pick up where --- 141,151 ---- """A persistent Bayes classifier. ! This is just like classifier.Bayes, except that the dictionary is a ! database. You take less disk this way and you can pretend it's ! persistent. The tradeoffs vs. a pickle are: 1. it's slower ! training, but faster checking, and 2. it needs less memory to run, ! but takes more space on the hard drive. ! On destruction, an instantiation of this class will write its state to a special key. When you instantiate a new one, it will attempt to read these values out of that key again, so you can pick up where From npickett@users.sourceforge.net Tue Oct 1 18:53:56 2002 From: npickett@users.sourceforge.net (Neale Pickett) Date: Tue, 01 Oct 2002 10:53:56 -0700 Subject: [Spambayes-checkins] spambayes runtest.sh,1.4,1.5 Message-ID: Update of /cvsroot/spambayes/spambayes In directory usw-pr-cvs1:/tmp/cvs-serv24576 Modified Files: runtest.sh Log Message: * Took out old tests nobody needs to run anymore. 
If you want them, get an older revision :) Index: runtest.sh =================================================================== RCS file: /cvsroot/spambayes/spambayes/runtest.sh,v retrieving revision 1.4 retrieving revision 1.5 diff -C2 -d -r1.4 -r1.5 *** runtest.sh 27 Sep 2002 19:40:22 -0000 1.4 --- runtest.sh 1 Oct 2002 17:53:54 -0000 1.5 *************** *** 50,96 **** python cmp.py run1s run2s | tee results.txt ;; - robinson1) - # This test requires you have an appropriately-modified - # Tester.py.new and classifier.py.new as detailed in - # - - python timcv.py -n $SETS > run1.txt - - mv Tester.py Tester.py.orig - cp Tester.py.new Tester.py - mv classifier.py classifier.py.orig - cp classifier.py.new classifier.py - python timcv.py -n $SETS > run2.txt - - python rates.py run1 run2 > runrates.txt - - python cmp.py run1s run2s | tee results.txt - - mv Tester.py.orig Tester.py - mv classifier.py.orig classifier.py - ;; - mass) - ## Tim took this code out, don't run this test. I'm leaving - ## this stuff in here for the time being so I can refer to it - ## later when I need to do this sort of thing again :) - - # Clear out .ini file - rm -f bayescustomize.ini - # Run 1 - python timcv.py -n $SETS > run1.txt - # New .ini file - cat > bayescustomize.ini < run2.txt - # Generate rates - python rates.py run1 run2 > runrates.txt - # Compare rates - python cmp.py run1s run2s | tee results.txt - ;; esac --- 50,52 ---- From tim_one@users.sourceforge.net Wed Oct 2 17:05:29 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Wed, 02 Oct 2002 09:05:29 -0700 Subject: [Spambayes-checkins] spambayes neilfilter.py,1.3,1.4 Message-ID: Update of /cvsroot/spambayes/spambayes In directory usw-pr-cvs1:/tmp/cvs-serv1262 Modified Files: neilfilter.py Log Message: Whitespace normalization. 
Index: neilfilter.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/neilfilter.py,v retrieving revision 1.3 retrieving revision 1.4 diff -C2 -d -r1.3 -r1.4 *** neilfilter.py 1 Oct 2002 01:33:44 -0000 1.3 --- neilfilter.py 2 Oct 2002 16:05:27 -0000 1.4 *************** *** 29,33 **** else: return WordInfo(0, float(prob)) ! class CdbBayes(classifier.Bayes): def __init__(self, cdbfile): --- 29,33 ---- else: return WordInfo(0, float(prob)) ! class CdbBayes(classifier.Bayes): def __init__(self, cdbfile): From tim_one@users.sourceforge.net Fri Oct 4 03:29:22 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Thu, 03 Oct 2002 19:29:22 -0700 Subject: [Spambayes-checkins] spambayes Histogram.py,NONE,1.1 TestDriver.py,1.18,1.19 Message-ID: Update of /cvsroot/spambayes/spambayes In directory usw-pr-cvs1:/tmp/cvs-serv17494 Modified Files: TestDriver.py Added Files: Histogram.py Log Message: Split the histogram class into its own file, greatly robustified the numerics, and made it a lot more flexible and memory-consuming. This is to help make progress on the central-limit scheme, where we still have a poor idea of what the zscore distributions look like. The histogram class is flexible enough to give us nice pictures of that now. Note that new min, max, and median statistics are displayed for all histograms now (and computing percentile cutoffs would be easy to add). Note that if you have a histogram object, you can now pass the # of buckets desired to the display() method (no code exploits that yet, but it means, e.g., that if you discover you really wanted more buckets, you potentially don't have to rerun the test). HistToGNU.py in particular may be able to exploit that immediately. --- NEW FILE: Histogram.py --- import math from Options import options class Hist: """Simple histograms of float values.""" # Pass None for lo and hi and it will automatically adjust to the min # and max values seen. 
# Note: nbuckets can be passed for backward compatibility. The # display() method can be passed a different nbuckets value. def __init__(self, nbuckets=options.nbuckets, lo=0.0, hi=100.0): self.lo, self.hi = lo, hi self.nbuckets = nbuckets self.buckets = [0] * nbuckets self.data = [] # the raw data points self.stats_uptodate = False # Add a value to the collection. def add(self, x): self.data.append(x) self.stats_uptodate = False # Compute, and set as instance attrs: # n # of data points # The rest are set iff n>0: # min smallest value in collection # max largest value in collection # median midpoint # mean # var variance # sdev population standard deviation (sqrt(variance)) # self.data is also sorted. def compute_stats(self): if self.stats_uptodate: return stats_uptodate = True data = self.data n = self.n = len(data) if n == 0: return data.sort() self.min = data[0] self.max = data[-1] if n & 1: self.median = data[n // 2] else: self.median = (data[n // 2] + data[(n-1) // 2]) / 2.0 # Compute mean. # Add in increasing order of magnitude, to minimize roundoff error. if data[0] < 0.0: temp = [(abs(x), x) for x in data] temp.sort() data = [x[1] for x in temp] del temp sum = 0.0 for x in data: sum += x mean = self.mean = sum / n # Compute variance. var = 0.0 for x in data: d = x - mean var += d*d self.var = var / n self.sdev = math.sqrt(self.var) # Merge other into self. def __iadd__(self, other): self.data.extend(other.data) self.stats_uptodate = False return self # Print a histogram to stdout. # Also sets instance var nbuckets to the # of buckets, and # buckts to a list of nbuckets counts, but only if at least one # data point is in the collection. 
def display(self, nbuckets=None, WIDTH=61): if nbuckets <= 0: raise ValueError("nbuckets %g > 0 required" % nbuckets) self.compute_stats() n = self.n if n == 0: return print "%d items; mean %.2f; sdev %.2f" % (n, self.mean, self.sdev) print "-> min %g; median %g; max %g" % (self.min, self.median, self.max) if nbuckets is None: nbuckets = self.nbuckets self.nbuckets = nbuckets self.buckets = buckets = [0] * nbuckets lo, hi = self.lo, self.hi if lo is None: lo = self.min if hi is None: hi = self.max if lo > hi: return # Compute bucket counts. span = float(hi - lo) bucketwidth = span / nbuckets for x in self.data: i = int((x - lo) / bucketwidth) if i >= nbuckets: i = nbuckets - 1 elif i < 0: i = 0 buckets[i] += 1 # hunit is how many items a * represents. A * is printed for # each hunit items, plus any non-zero fraction thereof. biggest = max(self.buckets) hunit, r = divmod(biggest, WIDTH) if r: hunit += 1 print "* =", hunit, "items" # We need ndigits decimal digits to display the largest bucket count. ndigits = len(str(biggest)) # Displaying the bucket boundaries is more troublesome. For now, # just print one digit after the decimal point, regardless of what # the boundaries look like. boundary_digits = max(len(str(int(lo))), len(str(int(hi)))) format = "%" + str(boundary_digits + 2) + '.1f %' + str(ndigits) + "d" for i in range(nbuckets): n = self.buckets[i] print format % (lo + i * bucketwidth, n), print '*' * ((n + hunit - 1) // hunit) Index: TestDriver.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/TestDriver.py,v retrieving revision 1.18 retrieving revision 1.19 diff -C2 -d -r1.18 -r1.19 *** TestDriver.py 28 Sep 2002 03:44:15 -0000 1.18 --- TestDriver.py 4 Oct 2002 02:29:20 -0000 1.19 *************** *** 29,102 **** import Tester import classifier ! class Hist: ! """Simple histograms of float values in [0.0, 1.0].""" ! ! def __init__(self, nbuckets=20): ! self.buckets = [0] * nbuckets ! 
self.nbuckets = nbuckets ! self.n = 0 # number of data points ! self.sum = 0.0 # sum of their values ! self.sumsq = 0.0 # sum of their squares ! ! def add(self, x): ! n = self.nbuckets ! i = int(n * x) ! if i >= n: ! i = n-1 ! self.buckets[i] += 1 ! ! self.n += 1 ! x *= 100.0 ! self.sum += x ! self.sumsq += x*x ! ! def __iadd__(self, other): ! if self.nbuckets != other.nbuckets: ! raise ValueError('bucket size mismatch') ! for i in range(self.nbuckets): ! self.buckets[i] += other.buckets[i] ! self.n += other.n ! self.sum += other.sum ! self.sumsq += other.sumsq ! return self ! ! def display(self, WIDTH=61): ! from math import sqrt ! if self.n > 0: ! mean = self.sum / self.n ! var = self.sumsq / self.n - mean**2 ! # The vagaries of f.p. rounding can make var come out negative. ! # There are ways to fix that, but they're too painful for this ! # part of the code to endure. ! if var < 0.0: ! var = 0.0 ! print "%d items; mean %.2f; sdev %.2f" % (self.n, mean, sqrt(var)) ! ! biggest = max(self.buckets) ! hunit, r = divmod(biggest, WIDTH) ! if r: ! hunit += 1 ! print "* =", hunit, "items" ! ! ndigits = len(str(biggest)) ! format = "%5.1f %" + str(ndigits) + "d" ! ! for i in range(len(self.buckets)): ! n = self.buckets[i] ! print format % (100.0 * i / self.nbuckets, n), ! print '*' * ((n + hunit - 1) // hunit) ! ! def printhist(tag, ham, spam): print print "-> Ham scores for", tag, ! ham.display() print print "-> Spam scores for", tag, ! spam.display() if not options.compute_best_cutoffs_from_histograms: return # Figure out "the best" spam cutoff point, meaning the one that minimizes --- 29,47 ---- import Tester import classifier + from Histogram import Hist ! def printhist(tag, ham, spam, nbuckets=options.nbuckets): print print "-> Ham scores for", tag, ! ham.display(nbuckets) print print "-> Spam scores for", tag, ! 
spam.display(nbuckets) if not options.compute_best_cutoffs_from_histograms: return + if ham.n == 0 or spam.n == 0: + return # Figure out "the best" spam cutoff point, meaning the one that minimizes *************** *** 112,116 **** best_total = fpw * fp + fn bests = [(0, fp, fn)] ! for i in range(ham.nbuckets): # When moving the cutoff beyond bucket i, the ham in bucket i # are redeemed, and the spam in bucket i become false negatives. --- 57,61 ---- best_total = fpw * fp + fn bests = [(0, fp, fn)] ! for i in range(nbuckets): # When moving the cutoff beyond bucket i, the ham in bucket i # are redeemed, and the spam in bucket i become false negatives. *************** *** 127,131 **** i, fp, fn = bests.pop(0) ! print '-> best cutoff for', tag, float(i) / ham.nbuckets print '-> with weighted total %g*%d fp + %d fn = %g' % ( fpw, fp, fn, best_total) --- 72,76 ---- i, fp, fn = bests.pop(0) ! print '-> best cutoff for', tag, float(i) / nbuckets print '-> with weighted total %g*%d fp + %d fn = %g' % ( fpw, fp, fn, best_total) *************** *** 155,160 **** self.falsepos = Set() self.falseneg = Set() ! self.global_ham_hist = Hist(options.nbuckets) ! self.global_spam_hist = Hist(options.nbuckets) self.ntimes_finishtest_called = 0 self.new_classifier() --- 100,105 ---- self.falsepos = Set() self.falseneg = Set() ! self.global_ham_hist = Hist() ! self.global_spam_hist = Hist() self.ntimes_finishtest_called = 0 self.new_classifier() *************** *** 163,168 **** c = self.classifier = classifier.Bayes() self.tester = Tester.Test(c) ! self.trained_ham_hist = Hist(options.nbuckets) ! self.trained_spam_hist = Hist(options.nbuckets) # CAUTION: this just doesn't work for incrememental training when --- 108,113 ---- c = self.classifier = classifier.Bayes() self.tester = Tester.Test(c) ! self.trained_ham_hist = Hist() ! 
self.trained_spam_hist = Hist() # CAUTION: this just doesn't work for incrememental training when *************** *** 192,197 **** self.global_ham_hist += self.trained_ham_hist self.global_spam_hist += self.trained_spam_hist ! self.trained_ham_hist = Hist(options.nbuckets) ! self.trained_spam_hist = Hist(options.nbuckets) self.ntimes_finishtest_called += 1 --- 137,142 ---- self.global_ham_hist += self.trained_ham_hist self.global_spam_hist += self.trained_spam_hist ! self.trained_ham_hist = Hist() ! self.trained_spam_hist = Hist() self.ntimes_finishtest_called += 1 *************** *** 220,229 **** c = self.classifier t = self.tester ! local_ham_hist = Hist(options.nbuckets) ! local_spam_hist = Hist(options.nbuckets) def new_ham(msg, prob, lo=options.show_ham_lo, hi=options.show_ham_hi): ! local_ham_hist.add(prob) if lo <= prob <= hi: print --- 165,174 ---- c = self.classifier t = self.tester ! local_ham_hist = Hist() ! local_spam_hist = Hist() def new_ham(msg, prob, lo=options.show_ham_lo, hi=options.show_ham_hi): ! local_ham_hist.add(prob * 100.0) if lo <= prob <= hi: print *************** *** 234,238 **** def new_spam(msg, prob, lo=options.show_spam_lo, hi=options.show_spam_hi): ! local_spam_hist.add(prob) if lo <= prob <= hi: print --- 179,183 ---- def new_spam(msg, prob, lo=options.show_spam_lo, hi=options.show_spam_hi): ! local_spam_hist.add(prob * 100.0) if lo <= prob <= hi: print From tim_one@users.sourceforge.net Fri Oct 4 03:44:47 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Thu, 03 Oct 2002 19:44:47 -0700 Subject: [Spambayes-checkins] spambayes Histogram.py,1.1,1.2 Message-ID: Update of /cvsroot/spambayes/spambayes In directory usw-pr-cvs1:/tmp/cvs-serv23744 Modified Files: Histogram.py Log Message: Braino repair in compute_stats(). 
Index: Histogram.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Histogram.py,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** Histogram.py 4 Oct 2002 02:29:20 -0000 1.1 --- Histogram.py 4 Oct 2002 02:44:45 -0000 1.2 *************** *** 35,39 **** if self.stats_uptodate: return ! stats_uptodate = True data = self.data n = self.n = len(data) --- 35,39 ---- if self.stats_uptodate: return ! self.stats_uptodate = True data = self.data n = self.n = len(data) From tim_one@users.sourceforge.net Fri Oct 4 04:01:32 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Thu, 03 Oct 2002 20:01:32 -0700 Subject: [Spambayes-checkins] spambayes HistToGNU.py,1.6,1.7 Histogram.py,1.2,1.3 Message-ID: Update of /cvsroot/spambayes/spambayes In directory usw-pr-cvs1:/tmp/cvs-serv29721 Modified Files: HistToGNU.py Histogram.py Log Message: Ack, I'm sure my histogram refactoring broke HistToGNU.py, but can't test it conveniently. Lots of fiddling so that it's no longer obviously broken. Index: HistToGNU.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/HistToGNU.py,v retrieving revision 1.6 retrieving revision 1.7 diff -C2 -d -r1.6 -r1.7 *** HistToGNU.py 27 Sep 2002 21:04:05 -0000 1.6 --- HistToGNU.py 4 Oct 2002 03:01:29 -0000 1.7 *************** *** 41,46 **** return pickle.load(file(path)) ! def outputHist(hist,f=sys.stdout): """Output the Hist object to file f""" for i in range(len(hist.buckets)): n = hist.buckets[i] --- 41,47 ---- return pickle.load(file(path)) ! def outputHist(hist, f=sys.stdout): """Output the Hist object to file f""" + hist.fill_buckets() for i in range(len(hist.buckets)): n = hist.buckets[i] *************** *** 67,72 **** try: ! opts, args = getopt.getopt(sys.argv[1:], '', ! []) except getopt.error, msg: usage(1, msg) --- 68,72 ---- try: ! 
opts, args = getopt.getopt(sys.argv[1:], '', []) except getopt.error, msg: usage(1, msg) Index: Histogram.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Histogram.py,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** Histogram.py 4 Oct 2002 02:44:45 -0000 1.2 --- Histogram.py 4 Oct 2002 03:01:30 -0000 1.3 *************** *** 72,75 **** --- 72,110 ---- return self + def get_lo_hi(self): + self.compute_stats() + lo, hi = self.lo, self.hi + if lo is None: + lo = self.min + if hi is None: + hi = self.max + return lo, hi + + def get_bucketwidth(self): + lo, hi = self.get_lo_hi() + span = float(hi - lo) + return span / self.nbuckets + + # Set instance var nbuckets to the # of buckets, and buckets to a list + # of nbuckets counts. + def fill_buckets(self, nbuckets=None): + if nbuckets is None: + nbuckets = self.nbuckets + if nbuckets <= 0: + raise ValueError("nbuckets %g > 0 required" % nbuckets) + self.nbuckets = nbuckets + self.buckets = buckets = [0] * nbuckets + + # Compute bucket counts. + lo, hi = self.get_lo_hi() + bucketwidth = self.get_bucketwidth() + for x in self.data: + i = int((x - lo) / bucketwidth) + if i >= nbuckets: + i = nbuckets - 1 + elif i < 0: + i = 0 + buckets[i] += 1 + # Print a histogram to stdout. # Also sets instance var nbuckets to the # of buckets, and *************** *** 87,116 **** self.median, self.max) ! if nbuckets is None: ! nbuckets = self.nbuckets ! self.nbuckets = nbuckets ! self.buckets = buckets = [0] * nbuckets ! ! lo, hi = self.lo, self.hi ! if lo is None: ! lo = self.min ! if hi is None: ! hi = self.max if lo > hi: return - # Compute bucket counts. - span = float(hi - lo) - bucketwidth = span / nbuckets - for x in self.data: - i = int((x - lo) / bucketwidth) - if i >= nbuckets: - i = nbuckets - 1 - elif i < 0: - i = 0 - buckets[i] += 1 - # hunit is how many items a * represents. 
A * is printed for # each hunit items, plus any non-zero fraction thereof. biggest = max(self.buckets) hunit, r = divmod(biggest, WIDTH) --- 122,132 ---- self.median, self.max) ! lo, hi = self.get_lo_hi() if lo > hi: return # hunit is how many items a * represents. A * is printed for # each hunit items, plus any non-zero fraction thereof. + self.fill_buckets(nbuckets) biggest = max(self.buckets) hunit, r = divmod(biggest, WIDTH) *************** *** 128,131 **** --- 144,148 ---- format = "%" + str(boundary_digits + 2) + '.1f %' + str(ndigits) + "d" + bucketwidth = self.get_bucketwidth() for i in range(nbuckets): n = self.buckets[i] From tim_one@users.sourceforge.net Fri Oct 4 19:17:51 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Fri, 04 Oct 2002 11:17:51 -0700 Subject: [Spambayes-checkins] spambayes classifier.py,1.26,1.27 Message-ID: Update of /cvsroot/spambayes/spambayes In directory usw-pr-cvs1:/tmp/cvs-serv2805 Modified Files: classifier.py Log Message: Changed the central-limit schemes to produce two "pseudo clues" at the front of the clue list: prob('*zham*') = -75.8654 prob('*zspam*') = -48.3459 This is an easy way to get the zscores back to the testing framework without changing gads of interfaces. 
Index: classifier.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/classifier.py,v retrieving revision 1.26 retrieving revision 1.27 diff -C2 -d -r1.26 -r1.27 *** classifier.py 1 Oct 2002 01:31:40 -0000 1.26 --- classifier.py 4 Oct 2002 18:17:48 -0000 1.27 *************** *** 455,458 **** --- 455,460 ---- clues = [(word, prob) for prob, word, record in clues] clues.sort(lambda a, b: cmp(a[1], b[1])) + clues.insert(0, ('*zspam*', zspam)) + clues.insert(0, ('*zham*', zham)) return stat, clues else: *************** *** 544,547 **** --- 546,551 ---- clues = [(word, prob) for prob, word, record in clues] clues.sort(lambda a, b: cmp(a[1], b[1])) + clues.insert(0, ('*zspam*', zspam)) + clues.insert(0, ('*zham*', zham)) return stat, clues else: From tim_one@users.sourceforge.net Fri Oct 4 19:37:54 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Fri, 04 Oct 2002 11:37:54 -0700 Subject: [Spambayes-checkins] spambayes Options.py,1.41,1.42 classifier.py,1.27,1.28 Message-ID: Update of /cvsroot/spambayes/spambayes In directory usw-pr-cvs1:/tmp/cvs-serv10764 Modified Files: Options.py classifier.py Log Message: Check in enough stuff so that bold experimenters can at least *try* the central-limit code without crazy results. Index: Options.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Options.py,v retrieving revision 1.41 retrieving revision 1.42 diff -C2 -d -r1.41 -r1.42 *** Options.py 1 Oct 2002 00:55:37 -0000 1.41 --- Options.py 4 Oct 2002 18:37:52 -0000 1.42 *************** *** 218,221 **** --- 218,233 ---- # probability complements (p and 1-p) instead. use_central_limit2: False + + # For now, a central-limit scheme considers its decision "certain" if the + # ratio of the zscore with larger magnitude to the zscore with smaller + # magnitude exceeds zscore_ratio_cutoff. 
The value here is seat-of-the- + # pants for use_central_limit2; nothing is known about use_central_limit wrt + # this. + # For now, a central-limit scheme delivers just one of 4 scores: + # 0.00 -- certain it's ham + # 0.49 -- guesses ham but is unsure + # 0.51 -- guesses spam but is unsure + # 1.00 -- certain it's spam + zscore_ratio_cutoff: 1.9 """ *************** *** 264,267 **** --- 276,280 ---- 'use_central_limit': boolean_cracker, 'use_central_limit2': boolean_cracker, + 'zscore_ratio_cutoff': float_cracker, }, } Index: classifier.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/classifier.py,v retrieving revision 1.27 retrieving revision 1.28 diff -C2 -d -r1.27 -r1.28 *** classifier.py 4 Oct 2002 18:17:48 -0000 1.27 --- classifier.py 4 Oct 2002 18:37:52 -0000 1.28 *************** *** 442,454 **** zham = (mean - self.hammean) / sqrt(self.hamvar / n) zspam = (mean - self.spammean) / sqrt(self.spamvar / n) ! stat = abs(zham) - abs(zspam) # > 0 for spam, < 0 for ham ! # Normalize into [0, 1]. I'm arbitrarily clipping it to fit in ! # [-20, 20] first. 20 is a massive z-score difference. ! if stat < -20.0: ! stat = -20.0 ! elif stat > 20.0: ! stat = 20.0 ! stat = 0.5 + stat / 40.0 if evidence: --- 442,458 ---- zham = (mean - self.hammean) / sqrt(self.hamvar / n) zspam = (mean - self.spammean) / sqrt(self.spamvar / n) ! delta = abs(zham) - abs(zspam) # > 0 for spam, < 0 for ham ! azham, azspam = abs(zham), abs(zspam) ! if azham < azspam: ! ratio = azspam / max(azham, 1e-10) # guard against 0 division ! else: ! ratio = azham / max(azspam, 1e-10) # guard against 0 division ! certain = ratio > options.zscore_ratio_cutoff ! ! if certain: ! score = delta > 0.0 and 1.0 or 0.0 ! else: ! score = delta > 0.0 and 0.51 or 0.49 if evidence: *************** *** 457,463 **** clues.insert(0, ('*zspam*', zspam)) clues.insert(0, ('*zham*', zham)) ! return stat, clues else: ! 
return stat if options.use_central_limit: --- 461,467 ---- clues.insert(0, ('*zspam*', zspam)) clues.insert(0, ('*zham*', zham)) ! return score, clues else: ! return score if options.use_central_limit: *************** *** 533,545 **** zham = (hmean - self.hammean) / sqrt(self.hamvar / n) zspam = (smean - self.spammean) / sqrt(self.spamvar / n) ! stat = abs(zham) - abs(zspam) # > 0 for spam, < 0 for ham ! # Normalize into [0, 1]. I'm arbitrarily clipping it to fit in ! # [-20, 20] first. 20 is a massive z-score difference. ! if stat < -20.0: ! stat = -20.0 ! elif stat > 20.0: ! stat = 20.0 ! stat = 0.5 + stat / 40.0 if evidence: --- 537,553 ---- zham = (hmean - self.hammean) / sqrt(self.hamvar / n) zspam = (smean - self.spammean) / sqrt(self.spamvar / n) ! delta = abs(zham) - abs(zspam) # > 0 for spam, < 0 for ham ! azham, azspam = abs(zham), abs(zspam) ! if azham < azspam: ! ratio = azspam / max(azham, 1e-10) # guard against 0 division ! else: ! ratio = azham / max(azspam, 1e-10) # guard against 0 division ! certain = ratio > options.zscore_ratio_cutoff ! ! if certain: ! score = delta > 0.0 and 1.0 or 0.0 ! else: ! score = delta > 0.0 and 0.51 or 0.49 if evidence: *************** *** 548,554 **** clues.insert(0, ('*zspam*', zspam)) clues.insert(0, ('*zham*', zham)) ! return stat, clues else: ! return stat if options.use_central_limit2: --- 556,562 ---- clues.insert(0, ('*zspam*', zspam)) clues.insert(0, ('*zham*', zham)) ! return score, clues else: ! return score if options.use_central_limit2: From richiehindle@users.sourceforge.net Fri Oct 4 20:41:39 2002 From: richiehindle@users.sourceforge.net (Richie Hindle) Date: Fri, 04 Oct 2002 12:41:39 -0700 Subject: [Spambayes-checkins] spambayes mboxutils.py,1.1,1.2 Message-ID: Update of /cvsroot/spambayes/spambayes In directory usw-pr-cvs1:/tmp/cvs-serv1024 Modified Files: mboxutils.py Log Message: Open the mailbox file in binary mode. The mailbox module will break if the mailbox file is opened in text mode on Windows. 
Index: mboxutils.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/mboxutils.py,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** mboxutils.py 20 Sep 2002 19:30:52 -0000 1.1 --- mboxutils.py 4 Oct 2002 19:41:36 -0000 1.2 *************** *** 87,91 **** mbox = DirOfTxtFileMailbox(name, _factory) else: ! fp = open(name) mbox = mailbox.PortableUnixMailbox(fp, _factory) return iter(mbox) --- 87,91 ---- mbox = DirOfTxtFileMailbox(name, _factory) else: ! fp = open(name, "rb") mbox = mailbox.PortableUnixMailbox(fp, _factory) return iter(mbox) From tim_one@users.sourceforge.net Fri Oct 4 23:23:56 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Fri, 04 Oct 2002 15:23:56 -0700 Subject: [Spambayes-checkins] spambayes/Outlook2000 - New directory Message-ID: Update of /cvsroot/spambayes/spambayes/Outlook2000 In directory usw-pr-cvs1:/tmp/cvs-serv25026/Outlook2000 Log Message: Directory /cvsroot/spambayes/spambayes/Outlook2000 added to the repository From tim_one@users.sourceforge.net Fri Oct 4 23:28:49 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Fri, 04 Oct 2002 15:28:49 -0700 Subject: [Spambayes-checkins] spambayes/Outlook2000 README.txt,NONE,1.1 filter.py,NONE,1.1 spam.py,NONE,1.1 train.py,NONE,1.1 Message-ID: Update of /cvsroot/spambayes/spambayes/Outlook2000 In directory usw-pr-cvs1:/tmp/cvs-serv25407 Added Files: README.txt filter.py spam.py train.py Log Message: Outlook 2000 tools, contributed by Sean True (thanks!). Changes I (tim) made from the originals: + Stripped "outlook" prefix from file names (they've got their own directory, no need for it), and changed internal docs accordingly. + Normalized line-endings, and whitespace. + Light edits for coding style (wrapped some long lines, added a blank after many commas). + Added a brief guide to the top of README.txt. I haven't yet tried these. 
--- NEW FILE: README.txt --- This directory contains tools for using the classifier with Microsoft Outlook 2000, courtesy of Sean True. Note that you need Python's win32com extensions. train.py Train a classifier from Outlook Mail folders. filter.py Moves msgs among Outlook Mail folders, based on classifier score. spam.py Dump Outlook Mail folders into the spam reservoir. Comments from Sean: This code is extremely rudimentary. I am getting bad output saving very large classifiers in training.py. Somewhere over 4MB, they seem to stop working. Outlook will occasionally complain that folders are corrupted after running filter. Closing and reopening Outlook always seems to restore things, with no fuss. Your mileage may vary. Buyer beware. Worth what you paid. Brad Morgan comments that in an environment with multiple InfoStores (message stores?), my simple folder finder does not work. He uses this work around: =============== # This didn't work: # personalFolders = findFolder(folder, 'Personal Folders') # # The following was required: # (Note: I have two infostores and I've hard-coded the index of # 'Personal Folders') infostores = session.InfoStores print "There are %d infostores" % infostores.Count infostore = infostores[1] print "Infostore = ", infostore.Name personalFolders = infostore.RootFolder ================= It deserves an option to select the infostore wanted by name. Enjoy. Copyright transferred to PSF from Sean D. True and WebReply.com. Licensed under PSF, see Tim Peters for IANAL interpretation. Ask me technical questions, and if your mail doesn't get eaten by a broken spam filter, I'll try to help. -- Sean seant@iname.com --- NEW FILE: filter.py --- # Filter, dump messages to and from Outlook Mail folders # Author: Sean D. 
True, WebReply.Com # October, 2002 # Copyright PSF, license under the PSF license # Make py2exe happy import dbhash, anydbm import sys, os, os.path, cPickle, string, getopt import win32com.client import email import email.Parser from hammie import createbayes, Hammie import classifier def findFolder(f, findName, name=""): folders = f.Folders folder = folders.GetFirst() while folder: nm = "%s/%s" % (name, folder.Name) nm = nm.encode('ascii', 'replace') if nm == findName: return folder try: f = findFolder(folder, findName, nm) if f: return f except: pass folder = folders.GetNext() return None from tokenizer import tokenize def filter(bayes, rootFolder, folderName, targetName=None, over=None, under=None, detail=None): hammie = Hammie(bayes) n = nover = nunder = 0 f = findFolder(rootFolder, folderName) targetf = None if targetName: targetf = findFolder(rootFolder, targetName) if not targetf: print "Can't find folder %s to move messages to" % targetName return messages = f.Messages message = messages.GetFirst() while message: try: headers = "%s" % message.fields[0x7D001E] headers = headers.encode('ascii', 'replace') body = message.Text.encode('ascii', 'replace') n = n + 1 except: message = messages.GetNext() continue text = headers + body prob, clues = hammie.score(text, evidence=1) if over <> None and prob >= over: nover = nover + 1 if detail: print "***Over threshold", prob, over for i in range(1, message.recipients.Count+1): print message.Recipients[i].Address, print message.Subject.encode('ascii','replace') print hammie.formatclues(clues) if targetf: message.MoveTo(targetf.ID) if under <> None and prob <= under: nunder = nunder + 1 if detail: print "***Under threshold", prob, under for i in range(1, message.recipients.Count+1): print message.Recipients[i].Address, print message.Subject.encode('ascii','replace') print hammie.formatclues(clues) if targetf: message.MoveTo(targetf.ID) message = messages.GetNext() print "Total %d, over %d under %d" % (n, nover, nunder) 
def usage(): print "Usage: filter.py --bayes=bayes.pck --from=folder,folder,folder [--to=folder] [--detail] [--over=float|--under=float]" print """Example: python filter.py --from=/Personal/Hotmail,/Personal/ExJunk --over=.35 --detail --to=/SpamMaybe""" def main(): from hammie import createbayes db_name = 'bayes.pck' folders = [] options = ["over=", "under=", "bayes=", "to=", "from=", "detail"] dodetail=targetName=to=over=under= None opts,args = getopt.getopt(sys.argv[1:], None, options) if args: usage() sys.exit(1) for opt, arg in opts: if opt == "--under": under = float(arg) elif opt == "--over": over = float(arg) elif opt == "--bayes": db_name = arg elif opt == "--to": targetName = arg elif opt == "--from": folders = string.split(arg, ",") elif opt == "--detail": dodetail = 1 if not (over or under) or not folders: usage() sys.exit(1) bayes = cPickle.load(open(db_name,'rb')) cwd = os.getcwd() session = win32com.client.Dispatch("MAPI.Session") session.Logon() personalFolders = findFolder(session.GetFolder(''), '/Top of Personal Folders') for folder in folders: print "Filtering %s, over: %s under %s" % (arg, over, under) filter(bayes, personalFolders, folder, targetName, over=over, under=under, detail=dodetail) session.Logoff() session = None print 'Done' if __name__ == "__main__": main() --- NEW FILE: spam.py --- # Dump Outlook Mail folders into the timcv testing reservoirs # Author: Sean D. 
True, WebReply.Com # October, 2002 # Copyright PSF, license under the PSF license import os.path, sys, getopt import re, string import win32com.client def findFolder(f,findName, name=""): folders = f.Folders folder = folders.GetFirst() while folder: nm = "%s/%s" % (name, folder.Name) nm = nm.encode('ascii', 'replace') if nm == findName: return folder try: f = findFolder(folder, findName, nm) if f: return f except: pass folder = folders.GetNext() return None def dumpFolder(rootDir, rootFolder,folderName, isspam): if isspam: outputDirectory = "Data\\Spam\\reservoir" else: outputDirectory = "Data\\Ham\\reservoir" outputDirectory = "%s\\%s" % (rootDir, outputDirectory) f = findFolder(rootFolder, folderName) if f == None: print "Can't find folder", folderName return print "dumping folder %s [%s]" % (folderName, f.ID) messages = f.Messages message = messages.GetFirst() n = 0 while message: outfName = os.path.join(outputDirectory, f.Name) outfName = "%s_%d.txt" % (outfName, n) outfName = string.replace(outfName, " ", "") try: s = "%s" % message.fields[0x7D001E] s = s.encode('ascii', 'replace') except: message = messages.GetNext() continue outf = open(outfName, "w") outf.write(s) outf.write(message.Text.encode('ascii', 'replace')) outf.close() message = messages.GetNext() n=n+1 def usage(): print "Usage: spam.py --spam=folder,folder,folder --ham=folder,folder,folder" print """Example: python spam.py --spam=/JunkMail,/Personal/Hotmail,/Personal/Spam --ham="/Dragon People,/WebReply,/House,/Tenberry,/Receipts and coupons,/Rational and MIT,/Lists/List-mod_python,/Lists/List-other,/List-Webware,/Microsoft,/Fishing,/Ebusiness,/Colo,/Amazon" """ def main(): spam = [] ham = [] options = ["ham=", "spam="] opts,args = getopt.getopt(sys.argv[1:], None, options) if args: usage() sys.exit(1) for opt, arg in opts: if opt == "--spam": spam = string.split(arg, ',') elif opt == "--ham": ham = string.split(arg,',') if not spam and not ham: usage() sys.exit(1) cwd = os.getcwd() session = 
win32com.client.Dispatch("MAPI.Session") session.Logon() personalFolders = findFolder(session.GetFolder(''), '/Top of Personal Folders') for folder in spam: dumpFolder(cwd, personalFolders, folder, 1) for folder in ham: dumpFolder(cwd, personalFolders, folder, 0) session.Logoff() session = None if __name__ == "__main__": main() --- NEW FILE: train.py --- # Train a classifier from Outlook Mail folders # Author: Sean D. True, WebReply.Com # October, 2002 # Copyright PSF, license under the PSF license import sys, os, os.path, getopt, cPickle, string import win32com.client import classifier from tokenizer import tokenize def findFolder(f,findName, name=""): folders = f.Folders folder = folders.GetFirst() while folder: nm = "%s/%s" % (name, folder.Name) nm = nm.encode('ascii','replace') if nm == findName: return folder try: f = findFolder(folder, findName, nm) if f: return f except: pass folder = folders.GetNext() return None def train( bayes, rootFolder,folderName, isspam): f = findFolder(rootFolder, folderName) if not f: print "Can't find folder", folderName return messages = f.Messages if not messages: print "Can't find messages in folder", folderName return message = messages.GetFirst() while message: try: headers = "%s" % message.fields[0x7D001E] headers = headers.encode('ascii', 'replace') body = message.Text.encode('ascii', 'replace') text = headers + body bayes.learn(tokenize(text), isspam, False) except: pass message = messages.GetNext() def usage(): print "Usage: train.py --bayes=bayes.pck --spam=folder,folder,folder --ham=folder,folder,folder" print """Example: python train.py --bayes=bayes.pck --spam=/JunkMail,/Personal/Hotmail,/Personal/Spam --ham="/Dragon People,/WebReply,/House,/Tenberry,/Receipts and coupons,/Rational and MIT,/Lists/List-mod_python,/Lists/List-other,/List-Webware,/Microsoft,/Fishing,/Ebusiness,/Amazon" """ def main(): db_name = 'bayes.pck' spam = [] ham = [] options = ["ham=", "spam=", "bayes="] opts,args = getopt.getopt(sys.argv[1:], 
None, options) if args: usage() sys.exit(1) for opt,arg in opts: if opt == "--spam": spam = string.split(arg, ',') elif opt == "--ham": ham = string.split(arg,',') elif opt == "--bayes": db_name = arg if not spam and not ham: usage() sys.exit(1) cwd = os.getcwd() session = win32com.client.Dispatch("MAPI.Session") session.Logon() personalFolders = findFolder(session.GetFolder(''), '/Top of Personal Folders') bayes = classifier.Bayes() for folder in spam: print "Training with %s as spam" % folder train(bayes, personalFolders,folder, 1) for folder in ham: print "Training with %s as ham" % folder train(bayes, personalFolders,folder, 0) session.Logoff() session = None print 'Updating probabilities...' bayes.update_probabilities() print ("Done with training %s, built with %d examples and %d counter " "examples" % (db_name, bayes.nspam, bayes.nham)) db_name = os.path.join(cwd, db_name) print 'Writing DB...' cPickle.dump(bayes, open(db_name,"wb"), 1) if __name__ == "__main__": main() From tim_one@users.sourceforge.net Sat Oct 5 03:07:47 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Fri, 04 Oct 2002 19:07:47 -0700 Subject: [Spambayes-checkins] spambayes classifier.py,1.28,1.29 Message-ID: Update of /cvsroot/spambayes/spambayes In directory usw-pr-cvs1:/tmp/cvs-serv17337 Modified Files: classifier.py Log Message: Sneak more info into the clues returned by the central limit schemes. Index: classifier.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/classifier.py,v retrieving revision 1.28 retrieving revision 1.29 diff -C2 -d -r1.28 -r1.29 *** classifier.py 4 Oct 2002 18:37:52 -0000 1.28 --- classifier.py 5 Oct 2002 02:07:41 -0000 1.29 *************** *** 459,464 **** clues = [(word, prob) for prob, word, record in clues] clues.sort(lambda a, b: cmp(a[1], b[1])) ! clues.insert(0, ('*zspam*', zspam)) ! 
clues.insert(0, ('*zham*', zham)) return score, clues else: --- 459,469 ---- clues = [(word, prob) for prob, word, record in clues] clues.sort(lambda a, b: cmp(a[1], b[1])) ! extra = [('*zham*', zham), ! ('*zspam*', zspam), ! ('*hmean*', mean), ! ('*smean*', mean), ! ('*n*', n), ! ] ! clues[0:0] = extra return score, clues else: *************** *** 554,559 **** clues = [(word, prob) for prob, word, record in clues] clues.sort(lambda a, b: cmp(a[1], b[1])) ! clues.insert(0, ('*zspam*', zspam)) ! clues.insert(0, ('*zham*', zham)) return score, clues else: --- 559,569 ---- clues = [(word, prob) for prob, word, record in clues] clues.sort(lambda a, b: cmp(a[1], b[1])) ! extra = [('*zham*', zham), ! ('*zspam*', zspam), ! ('*hmean*', hmean), ! ('*smean*', smean), ! ('*n*', n), ! ] ! clues[0:0] = extra return score, clues else: From tim_one@users.sourceforge.net Sat Oct 5 03:53:46 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Fri, 04 Oct 2002 19:53:46 -0700 Subject: [Spambayes-checkins] spambayes clgen.py,NONE,1.1 README.txt,1.30,1.31 Message-ID: Update of /cvsroot/spambayes/spambayes In directory usw-pr-cvs1:/tmp/cvs-serv30600 Modified Files: README.txt Added Files: clgen.py Log Message: A test driver only for use with one of the speculative central-limit schemes. Its purpose is to generate a binary pickle containing internal information about every prediction made. This will go away someday. XXX Still need tools to analyze this data. --- NEW FILE: clgen.py --- #! /usr/bin/env python # A test driver using "the standard" test directory structure, producing # info about the internals of the central-limit schemes. """Usage: %(program)s [options] -n nsets -t int,int,...,int Scores for all predictions are saved at the end to binary pickle clim.pik. This contains two lists of tuples, the first list with a tuple for every ham predicted, the second list with a tuple for every spam predicted. 
Each tuple has these values: tag the msg identifier is_spam True if msg came from a spam Set, False if from a ham Set zham the msg zscore relative to the population ham zspam the msg zscore relative to the population spam hmean the raw mean ham score smean the raw mean spam score n the number of clues used to judge this msg Note that hmean and smean are the same under use_central_limit; they're very likely to differ under use_central_limit2. Where: -h Show usage and exit. -n int Number of Set directories (Data/Spam/Set1, ... and Data/Ham/Set1, ...). This is required. -t int,int,...,int Build a classifier training on these Set directories. This is used to predict against the remaining Set directories. This is required. If you only want to use some of the messages in each set, --ham-keep int The maximum number of msgs to use from each Ham set. The msgs are chosen randomly. See also the -s option. --spam-keep int The maximum number of msgs to use from each Spam set. The msgs are chosen randomly. See also the -s option. -s int A seed for the random number generator. Has no effect unless at least one of {--ham-keep, --spam-keep} is specified. If -s isn't specified, the seed is taken from current time. In addition, an attempt is made to merge bayescustomize.ini into the options. If that exists, it can be used to change the settings in Options.options.
""" from __future__ import generators import sys from heapq import heapreplace from sets import Set import cPickle as pickle from Options import options import TestDriver from TestDriver import printmsg import msgs from Histogram import Hist fname = 'clim.pik' program = sys.argv[0] def usage(code, msg=''): """Print usage message and sys.exit(code).""" if msg: print >> sys.stderr, msg print >> sys.stderr print >> sys.stderr, __doc__ % globals() sys.exit(code) class MyDriver(TestDriver.Driver): def __init__(self): TestDriver.Driver.__init__(self) # tuples of (msg.tag, is_spam, zham, zspam, hmean, smean, n) self.all_ham = [] self.all_spam = [] def test(self, ham, spam): c = self.classifier t = self.tester local_ham_hist = Hist() local_spam_hist = Hist() # clues start with these: # extra = [('*zham*', zham), # ('*zspam*', zspam), # ('*hmean*', hmean), # raw mean as ham # ('*smean*', smean), # raw mean as spam # ('*n*', n), # # For use_central_limit, hmean and smean have the same value. def new_ham(msg, prob, getclues=c.spamprob): local_ham_hist.add(prob * 100.0) prob, clues = getclues(msg, True) stuff = tuple([val for tag, val in clues[:5]]) self.all_ham.append((msg.tag, False) + stuff) def new_spam(msg, prob, getclues=c.spamprob): local_spam_hist.add(prob * 100.0) prob, clues = getclues(msg, True) stuff = tuple([val for tag, val in clues[:5]]) self.all_spam.append((msg.tag, True) + stuff) t.reset_test_results() print "-> Predicting", ham, "&", spam, "..." 
t.predict(spam, True, new_spam) t.predict(ham, False, new_ham) print "-> tested", t.nham_tested, "hams &", t.nspam_tested, \ "spams against", c.nham, "hams &", c.nspam, "spams" print "-> false positive %:", t.false_positive_rate() print "-> false negative %:", t.false_negative_rate() newfpos = Set(t.false_positives()) - self.falsepos self.falsepos |= newfpos print "-> %d new false positives" % len(newfpos) if newfpos: print " new fp:", [e.tag for e in newfpos] if not options.show_false_positives: newfpos = () for e in newfpos: print '*' * 78 prob, clues = c.spamprob(e, True) printmsg(e, prob, clues) newfneg = Set(t.false_negatives()) - self.falseneg self.falseneg |= newfneg print "-> %d new false negatives" % len(newfneg) if newfneg: print " new fn:", [e.tag for e in newfneg] if not options.show_false_negatives: newfneg = () for e in newfneg: print '*' * 78 prob, clues = c.spamprob(e, True) printmsg(e, prob, clues) if options.show_best_discriminators > 0: print print " best discriminators:" stats = [(-1, None)] * options.show_best_discriminators smallest_killcount = -1 for w, r in c.wordinfo.iteritems(): if r.killcount > smallest_killcount: heapreplace(stats, (r.killcount, w)) smallest_killcount = stats[0][0] stats.sort() for count, w in stats: if count < 0: continue r = c.wordinfo[w] print " %r %d %g" % (w, r.killcount, r.spamprob) self.trained_ham_hist = local_ham_hist self.trained_spam_hist = local_spam_hist def ints_to_string(x): return '{' + ','.join(map(str, x)) + '}' def drive(nsets, trainon, predicton): print options.display() spamdirs = [options.spam_directories % i for i in range(1, nsets+1)] hamdirs = [options.ham_directories % i for i in range(1, nsets+1)] train_hamdirs = [hamdirs[i-1] for i in trainon] train_spamdirs = [spamdirs[i-1] for i in trainon] predict_hamdirs = [hamdirs[i-1] for i in predicton] predict_spamdirs = [spamdirs[i-1] for i in predicton] trainints = ints_to_string(trainon) predictints = ints_to_string(predicton) d = MyDriver() hamroot 
= options.ham_directories[:-2] # lose trailing %d spamroot = options.spam_directories[:-2] d.train(msgs.HamStream(hamroot + trainints, train_hamdirs), msgs.SpamStream(spamroot + trainints, train_spamdirs)) c = d.classifier print '-> population hammean', c.hammean, 'hamvar', c.hamvar print '-> population spammean', c.spammean, 'spamvar', c.spamvar d.test(msgs.HamStream(hamroot + predictints, predict_hamdirs), msgs.SpamStream(spamroot + predictints, predict_spamdirs)) d.finishtest() d.alldone() print "Saving all score data to pickle", fname f = file(fname, 'wb') pickle.dump(d.all_ham, f, 1) pickle.dump(d.all_spam, f, 1) f.close() def main(): import getopt try: opts, args = getopt.getopt(sys.argv[1:], 'hn:s:t:', ['ham-keep=', 'spam-keep=']) except getopt.error, msg: usage(1, msg) nsets = seed = hamkeep = spamkeep = trainon = None for opt, arg in opts: if opt == '-h': usage(0) elif opt == '-n': nsets = int(arg) elif opt == '-s': seed = int(arg) elif opt == '-t': trainon = Set(map(int, arg.split(','))) elif opt == '--ham-keep': hamkeep = int(arg) elif opt == '--spam-keep': spamkeep = int(arg) if args: usage(1, "Positional arguments not supported") if nsets is None: usage(1, "-n is required") if not trainon: usage(1, "-t is required") predicton = list(Set(range(1, nsets+1)) - trainon) trainon = list(trainon) predicton.sort() trainon.sort() msgs.setparms(hamkeep, spamkeep, seed) drive(nsets, trainon, predicton) if __name__ == "__main__": main() Index: README.txt =================================================================== RCS file: /cvsroot/spambayes/spambayes/README.txt,v retrieving revision 1.30 retrieving revision 1.31 diff -C2 -d -r1.30 -r1.31 *** README.txt 28 Sep 2002 18:50:51 -0000 1.30 --- README.txt 5 Oct 2002 02:53:43 -0000 1.31 *************** *** 173,176 **** --- 173,186 ---- + Experimental Files + ================== + clgen.py + A test driver only for use with one of the speculative central-limit + schemes. 
Its purpose is to generate a binary pickle containing + internal information about every prediction made. This will go + away someday. + XXX Still need tools to analyze this data. + + Standard Test Data Setup ======================== From tim_one@users.sourceforge.net Sat Oct 5 05:22:52 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Fri, 04 Oct 2002 21:22:52 -0700 Subject: [Spambayes-checkins] spambayes clpik.py,NONE,1.1 README.txt,1.31,1.32 Message-ID: Update of /cvsroot/spambayes/spambayes In directory usw-pr-cvs1:/tmp/cvs-serv25637 Modified Files: README.txt Added Files: clpik.py Log Message: An example analysis program showing how to access the pickles produced by clgen.py, and how to generate potentially interesting histograms from them. --- NEW FILE: clpik.py --- #! /usr/bin/env python # Analyze a clim.pik file. """Usage: %(program)s [options] [central_limit_pickle_file] An example analysis program showing how to access info from a central-limit pickle file created by clgen.py. This program produces histograms of various things. Scores for all predictions are saved at the end of binary pickle clim.pik. This contains two lists of tuples, the first list with a tuple for every ham predicted, the second list with a tuple for every spam predicted. Each tuple has these values: tag the msg identifier is_spam True if msg came from a spam Set, False if from a ham Set zham the msg zscore relative to the population ham zspam the msg zscore relative to the population spam hmean the raw mean ham score smean the raw mean spam score n the number of clues used to judge this msg Note that hmean and smean are the same under use_central_limit; they're very likely to differ under use_central_limit2. Where: -h Show usage and exit. -n int Number of histogram buckets to display. Default 100. If no file is named on the cmdline, clim.pik is used.
""" import sys import cPickle as pickle from Histogram import Hist fname = 'clim.pik' program = sys.argv[0] def usage(code, msg=''): """Print usage message and sys.exit(code).""" if msg: print >> sys.stderr, msg print >> sys.stderr print >> sys.stderr, __doc__ % globals() sys.exit(code) def dump(nbuckets, tag, n, hmean, zham, smean, zspam): for msg, hist in [('# words', n), ('ham mean', hmean), ('ham zscore', zham), ('spam mean', smean), ('spam zscore', zspam)]: print print tag, msg + ':', hist.display(nbuckets) def drive(fname, nbuckets): print 'Reading', fname, '...' f = open(fname, 'rb') ham = pickle.load(f) spam = pickle.load(f) f.close() print 'Building histograms for', len(ham), 'ham &', len(spam), 'spam' ham_n = Hist(lo=None, hi=None) spam_n = Hist(lo=None, hi=None) ham_as_ham_mean = Hist(lo=None, hi=None) ham_as_spam_mean = Hist(lo=None, hi=None) spam_as_ham_mean = Hist(lo=None, hi=None) spam_as_spam_mean = Hist(lo=None, hi=None) ham_as_ham_zscore = Hist(lo=None, hi=None) ham_as_spam_zscore = Hist(lo=None, hi=None) spam_as_ham_zscore = Hist(lo=None, hi=None) spam_as_spam_zscore = Hist(lo=None, hi=None) for msgid, is_spam, zham, zspam, hmean, smean, n in ham: ham_n.add(n) ham_as_ham_mean.add(hmean) ham_as_ham_zscore.add(zham) ham_as_spam_mean.add(smean) ham_as_spam_zscore.add(zspam) dump(nbuckets, 'ham', ham_n, ham_as_ham_mean, ham_as_ham_zscore, ham_as_spam_mean, ham_as_spam_zscore) for msgid, is_spam, zham, zspam, hmean, smean, n in spam: spam_n.add(n) spam_as_ham_mean.add(hmean) spam_as_ham_zscore.add(zham) spam_as_spam_mean.add(smean) spam_as_spam_zscore.add(zspam) dump(nbuckets, 'spam', spam_n, spam_as_ham_mean, spam_as_ham_zscore, spam_as_spam_mean, spam_as_spam_zscore) def main(): import getopt try: opts, args = getopt.getopt(sys.argv[1:], 'hn:', ['ham-keep=', 'spam-keep=']) except getopt.error, msg: usage(1, msg) nbuckets = 100 for opt, arg in opts: if opt == '-h': usage(0) elif opt == '-n': nbuckets = int(arg) fname = 'clim.pik' if args: fname = 
args.pop(0) if args: usage(1, "No more than one positional argument allowed") drive(fname, nbuckets) if __name__ == "__main__": main() Index: README.txt =================================================================== RCS file: /cvsroot/spambayes/spambayes/README.txt,v retrieving revision 1.31 retrieving revision 1.32 diff -C2 -d -r1.31 -r1.32 *** README.txt 5 Oct 2002 02:53:43 -0000 1.31 --- README.txt 5 Oct 2002 04:22:49 -0000 1.32 *************** *** 180,184 **** internal information about every prediction made. This will go away someday. ! XXX Still need tools to analyze this data. --- 180,188 ---- internal information about every prediction made. This will go away someday. ! ! clpik.py ! An example analysis program showing how to access the pickles ! produced by clgen.py, and how to generate potentially interesting ! histograms from them. From tim_one@users.sourceforge.net Sat Oct 5 06:48:42 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Fri, 04 Oct 2002 22:48:42 -0700 Subject: [Spambayes-checkins] spambayes Histogram.py,1.3,1.4 Message-ID: Update of /cvsroot/spambayes/spambayes In directory usw-pr-cvs1:/tmp/cvs-serv22010 Modified Files: Histogram.py Log Message: Display: smarter calculation of bucket boundary format, so that no two displayed bucket boundaries are the same. Index: Histogram.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Histogram.py,v retrieving revision 1.3 retrieving revision 1.4 diff -C2 -d -r1.3 -r1.4 *** Histogram.py 4 Oct 2002 03:01:30 -0000 1.3 --- Histogram.py 5 Oct 2002 05:48:40 -0000 1.4 *************** *** 138,146 **** ndigits = len(str(biggest)) ! # Displaying the bucket boundaries is more troublesome. For now, ! # just print one digit after the decimal point, regardless of what ! # the boundaries look like. ! boundary_digits = max(len(str(int(lo))), len(str(int(hi)))) ! 
format = "%" + str(boundary_digits + 2) + '.1f %' + str(ndigits) + "d" bucketwidth = self.get_bucketwidth() --- 138,153 ---- ndigits = len(str(biggest)) ! # Displaying the bucket boundaries is more troublesome. ! bucketwidth = self.get_bucketwidth() ! whole_digits = max(len(str(int(lo))), ! len(str(int(hi - bucketwidth)))) ! frac_digits = 0 ! while bucketwidth < 1.0: ! # Incrementing by bucketwidth may not change the last displayed ! # digit, so display one more. ! frac_digits += 1 ! bucketwidth *= 10.0 ! format = ("%" + str(whole_digits + 1 + frac_digits) + '.' + ! str(frac_digits) + 'f %' + str(ndigits) + "d") bucketwidth = self.get_bucketwidth() From tim_one@users.sourceforge.net Sat Oct 5 08:18:06 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Sat, 05 Oct 2002 00:18:06 -0700 Subject: [Spambayes-checkins] spambayes Options.py,1.42,1.43 classifier.py,1.29,1.30 Message-ID: Update of /cvsroot/spambayes/spambayes In directory usw-pr-cvs1:/tmp/cvs-serv18408 Modified Files: Options.py classifier.py Log Message: New option use_central_limit3, which shares its spamprob() method with central_limit2, but computes the ham and spam populations via whole msg scores rather than individual word scores. Index: Options.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Options.py,v retrieving revision 1.42 retrieving revision 1.43 diff -C2 -d -r1.42 -r1.43 *** Options.py 4 Oct 2002 18:37:52 -0000 1.42 --- Options.py 5 Oct 2002 07:18:04 -0000 1.43 *************** *** 218,221 **** --- 218,222 ---- # probability complements (p and 1-p) instead. 
use_central_limit2: False + use_central_limit3: False # For now, a central-limit scheme considers its decision "certain" if the *************** *** 276,279 **** --- 277,281 ---- 'use_central_limit': boolean_cracker, 'use_central_limit2': boolean_cracker, + 'use_central_limit3': boolean_cracker, 'zscore_ratio_cutoff': float_cracker, }, Index: classifier.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/classifier.py,v retrieving revision 1.29 retrieving revision 1.30 diff -C2 -d -r1.29 -r1.30 *** classifier.py 5 Oct 2002 02:07:41 -0000 1.29 --- classifier.py 5 Oct 2002 07:18:04 -0000 1.30 *************** *** 570,573 **** return score ! if options.use_central_limit2: spamprob = central_limit_spamprob2 --- 570,606 ---- return score ! if options.use_central_limit2 or options.use_central_limit3: spamprob = central_limit_spamprob2 + + def central_limit_compute_population_stats3(self, msgstream, is_spam): + from math import ldexp, log + + sum = sumsq = n = 0 + for msg in msgstream: + n += 1 + probsum = 0.0 + clues = self._getclues(msg) + for prob, word, record in clues: + if is_spam: + probsum += log(prob) + else: + probsum += log(1.0 - prob) + mean = long(ldexp(probsum / len(clues), 64)) + sum += mean + sumsq += mean * mean + + mean = ldexp(sum, -64) / n + var = sumsq * n - sum**2 + var = ldexp(var, -128) / n**2 + + if is_spam: + self.spamn, self.spamsum, self.spamsumsq = n, sum, sumsq + self.spammean, self.spamvar = mean, var + print 'spammean', self.spammean, 'spamvar', self.spamvar + else: + self.hamn, self.hamsum, self.hamsumsq = n, sum, sumsq + self.hammean, self.hamvar = mean, var + print 'hammean', self.hammean, 'hamvar', self.hamvar + + if options.use_central_limit3: + compute_population_stats = central_limit_compute_population_stats3 From tim_one@users.sourceforge.net Sat Oct 5 22:30:58 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Sat, 05 Oct 2002 14:30:58 -0700 Subject: 
[Spambayes-checkins] spambayes classifier.py,1.30,1.31 Message-ID: Update of /cvsroot/spambayes/spambayes In directory usw-pr-cvs1:/tmp/cvs-serv28590 Modified Files: classifier.py Log Message: _getclues(): There are no schemes remaining that benefit from a very small options.max_discriminators, and the priority queue costs more than it saves unless max_discriminators is small. So now we just save all the clues, and sort them at the end, to find the strongest clues. This is measurably faster at max_discriminators=30, and a stronger win the larger max_discriminators gets. Index: classifier.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/classifier.py,v retrieving revision 1.30 retrieving revision 1.31 diff -C2 -d -r1.30 -r1.31 *** classifier.py 5 Oct 2002 07:18:04 -0000 1.30 --- classifier.py 5 Oct 2002 21:30:55 -0000 1.31 *************** *** 25,29 **** import time - from heapq import heapreplace from sets import Set --- 25,28 ---- *************** *** 179,184 **** if evidence: - clues.sort() clues = [(w, p) for p, w, r in clues] return prob, clues else: --- 178,183 ---- if evidence: clues = [(w, p) for p, w, r in clues] + clues.sort(lambda a, b: cmp(a[1], b[1])) return prob, clues else: *************** *** 347,355 **** unknown = options.robinson_probability_x ! # A priority queue to remember the MAX_DISCRIMINATORS best ! # probabilities, where "best" means largest distance from 0.5. ! # The tuples are (distance, prob, word, record). ! nbest = [(-1.0, None, None, None)] * options.max_discriminators ! smallest_best = -1.0 wordinfoget = self.wordinfo.get --- 346,351 ---- unknown = options.robinson_probability_x ! clues = [] # (distance, prob, word, record) tuples ! pushclue = clues.append wordinfoget = self.wordinfo.get *************** *** 363,372 **** prob = record.spamprob distance = abs(prob - 0.5) ! if distance >= mindist and distance > smallest_best: ! heapreplace(nbest, (distance, prob, word, record)) ! 
smallest_best = nbest[0][0] ! # Return (prob, word, record) for the non-dummies. ! return [t[1:] for t in nbest if t[1] is not None] #************************************************************************ --- 359,370 ---- prob = record.spamprob distance = abs(prob - 0.5) ! if distance >= mindist: ! pushclue((distance, prob, word, record)) ! clues.sort() ! if len(clues) > options.max_discriminators: ! del clues[0 : -options.max_discriminators] ! # Return (prob, word, record). ! return [t[1:] for t in clues] #************************************************************************ From tim_one@users.sourceforge.net Sun Oct 6 00:46:01 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Sat, 05 Oct 2002 16:46:01 -0700 Subject: [Spambayes-checkins] spambayes rmspik.py,NONE,1.1 Message-ID: Update of /cvsroot/spambayes/spambayes In directory usw-pr-cvs1:/tmp/cvs-serv13010 Added Files: rmspik.py Log Message: This is Rob Hooft's central-limit binary-pickle "RMS ZScore" emulator, edited for coding style (long lines, whitespace). --- NEW FILE: rmspik.py --- #! /usr/bin/env python # Analyze a clim.pik file. """Usage: %(program)s [options] [central_limit_pickle_file] An example analysis program showing to access info from a central-limit pickle file created by clgen.py. This program produces histograms of various things. Scores for all predictions are saved at the end of binary pickle clim.pik. This contains two lists of tuples, the first list with a tuple for every ham predicted, the second list with a tuple for every spam predicted. 
Each tuple has these values: tag the msg identifier is_spam True if msg came from a spam Set, False if from a ham Set zham the msg zscore relative to the population ham zspam the msg zscore relative to the population spam hmean the raw mean ham score smean the raw mean spam score n the number of clues used to judge this msg Note that hmean and smean are the same under use_central_limit; they're very likely to differ under use_central_limit2. Where: -h Show usage and exit. If no file is named on the cmdline, clim.pik is used. """ surefactor = 1000 # This is basically the inverse of the accepted fp/fn rate punsure = False # Print unsure decisions (otherwise only sure-but-false) import sys,math,os import cPickle as pickle program = sys.argv[0] def usage(code, msg=''): """Print usage message and sys.exit(code).""" if msg: print >> sys.stderr, msg print >> sys.stderr print >> sys.stderr, __doc__ % globals() sys.exit(code) def chance(x): if x>=0: return 1.0 x=-x/math.sqrt(2) if x<1.4: return 1.0 assert x>=1.4 x=float(x) pre=math.exp(-x**2)/math.sqrt(math.pi)/x post=1-(1/(2*x**2)) return pre*post knownfalse = {} def readknownfalse(): global knownfalse knownfalse = {} try: f = open('knownfalse.dat') except IOError: return for line in f: key, desc = line.split(None, 1) knownfalse[key] = desc[:-1] f.close() print "%d descriptions from knownfalse.dat" % len(knownfalse) def prknown(tag): bn = os.path.basename(tag) if bn in knownfalse: print " ==>", knownfalse[bn] def drive(fname): print 'Reading', fname, '...' 
f = open(fname, 'rb') ham = pickle.load(f) spam = pickle.load(f) f.close() zhamsum2 = 0.0 nham = 0 for msg in ham: if msg[1]: print "spam in ham",msg else: zhamsum2 += msg[2]**2 nham += 1 rmszham = math.sqrt(zhamsum2 / nham) print "Nham=", nham print "RmsZham=", rmszham zspamsum2 = 0.0 nspam = 0 for msg in spam: if not msg[1]: print "ham in spam",msg else: zspamsum2 += msg[3]**2 nspam += 1 rmszspam = math.sqrt(zspamsum2 / nspam) print "Nspam=", nspam print "RmsZspam=", rmszspam #========= Analyze ham print "=" * 70 print "HAM:" nsureok = nunsureok = nunsurenok = nsurenok = 0 for msg in ham: zham = msg[2] / rmszham zspam = msg[3] / rmszspam cham = chance(zham) cspam = chance(zspam) if cham > surefactor*cspam and cham > 0.01: nsureok += 1 # very certain elif cham > cspam: nunsureok += 1 #print "Unsure",msg[0] #prknown(msg[0]) else: if cspam > surefactor*cham and cspam > 0.01: reason = "SURE!" nsurenok += 1 elif cham < 0.01 and cspam < 0.01: reason = "neither?" nunsurenok += 1 elif cham > 0.1 and cspam > 0.1: reason = "both?" nunsurenok += 1 else: reason = "Unsure" nunsurenok += 1 if reason=="SURE!" or punsure: print "FALSE POSITIVE: zham=%.2f zspam=%.2f %s %s" % ( zham, zspam, msg[0], reason) prknown(msg[0]) print "Sure/ok ", nsureok print "Unsure/ok ", nunsureok print "Unsure/not ok", nunsurenok print "Sure/not ok ", nsurenok print "Unsure rate = %.2f%%" % (100.*(nunsureok + nunsurenok) / len(ham)) print "Sure fp rate = %.2f%%; Unsure fp rate = %.2f%%" % ( 100.*nsurenok / (nsurenok + nsureok), 100.*nunsurenok / (nunsurenok + nunsureok)) #========= Analyze spam print "="*70 print "SPAM:" nsureok = nunsureok = nunsurenok = nsurenok = 0 for msg in spam: zham = msg[2] / rmszham zspam = msg[3] / rmszspam cham = chance(zham) cspam = chance(zspam) if cspam > surefactor*cham and cspam > 0.01: nsureok += 1 # very certain elif cspam > cham: nunsureok += 1 #print "Unsure",msg[0] #prknown(msg[0]) else: if cham > surefactor*cspam and cham > 0.01: reason = "SURE!" 
nsurenok += 1 elif cham < 0.01 and cspam < 0.01: reason = "neither?" nunsurenok += 1 elif cham > 0.1 and cspam > 0.1: reason = "both?" nunsurenok += 1 else: reason = "Unsure" nunsurenok += 1 if reason=="SURE!" or punsure: print "FALSE NEGATIVE: zham=%.2f zspam=%.2f %s %s" % ( zham, zspam, msg[0], reason) prknown(msg[0]) print "Sure/ok ", nsureok print "Unsure/ok ", nunsureok print "Unsure/not ok", nunsurenok print "Sure/not ok ", nsurenok print "Unsure rate = %.2f%%"% (100.*(nunsureok + nunsurenok) / len(ham)) print "Sure fn rate = %.2f%%; Unsure fn rate = %.2f%%" % ( 100.*nsurenok / (nsurenok + nsureok), 100.*nunsurenok / (nunsurenok + nunsureok)) def main(): import getopt try: opts, args = getopt.getopt(sys.argv[1:], 'h') except getopt.error, msg: usage(1, msg) nbuckets = 100 for opt, arg in opts: if opt == '-h': usage(0) fname = 'clim.pik' if args: fname = args.pop(0) if args: usage(1, "No more than one positional argument allowed") readknownfalse() drive(fname) if __name__ == "__main__": main() From npickett@users.sourceforge.net Sun Oct 6 01:23:43 2002 From: npickett@users.sourceforge.net (Neale Pickett) Date: Sat, 05 Oct 2002 17:23:43 -0700 Subject: [Spambayes-checkins] spambayes README.txt,1.32,1.33 hammie.py,1.27,1.28 runtest.sh,1.5,1.6 Message-ID: Update of /cvsroot/spambayes/spambayes In directory usw-pr-cvs1:/tmp/cvs-serv20019 Modified Files: README.txt hammie.py runtest.sh Log Message: * Updated README to mention hammiesrv. * hammie now supports -r flag which makes -u show hams, not spams (thanks Alexander Leidinger). * hammie now prints summary of all -u runs instead of one summary per run (thanks Alexander Leidinger). If you liked it better the other way, please let me know. 
Index: README.txt =================================================================== RCS file: /cvsroot/spambayes/spambayes/README.txt,v retrieving revision 1.32 retrieving revision 1.33 diff -C2 -d -r1.32 -r1.33 *** README.txt 5 Oct 2002 04:22:49 -0000 1.32 --- README.txt 6 Oct 2002 00:23:40 -0000 1.33 *************** *** 74,78 **** hammie.py A spamassassin-like filter which uses tokenizer and classifier (above). ! Needs to be made faster, especially for writes. pop3proxy.py --- 74,81 ---- hammie.py A spamassassin-like filter which uses tokenizer and classifier (above). ! ! hammiesrv.py ! A first stab at making hammie into a client/server model, using ! XML-RPC. pop3proxy.py Index: hammie.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/hammie.py,v retrieving revision 1.27 retrieving revision 1.28 diff -C2 -d -r1.27 -r1.28 *** hammie.py 1 Oct 2002 15:07:45 -0000 1.27 --- hammie.py 6 Oct 2002 00:23:40 -0000 1.28 *************** *** 18,21 **** --- 18,24 ---- mbox of unknown messages. A ham/spam decision is reported for each. Can be specified more than once. + -r + reverse the meaning of the check (report ham instead of spam). + Only meaningful with the -u option. -p FILE use file as the persistent store. loads data from this file if it *************** *** 27,31 **** -f run as a filter: read a single message from stdin, add an ! %(DISPHEADER)s header, and write it to stdout. """ --- 30,35 ---- -f run as a filter: read a single message from stdin, add an ! %(DISPHEADER)s header, and write it to stdout. If you want to ! run from procmail, this is your option. """ *************** *** 314,318 **** print ! def score(hammie, msgs): """Score (judge) all messages from a mailbox.""" # XXX The reporting needs work! --- 318,322 ---- print ! def score(hammie, msgs, reverse=0): """Score (judge) all messages from a mailbox.""" # XXX The reporting needs work! 
*************** *** 323,338 **** i += 1 prob, clues = hammie.score(msg, True) - isspam = prob >= SPAM_THRESHOLD if hasattr(msg, '_mh_msgno'): msgno = msg._mh_msgno else: msgno = i if isspam: spams += 1 ! print "%6s %4.2f %1s" % (msgno, prob, isspam and "S" or "."), ! print hammie.formatclues(clues) else: hams += 1 ! print "Total %d spam, %d ham" % (spams, hams) def createbayes(pck=DEFAULTDB, usedb=False): --- 327,346 ---- i += 1 prob, clues = hammie.score(msg, True) if hasattr(msg, '_mh_msgno'): msgno = msg._mh_msgno else: msgno = i + isspam = (prob >= SPAM_THRESHOLD) if isspam: spams += 1 ! if not reverse: ! print "%6s %4.2f %1s" % (msgno, prob, isspam and "S" or "."), ! print hammie.formatclues(clues) else: hams += 1 ! if reverse: ! print "%6s %4.2f %1s" % (msgno, prob, isspam and "S" or "."), ! print hammie.formatclues(clues) ! return (spams, hams) def createbayes(pck=DEFAULTDB, usedb=False): *************** *** 366,370 **** """Main program; parse options and go.""" try: ! opts, args = getopt.getopt(sys.argv[1:], 'hdfg:s:p:u:') except getopt.error, msg: usage(2, msg) --- 374,378 ---- """Main program; parse options and go.""" try: ! opts, args = getopt.getopt(sys.argv[1:], 'hdfg:s:p:u:r') except getopt.error, msg: usage(2, msg) *************** *** 377,380 **** --- 385,389 ---- spam = [] unknown = [] + reverse = 0 do_filter = usedb = False for opt, arg in opts: *************** *** 393,396 **** --- 402,407 ---- elif opt == '-u': unknown.append(arg) + elif opt == '-r': + reverse = 1 if args: usage(2, "Positional arguments not allowed") *************** *** 424,431 **** if unknown: for u in unknown: if len(unknown) > 1: print "Scoring", u ! score(h, u) if __name__ == "__main__": --- 435,447 ---- if unknown: + (spams, hams) = (0, 0) for u in unknown: if len(unknown) > 1: print "Scoring", u ! s, h = score(h, u, reverse) ! spams += s ! hams += h ! print "Total %d spam, %d ham" % (spams, hams) ! 
if __name__ == "__main__": Index: runtest.sh =================================================================== RCS file: /cvsroot/spambayes/spambayes/runtest.sh,v retrieving revision 1.5 retrieving revision 1.6 diff -C2 -d -r1.5 -r1.6 *** runtest.sh 1 Oct 2002 17:53:54 -0000 1.5 --- runtest.sh 6 Oct 2002 00:23:40 -0000 1.6 *************** *** 25,40 **** # Number of messages per rebalanced set ! RNUM=200 # Number of sets ! SETS=5 if [ -n "$REBAL" ]; then # Put them all into reservoirs ! python rebal.py -r Data/Ham/reservoir -s Data/Ham/Set -n 0 -Q ! python rebal.py -r Data/Spam/reservoir -s Data/Spam/Set -n 0 -Q # Rebalance ! python rebal.py -r Data/Ham/reservoir -s Data/Ham/Set -n $RNUM -Q ! python rebal.py -r Data/Spam/reservoir -s Data/Spam/Set -n $RNUM -Q fi --- 25,40 ---- # Number of messages per rebalanced set ! RNUM=${REBAL_RNUM:-200} # Number of sets ! SETS=${REBAL_SETS:-5} if [ -n "$REBAL" ]; then # Put them all into reservoirs ! python rebal.py -r Data/Ham/reservoir -s Data/Ham/Set -n 0 -q ! python rebal.py -r Data/Spam/reservoir -s Data/Spam/Set -n 0 -q # Rebalance ! python rebal.py -r Data/Ham/reservoir -s Data/Ham/Set -n $RNUM -q -Q ! python rebal.py -r Data/Spam/reservoir -s Data/Spam/Set -n $RNUM -q -Q fi *************** *** 49,52 **** --- 49,56 ---- python cmp.py run1s run2s | tee results.txt + ;; + *) + echo "Available targets:" + sed -n 's/^\( [a-z|]*\))$/\1/p' $0 ;; esac From tim_one@users.sourceforge.net Sun Oct 6 06:06:28 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Sat, 05 Oct 2002 22:06:28 -0700 Subject: [Spambayes-checkins] spambayes classifier.py,1.31,1.32 Message-ID: Update of /cvsroot/spambayes/spambayes In directory usw-pr-cvs1:/tmp/cvs-serv10691 Modified Files: classifier.py Log Message: update_probabilities(): Recorded a comment about an equivalent way of writing Gary's probability adjustment that I find much easier to picture. 
Index: classifier.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/classifier.py,v retrieving revision 1.31 retrieving revision 1.32 diff -C2 -d -r1.31 -r1.32 *** classifier.py 5 Oct 2002 21:30:55 -0000 1.31 --- classifier.py 6 Oct 2002 05:06:26 -0000 1.32 *************** *** 253,256 **** --- 253,266 ---- # f(w) = -------------- # s + n + # + # I find this easier to reason about like so (equivalent when + # s != 0): + # + # x - p + # p + ------- + # 1 + n/s + # + # IOW, it moves p a fraction of the distance from p to x, and + # less so the larger n is, or the smaller s is. n = hamcount + spamcount From tim_one@users.sourceforge.net Sun Oct 6 06:11:43 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Sat, 05 Oct 2002 22:11:43 -0700 Subject: [Spambayes-checkins] spambayes README.txt,1.33,1.34 Message-ID: Update of /cvsroot/spambayes/spambayes In directory usw-pr-cvs1:/tmp/cvs-serv11758 Modified Files: README.txt Log Message: Added a blurb about Rob Hooft's rmspik.py. Index: README.txt =================================================================== RCS file: /cvsroot/spambayes/spambayes/README.txt,v retrieving revision 1.33 retrieving revision 1.34 diff -C2 -d -r1.33 -r1.34 *** README.txt 6 Oct 2002 00:23:40 -0000 1.33 --- README.txt 6 Oct 2002 05:11:41 -0000 1.34 *************** *** 189,192 **** --- 189,200 ---- histograms from them. + rmspik.py + A program that analyzes a clgen-produced pickle, and tells you what + would happen if we had used Rob Hooft's "RMS ZScore" scheme for + deciding certainty instead. + CAUTION: This doesn't work as intended for plain use_central_limit. + The chance() function seems to make an assumption that's true + only under use_central_limit2 and use_central_limit3. 
+ Standard Test Data Setup From tim_one@users.sourceforge.net Sun Oct 6 06:24:12 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Sat, 05 Oct 2002 22:24:12 -0700 Subject: [Spambayes-checkins] spambayes rmspik.py,1.1,1.2 Message-ID: Update of /cvsroot/spambayes/spambayes In directory usw-pr-cvs1:/tmp/cvs-serv13405 Modified Files: rmspik.py Log Message: The module docstring makes some sense now. Added horizontal whitespace to overly busy expressions. Added XXX comment about chance()'s problems with the original use_central_limit. Slashed the number of int->float conversions needed by chance(). Index: rmspik.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/rmspik.py,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** rmspik.py 5 Oct 2002 23:45:59 -0000 1.1 --- rmspik.py 6 Oct 2002 05:24:10 -0000 1.2 *************** *** 5,39 **** """Usage: %(program)s [options] [central_limit_pickle_file] ! An example analysis program showing to access info from a central-limit ! pickle file created by clgen.py. This program produces histograms of ! various things. ! ! Scores for all predictions are saved at the end of binary pickle clim.pik. ! This contains two lists of tuples, the first list with a tuple for every ! ham predicted, the second list with a tuple for every spam predicted. Each ! tuple has these values: ! ! tag the msg identifier ! is_spam True if msg came from a spam Set, False if from a ham Set ! zham the msg zscore relative to the population ham ! zspam the msg zscore relative to the population spam ! hmean the raw mean ham score ! smean the raw mean spam score ! n the number of clues used to judge this msg ! ! Note that hmean and smean are the same under use_central_limit; they're ! very likely to differ under use_central_limit2. ! ! Where: -h Show usage and exit. If no file is named on the cmdline, clim.pik is used. """ ! 
surefactor = 1000 # This is basically the inverse of the accepted fp/fn rate ! punsure = False # Print unsure decisions (otherwise only sure-but-false) ! import sys,math,os import cPickle as pickle --- 5,23 ---- """Usage: %(program)s [options] [central_limit_pickle_file] ! Options -h Show usage and exit. + Analyzes a pickle produced by clgen.py, and displays what would happen + if Rob Hooft's "RMS ZScore" scheme had been used to determine certainty + instead. + If no file is named on the cmdline, clim.pik is used. """ ! surefactor = 1000 # This is basically the inverse of the accepted fp/fn rate ! punsure = False # Print unsure decisions (otherwise only sure-but-false) ! import sys, math, os import cPickle as pickle *************** *** 49,62 **** def chance(x): ! if x>=0: return 1.0 ! x=-x/math.sqrt(2) ! if x<1.4: return 1.0 ! assert x>=1.4 ! x=float(x) ! pre=math.exp(-x**2)/math.sqrt(math.pi)/x ! post=1-(1/(2*x**2)) ! return pre*post knownfalse = {} --- 33,47 ---- def chance(x): ! # XXX These 3 lines are a disaster for spam using the original ! # use_central_limit. Replacing with x = abs(x)/sqrt(2) works ! # very well then. ! if x >= 0: return 1.0 ! x = -x / math.sqrt(2.0) ! if x < 1.4: return 1.0 ! pre = math.exp(-x**2) / math.sqrt(math.pi) / x ! post = 1.0 - (1.0 / (2.0 * x**2)) ! 
return pre * post knownfalse = {} *************** *** 79,82 **** --- 64,77 ---- if bn in knownfalse: print " ==>", knownfalse[bn] + + # Pickle tuple contents: + # + # 0 tag the msg identifier + # 1 is_spam True if msg came from a spam Set, False if from a ham Set + # 2 zham the msg zscore relative to the population ham + # 3 zspam the msg zscore relative to the population spam + # 4 hmean the raw mean ham score + # 5 smean the raw mean spam score + # 6 n the number of clues used to judge this msg def drive(fname): From npickett@users.sourceforge.net Sun Oct 6 07:47:38 2002 From: npickett@users.sourceforge.net (Neale Pickett) Date: Sat, 05 Oct 2002 23:47:38 -0700 Subject: [Spambayes-checkins] spambayes runtest.sh,1.6,1.7 Message-ID: Update of /cvsroot/spambayes/spambayes In directory usw-pr-cvs1:/tmp/cvs-serv29449 Modified Files: runtest.sh Log Message: * Adding support for timtest.py (integrates patch 618928) Index: runtest.sh =================================================================== RCS file: /cvsroot/spambayes/spambayes/runtest.sh,v retrieving revision 1.6 retrieving revision 1.7 diff -C2 -d -r1.6 -r1.7 *** runtest.sh 6 Oct 2002 00:23:40 -0000 1.6 --- runtest.sh 6 Oct 2002 06:47:36 -0000 1.7 *************** *** 10,15 **** ## Just set up your messages as detailed in README.txt; put them all in ## the reservoir directories, and this script will take care of the ! ## rest. Paste the output (also in results.txt) to the mailing list for ! ## good karma. ## ## Neale Pickett --- 10,14 ---- ## Just set up your messages as detailed in README.txt; put them all in ## the reservoir directories, and this script will take care of the ! ## rest. Paste the output to the mailing list for good karma. ## ## Neale Pickett *************** *** 22,26 **** # Which test to run ! TEST=${1:-robinson1} # Number of messages per rebalanced set --- 21,25 ---- # Which test to run ! TEST=${1:-run2} # Number of messages per rebalanced set *************** *** 40,48 **** case "$TEST" in ! 
run1) ! python timcv.py -n $SETS > run1.txt ;; ! run2|useold) ! python timcv.py -n $SETS > run2.txt python rates.py run1 run2 > runrates.txt --- 39,53 ---- case "$TEST" in ! test1) ! python timtest.py -n $SETS > test1.txt ;; ! test2) ! python timtest.py -n $SETS > test2.txt ! ;; ! timcv1|cv1) ! python timcv.py -n $SETS > cv1.txt ! ;; ! timcv2|cv2) ! python timcv.py -n $SETS > cv2.txt python rates.py run1 run2 > runrates.txt *************** *** 52,56 **** *) echo "Available targets:" ! sed -n 's/^\( [a-z|]*\))$/\1/p' $0 ;; esac --- 57,61 ---- *) echo "Available targets:" ! sed -n 's/^\( [a-z0-9|]*\))$/\1/p' $0 ;; esac From npickett@users.sourceforge.net Sun Oct 6 07:50:57 2002 From: npickett@users.sourceforge.net (Neale Pickett) Date: Sat, 05 Oct 2002 23:50:57 -0700 Subject: [Spambayes-checkins] spambayes runtest.sh,1.7,1.8 Message-ID: Update of /cvsroot/spambayes/spambayes In directory usw-pr-cvs1:/tmp/cvs-serv30095 Modified Files: runtest.sh Log Message: * Gah, name change didn't make it to all targets. Should be consistent for cv runs now. Index: runtest.sh =================================================================== RCS file: /cvsroot/spambayes/spambayes/runtest.sh,v retrieving revision 1.7 retrieving revision 1.8 diff -C2 -d -r1.7 -r1.8 *** runtest.sh 6 Oct 2002 06:47:36 -0000 1.7 --- runtest.sh 6 Oct 2002 06:50:55 -0000 1.8 *************** *** 51,57 **** python timcv.py -n $SETS > cv2.txt ! python rates.py run1 run2 > runrates.txt ! python cmp.py run1s run2s | tee results.txt ;; *) --- 51,57 ---- python timcv.py -n $SETS > cv2.txt ! python rates.py cv1 cv2 > runrates.txt ! python cmp.py cv1s cv2s | tee results.txt ;; *) From hooft@users.sourceforge.net Sun Oct 6 19:28:17 2002 From: hooft@users.sourceforge.net (Rob W.W. 
Hooft) Date: Sun, 06 Oct 2002 11:28:17 -0700 Subject: [Spambayes-checkins] spambayes rmspik.py,1.2,1.3 Message-ID: Update of /cvsroot/spambayes/spambayes In directory usw-pr-cvs1:/tmp/cvs-serv16970 Modified Files: rmspik.py Log Message: - new version of chance() function (more accurate, which made a difference) - more tunable parameters. Index: rmspik.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/rmspik.py,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** rmspik.py 6 Oct 2002 05:24:10 -0000 1.2 --- rmspik.py 6 Oct 2002 18:28:15 -0000 1.3 *************** *** 16,21 **** """ ! surefactor = 1000 # This is basically the inverse of the accepted fp/fn rate ! punsure = False # Print unsure decisions (otherwise only sure-but-false) import sys, math, os --- 16,61 ---- """ ! ! # surefactor: the ratio of the two p's to decide we're sure a message ! # belongs to one of the two populations. raising this number increases ! # the "unsures" on both sides, decreasing the "sure fp" and "sure fn" ! # rates. A value of 1000 works well for me; at 10000 you get slightly ! # less sure fp/fn at a cost of a lot more middle ground; at 10 you have ! # much less work on the middle ground but ~50% more "sure false" ! # scores. This variable operates on messages that are "a bit of both ! # ham and spam" ! surefactor = 100 ! ! # pminhamsure: The minimal pham at which we say it's surely ham ! # lowering this value gives less "unsure ham" and more "sure ham"; it ! # might however result in more "sure fn" 0.01 works well, but to accept ! # a bit more fn, I set it to 0.005. This variable operates on messages ! # that are "neither ham nor spam; but a bit more ham than spam" ! pminhamsure = 0.005 ! ! # pminspamsure: The minimal pspam at which we say it's surely spam ! # lowering this value gives less "unsure spam" and more "sure spam"; it ! # might however result in more "sure fp" Since most people find fp ! 
# worse than fn, this value should most probably be higher than ! # pminhamsure. 0.01 works well, but to accept a bit less fp, I set it ! # to 0.02. This variable operates on messages that are "neither ham ! # nor spam; but a bit more spam than ham" ! pminspamsure = 0.02 ! ! ! # usetail: if False, use complete distributions to renormalize the ! # Z-scores; if True, use only the worst tail value. I get worse results ! # if I set this to True, so the default is False. ! usetail = False ! ! # medianoffset: If True, set the median of the zham and zspam to 0 ! # before calculating rmsZ. If False, do not shift the data and hence ! # assume that 0 is the center of the population. True seems to help for ! # my data. ! medianoffset = True ! ! punsure = False # Print unsure decisions (otherwise only sure-but-false) ! exthist=0 # Prepare files to make histograms of values using an ! # external program import sys, math, os *************** *** 23,26 **** --- 63,68 ---- program = sys.argv[0] + HAMVAL=2 + SPAMVAL=3 def usage(code, msg=''): *************** *** 33,51 **** def chance(x): ! # XXX These 3 lines are a disaster for spam using the original ! # use_central_limit. Replacing with x = abs(x)/sqrt(2) works ! # very well then. ! if x >= 0: ! return 1.0 ! x = -x / math.sqrt(2.0) ! if x < 1.4: ! return 1.0 ! pre = math.exp(-x**2) / math.sqrt(math.pi) / x ! post = 1.0 - (1.0 / (2.0 * x**2)) ! return pre * post ! knownfalse = {} def readknownfalse(): global knownfalse knownfalse = {} --- 75,97 ---- def chance(x): ! x=abs(x) ! if x<0.5: ! return 1 ! p=-0.5*math.log(2*math.pi)-0.5*x**2-math.log(x)+math.log(1-(x**-2)+3*(x**-4)) ! return min(1.0,math.exp(p)) ! def Z(p): # Reverse of chance ! x=math.log(p) ! z=math.sqrt(-2.0*x-math.log(2*math.pi)) ! for n in range(8): ! errfac=chance(z)/p ! z=z+0.5*math.log(errfac) ! 
return z + knownfalse = {} def readknownfalse(): + """Read a file named "knownfalse.dat" with the basename of the + file as the first word on a line, and a short description + following it.""" global knownfalse knownfalse = {} *************** *** 81,109 **** spam = pickle.load(f) f.close() ! ! zhamsum2 = 0.0 nham = 0 for msg in ham: ! if msg[1]: ! print "spam in ham",msg ! else: ! zhamsum2 += msg[2]**2 ! nham += 1 ! rmszham = math.sqrt(zhamsum2 / nham) print "Nham=", nham ! print "RmsZham=", rmszham ! ! zspamsum2 = 0.0 nspam = 0 for msg in spam: ! if not msg[1]: ! print "ham in spam",msg ! else: ! zspamsum2 += msg[3]**2 ! nspam += 1 ! rmszspam = math.sqrt(zspamsum2 / nspam) print "Nspam=", nspam ! print "RmsZspam=", rmszspam ! #========= Analyze ham print "=" * 70 --- 127,191 ---- spam = pickle.load(f) f.close() ! if exthist: ! fham=open('ham.dat','w') ! fspam=open('spam.dat','w') nham = 0 + hamham=[] for msg in ham: ! assert not msg[1] ! if exthist: ! print >> fham, "%.2f %.2f %.2f %.2f"%msg[2:6] ! hamham.append(msg[HAMVAL]) ! nham += 1 print "Nham=", nham ! hamham.sort() ! if medianoffset: ! hammedian=hamham[nham/2] ! else: ! hammedian=0.0 ! if usetail: ! hamham.sort() ! fac = Z(10./nham) ! z1 = -(hamham[10]-hammedian)/fac ! z99 = (hamham[-10]-hammedian)/fac ! print "rmsZlo, rmsZhi= %.2f %.2f"%(z1,z99) ! rmszham = max(z1,z99) ! else: ! zhamsum2 = 0.0 ! for msg in ham: ! zhamsum2 += (msg[HAMVAL]-hammedian)**2 ! rmszham = math.sqrt(zhamsum2 / nham) ! print "RmsZham=", rmszham ! nspam = 0 + spamspam=[] for msg in spam: ! assert msg[1] ! if exthist: ! print >> fspam, "%.2f %.2f %.2f %.2f"%msg[2:6] ! spamspam.append(msg[SPAMVAL]) ! nspam += 1 print "Nspam=", nspam ! spamspam.sort() ! if medianoffset: ! spammedian=spamspam[nspam/2] ! else: ! spammedian=0.0 ! if usetail: ! fac=Z(10./nspam) ! z1=-(spamspam[10]-spammedian)/fac ! z99=(spamspam[-10]-spammedian)/fac ! print "rmsZlo, rmsZhi= %.2f %.2f"%(z1,z99) ! rmszspam = max(z1,z99) ! else: ! zspamsum2 = 0.0 ! 
for msg in spam: ! zspamsum2 += (msg[SPAMVAL]-spammedian)**2 ! rmszspam = math.sqrt(zspamsum2 / nspam) ! print "RmsZspam=", rmszspam ! ! if exthist: ! fham.close() ! fspam.close() #========= Analyze ham print "=" * 70 *************** *** 111,119 **** nsureok = nunsureok = nunsurenok = nsurenok = 0 for msg in ham: ! zham = msg[2] / rmszham ! zspam = msg[3] / rmszspam cham = chance(zham) cspam = chance(zspam) ! if cham > surefactor*cspam and cham > 0.01: nsureok += 1 # very certain elif cham > cspam: --- 193,201 ---- nsureok = nunsureok = nunsurenok = nsurenok = 0 for msg in ham: ! zham = (msg[HAMVAL]-hammedian) / rmszham ! zspam = (msg[SPAMVAL]-spammedian) / rmszspam cham = chance(zham) cspam = chance(zspam) ! if cham > surefactor*cspam and cham > pminhamsure: nsureok += 1 # very certain elif cham > cspam: *************** *** 122,126 **** #prknown(msg[0]) else: ! if cspam > surefactor*cham and cspam > 0.01: reason = "SURE!" nsurenok += 1 --- 204,208 ---- #prknown(msg[0]) else: ! if cspam > surefactor*cham and cspam > pminspamsure: reason = "SURE!" nsurenok += 1 *************** *** 151,159 **** nsureok = nunsureok = nunsurenok = nsurenok = 0 for msg in spam: ! zham = msg[2] / rmszham ! zspam = msg[3] / rmszspam cham = chance(zham) cspam = chance(zspam) ! if cspam > surefactor*cham and cspam > 0.01: nsureok += 1 # very certain elif cspam > cham: --- 233,241 ---- nsureok = nunsureok = nunsurenok = nsurenok = 0 for msg in spam: ! zham = (msg[HAMVAL]-hammedian) / rmszham ! zspam = (msg[SPAMVAL]-spammedian) / rmszspam cham = chance(zham) cspam = chance(zspam) ! if cspam > surefactor*cham and cspam > pminspamsure: nsureok += 1 # very certain elif cspam > cham: *************** *** 162,166 **** #prknown(msg[0]) else: ! if cham > surefactor*cspam and cham > 0.01: reason = "SURE!" nsurenok += 1 --- 244,248 ---- #prknown(msg[0]) else: ! if cham > surefactor*cspam and cham > pminhamsure: reason = "SURE!" 
nsurenok += 1 From tim_one@users.sourceforge.net Mon Oct 7 00:07:08 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Sun, 06 Oct 2002 16:07:08 -0700 Subject: [Spambayes-checkins] spambayes rmspik.py,1.3,1.4 Message-ID: Update of /cvsroot/spambayes/spambayes In directory usw-pr-cvs1:/tmp/cvs-serv2898 Modified Files: rmspik.py Log Message: Style edits, mostly whitespace issues. Index: rmspik.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/rmspik.py,v retrieving revision 1.3 retrieving revision 1.4 diff -C2 -d -r1.3 -r1.4 *** rmspik.py 6 Oct 2002 18:28:15 -0000 1.3 --- rmspik.py 6 Oct 2002 23:07:06 -0000 1.4 *************** *** 25,29 **** # scores. This variable operates on messages that are "a bit of both # ham and spam" ! surefactor = 100 # pminhamsure: The minimal pham at which we say it's surely ham --- 25,29 ---- # scores. This variable operates on messages that are "a bit of both # ham and spam" ! surefactor = 100 # pminhamsure: The minimal pham at which we say it's surely ham *************** *** 32,36 **** # a bit more fn, I set it to 0.005. This variable operates on messages # that are "neither ham nor spam; but a bit more ham than spam" ! pminhamsure = 0.005 # pminspamsure: The minimal pspam at which we say it's surely spam --- 32,36 ---- # a bit more fn, I set it to 0.005. This variable operates on messages # that are "neither ham nor spam; but a bit more ham than spam" ! pminhamsure = 0.005 # pminspamsure: The minimal pspam at which we say it's surely spam *************** *** 41,51 **** # to 0.02. This variable operates on messages that are "neither ham # nor spam; but a bit more spam than ham" ! pminspamsure = 0.02 ! ! # usetail: if False, use complete distributions to renormalize the # Z-scores; if True, use only the worst tail value. I get worse results # if I set this to True, so the default is False. ! 
usetail = False # medianoffset: If True, set the median of the zham and zspam to 0 --- 41,51 ---- # to 0.02. This variable operates on messages that are "neither ham # nor spam; but a bit more spam than ham" ! pminspamsure = 0.02 ! ! # usetail: if False, use complete distributions to renormalize the # Z-scores; if True, use only the worst tail value. I get worse results # if I set this to True, so the default is False. ! usetail = False # medianoffset: If True, set the median of the zham and zspam to 0 *************** *** 53,61 **** # assume that 0 is the center of the population. True seems to help for # my data. ! medianoffset = True ! ! punsure = False # Print unsure decisions (otherwise only sure-but-false) ! exthist=0 # Prepare files to make histograms of values using an ! # external program import sys, math, os --- 53,64 ---- # assume that 0 is the center of the population. True seems to help for # my data. ! medianoffset = True ! ! # Print unsure decisions (otherwise only sure-but-false). ! punsure = False ! ! # Prepare files to make histograms of values using an ! # external program. ! exthist = False import sys, math, os *************** *** 63,68 **** program = sys.argv[0] ! HAMVAL=2 ! SPAMVAL=3 def usage(code, msg=''): --- 66,81 ---- program = sys.argv[0] ! ! # Pickle tuple contents: ! # ! # 0 tag the msg identifier ! # 1 is_spam True if msg came from a spam Set, False if from a ham Set ! # 2 zham the msg zscore relative to the population ham ! # 3 zspam the msg zscore relative to the population spam ! # 4 hmean the raw mean ham score ! # 5 smean the raw mean spam score ! # 6 n the number of clues used to judge this msg ! HAMVAL = 2 ! SPAMVAL = 3 def usage(code, msg=''): *************** *** 75,90 **** def chance(x): ! x=abs(x) ! if x<0.5: return 1 ! p=-0.5*math.log(2*math.pi)-0.5*x**2-math.log(x)+math.log(1-(x**-2)+3*(x**-4)) ! return min(1.0,math.exp(p)) def Z(p): # Reverse of chance ! x=math.log(p) ! 
z=math.sqrt(-2.0*x-math.log(2*math.pi)) for n in range(8): ! errfac=chance(z)/p ! z=z+0.5*math.log(errfac) return z --- 88,106 ---- def chance(x): ! x = abs(x) ! if x < 0.5: return 1 ! p = (-0.5 * math.log(2*math.pi) - ! 0.5 * x**2 - ! math.log(x) + ! math.log(1 - x**-2 + 3*x**-4)) ! return min(1.0, math.exp(p)) def Z(p): # Reverse of chance ! x = math.log(p) ! z = math.sqrt(-2.0*x - math.log(2*math.pi)) for n in range(8): ! errfac = chance(z)/p ! z = z + 0.5*math.log(errfac) return z *************** *** 111,123 **** print " ==>", knownfalse[bn] - # Pickle tuple contents: - # - # 0 tag the msg identifier - # 1 is_spam True if msg came from a spam Set, False if from a ham Set - # 2 zham the msg zscore relative to the population ham - # 3 zspam the msg zscore relative to the population spam - # 4 hmean the raw mean ham score - # 5 smean the raw mean spam score - # 6 n the number of clues used to judge this msg def drive(fname): --- 127,130 ---- *************** *** 131,139 **** fspam=open('spam.dat','w') nham = 0 ! hamham=[] for msg in ham: assert not msg[1] if exthist: ! print >> fham, "%.2f %.2f %.2f %.2f"%msg[2:6] hamham.append(msg[HAMVAL]) nham += 1 --- 138,146 ---- fspam=open('spam.dat','w') nham = 0 ! hamham = [] for msg in ham: assert not msg[1] if exthist: ! print >> fham, "%.2f %.2f %.2f %.2f" % msg[2:6] hamham.append(msg[HAMVAL]) nham += 1 *************** *** 141,167 **** hamham.sort() if medianoffset: ! hammedian=hamham[nham/2] else: ! hammedian=0.0 if usetail: hamham.sort() fac = Z(10./nham) ! z1 = -(hamham[10]-hammedian)/fac ! z99 = (hamham[-10]-hammedian)/fac ! print "rmsZlo, rmsZhi= %.2f %.2f"%(z1,z99) ! rmszham = max(z1,z99) else: zhamsum2 = 0.0 for msg in ham: ! zhamsum2 += (msg[HAMVAL]-hammedian)**2 rmszham = math.sqrt(zhamsum2 / nham) print "RmsZham=", rmszham ! nspam = 0 ! spamspam=[] for msg in spam: assert msg[1] if exthist: ! 
print >> fspam, "%.2f %.2f %.2f %.2f"%msg[2:6] spamspam.append(msg[SPAMVAL]) nspam += 1 --- 148,174 ---- hamham.sort() if medianoffset: ! hammedian = hamham[nham // 2] else: ! hammedian = 0.0 if usetail: hamham.sort() fac = Z(10./nham) ! z1 = -(hamham[10] - hammedian) / fac ! z99 = (hamham[-10] - hammedian) / fac ! print "rmsZlo, rmsZhi= %.2f %.2f" % (z1, z99) ! rmszham = max(z1, z99) else: zhamsum2 = 0.0 for msg in ham: ! zhamsum2 += (msg[HAMVAL] - hammedian)**2 rmszham = math.sqrt(zhamsum2 / nham) print "RmsZham=", rmszham ! nspam = 0 ! spamspam= [] for msg in spam: assert msg[1] if exthist: ! print >> fspam, "%.2f %.2f %.2f %.2f" % msg[2:6] spamspam.append(msg[SPAMVAL]) nspam += 1 *************** *** 169,181 **** spamspam.sort() if medianoffset: ! spammedian=spamspam[nspam/2] else: ! spammedian=0.0 if usetail: ! fac=Z(10./nspam) ! z1=-(spamspam[10]-spammedian)/fac ! z99=(spamspam[-10]-spammedian)/fac ! print "rmsZlo, rmsZhi= %.2f %.2f"%(z1,z99) ! rmszspam = max(z1,z99) else: zspamsum2 = 0.0 --- 176,188 ---- spamspam.sort() if medianoffset: ! spammedian = spamspam[nspam/2] else: ! spammedian = 0.0 if usetail: ! fac = Z(10./nspam) ! z1 = -(spamspam[10] - spammedian) / fac ! z99 = (spamspam[-10] - spammedian) / fac ! print "rmsZlo, rmsZhi= %.2f %.2f" % (z1, z99) ! rmszspam = max(z1, z99) else: zspamsum2 = 0.0 *************** *** 184,188 **** rmszspam = math.sqrt(zspamsum2 / nspam) print "RmsZspam=", rmszspam ! if exthist: fham.close() --- 191,195 ---- rmszspam = math.sqrt(zspamsum2 / nspam) print "RmsZspam=", rmszspam ! if exthist: fham.close() *************** *** 229,233 **** 100.*nunsurenok / (nunsurenok + nunsureok)) #========= Analyze spam ! print "="*70 print "SPAM:" nsureok = nunsureok = nunsurenok = nsurenok = 0 --- 236,240 ---- 100.*nunsurenok / (nunsurenok + nunsureok)) #========= Analyze spam ! 
print "=" * 70 print "SPAM:" nsureok = nunsureok = nunsurenok = nsurenok = 0 From tim_one@users.sourceforge.net Mon Oct 7 00:07:26 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Sun, 06 Oct 2002 16:07:26 -0700 Subject: [Spambayes-checkins] spambayes hammie.py,1.28,1.29 Message-ID: Update of /cvsroot/spambayes/spambayes In directory usw-pr-cvs1:/tmp/cvs-serv3135 Modified Files: hammie.py Log Message: Whitespace normalization. Index: hammie.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/hammie.py,v retrieving revision 1.28 retrieving revision 1.29 diff -C2 -d -r1.28 -r1.29 *** hammie.py 6 Oct 2002 00:23:40 -0000 1.28 --- hammie.py 6 Oct 2002 23:07:23 -0000 1.29 *************** *** 443,447 **** hams += h print "Total %d spam, %d ham" % (spams, hams) ! if __name__ == "__main__": --- 443,447 ---- hams += h print "Total %d spam, %d ham" % (spams, hams) ! if __name__ == "__main__": From tim_one@users.sourceforge.net Mon Oct 7 05:36:56 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Sun, 06 Oct 2002 21:36:56 -0700 Subject: [Spambayes-checkins] spambayes classifier.py,1.32,1.33 Message-ID: Update of /cvsroot/spambayes/spambayes In directory usw-pr-cvs1:/tmp/cvs-serv12834 Modified Files: classifier.py Log Message: central_limit_compute_population_stats()/ central_limit_compute_population_stats2()/ central_limit_compute_population_stats3(): Moved common code into new helper method _add_popstats(). Also arranged to add the new stats to any stats that may already exist. This allows a form of incremental training, so long as you never try to forget a msg after one of the compute_population_stats() methods is called. It's unknown whether it will be a *useful* form of incremental training -- it's cheating, but there is no non-cheating way short of retraining on every msg ever trained on. 
Index: classifier.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/classifier.py,v retrieving revision 1.32 retrieving revision 1.33 diff -C2 -d -r1.32 -r1.33 *** classifier.py 6 Oct 2002 05:06:26 -0000 1.32 --- classifier.py 7 Oct 2002 04:36:54 -0000 1.33 *************** *** 385,388 **** --- 385,411 ---- # to only one of the alternatives surviving. + def _add_popstats(self, sum, sumsq, n, is_spam): + from math import ldexp + + if is_spam: + sum += self.spamsum + sumsq += self.spamsumsq + n += self.spamn + self.spamsum, self.spamsumsq, self.spamn = sum, sumsq, n + else: + sum += self.hamsum + sumsq += self.hamsumsq + n += self.hamn + self.hamsum, self.hamsumsq, self.hamn = sum, sumsq, n + + mean = ldexp(sum, -64) / n + var = sumsq * n - sum**2 + var = ldexp(var, -128) / n**2 + + if is_spam: + self.spammean, self.spamvar = mean, var + else: + self.hammean, self.hamvar = mean, var + def central_limit_compute_population_stats(self, msgstream, is_spam): from math import ldexp *************** *** 398,417 **** sum += prob sumsq += prob * prob - n = len(seen) ! if is_spam: ! self.spamn, self.spamsum, self.spamsumsq = n, sum, sumsq ! spamsum = self.spamsum ! self.spammean = ldexp(spamsum, -64) / self.spamn ! spamvar = self.spamsumsq * self.spamn - spamsum**2 ! self.spamvar = ldexp(spamvar, -128) / (self.spamn ** 2) ! print 'spammean', self.spammean, 'spamvar', self.spamvar ! else: ! self.hamn, self.hamsum, self.hamsumsq = n, sum, sumsq ! hamsum = self.hamsum ! self.hammean = ldexp(hamsum, -64) / self.hamn ! hamvar = self.hamsumsq * self.hamn - hamsum**2 ! self.hamvar = ldexp(hamvar, -128) / (self.hamn ** 2) ! print 'hammean', self.hammean, 'hamvar', self.hamvar if options.use_central_limit: --- 421,426 ---- sum += prob sumsq += prob * prob ! self._add_popstats(sum, sumsq, len(seen), is_spam) if options.use_central_limit: *************** *** 499,515 **** sumsq += prob * prob ! n = len(seen) ! 
mean = ldexp(sum, -64) / n ! var = sumsq * n - sum**2 ! var = ldexp(var, -128) / n**2 ! ! if is_spam: ! self.spamn, self.spamsum, self.spamsumsq = n, sum, sumsq ! self.spammean, self.spamvar = mean, var ! print 'spammean', self.spammean, 'spamvar', self.spamvar ! else: ! self.hamn, self.hamsum, self.hamsumsq = n, sum, sumsq ! self.hammean, self.hamvar = mean, var ! print 'hammean', self.hammean, 'hamvar', self.hamvar if options.use_central_limit2: --- 508,512 ---- sumsq += prob * prob ! self._add_popstats(sum, sumsq, len(seen), is_spam) if options.use_central_limit2: *************** *** 598,613 **** sumsq += mean * mean ! mean = ldexp(sum, -64) / n ! var = sumsq * n - sum**2 ! var = ldexp(var, -128) / n**2 ! ! if is_spam: ! self.spamn, self.spamsum, self.spamsumsq = n, sum, sumsq ! self.spammean, self.spamvar = mean, var ! print 'spammean', self.spammean, 'spamvar', self.spamvar ! else: ! self.hamn, self.hamsum, self.hamsumsq = n, sum, sumsq ! self.hammean, self.hamvar = mean, var ! print 'hammean', self.hammean, 'hamvar', self.hamvar if options.use_central_limit3: --- 595,599 ---- sumsq += mean * mean ! self._add_popstats(sum, sumsq, n, is_spam) if options.use_central_limit3: From tim_one@users.sourceforge.net Mon Oct 7 05:42:00 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Sun, 06 Oct 2002 21:42:00 -0700 Subject: [Spambayes-checkins] spambayes TestDriver.py,1.19,1.20 Message-ID: Update of /cvsroot/spambayes/spambayes In directory usw-pr-cvs1:/tmp/cvs-serv17051 Modified Files: TestDriver.py Log Message: Updated comments to explain what's wrong with the new "incremental training" ability for the central-limit schemes. 
Index: TestDriver.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/TestDriver.py,v retrieving revision 1.19 retrieving revision 1.20 diff -C2 -d -r1.19 -r1.20 *** TestDriver.py 4 Oct 2002 02:29:20 -0000 1.19 --- TestDriver.py 7 Oct 2002 04:41:58 -0000 1.20 *************** *** 111,116 **** self.trained_spam_hist = Hist() ! # CAUTION: this just doesn't work for incrememental training when ! # options.use_central_limit is in effect. def train(self, ham, spam): print "-> Training on", ham, "&", spam, "...", --- 111,122 ---- self.trained_spam_hist = Hist() ! # CAUTION: When options.use_central_limit{,2,3} is in effect, this ! # adds the new population statistics to the existing population statistics ! # (if any), but the existing population statistics are no longer correct ! # due to the new data we just added (which can change spamprobs, and ! # even the *set* of extreme words). There's no thoroughly correct way ! # to repair this short of recomputing the population statistics for ! # every msg *ever* trained on. It's currently unknown how badly this ! # cheat may affect results. def train(self, ham, spam): print "-> Training on", ham, "&", spam, "...", *************** *** 122,127 **** c.compute_population_stats(spam, True) ! # CAUTION: this just doesn't work for incrememental training when ! # options.use_central_limit is in effect. def untrain(self, ham, spam): print "-> Forgetting", ham, "&", spam, "...", --- 128,133 ---- c.compute_population_stats(spam, True) ! # CAUTION: this doesn't work at all for incrememental training when ! # options.use_central_limit{,2,3} is in effect. 
def untrain(self, ham, spam): print "-> Forgetting", ham, "&", spam, "...", From npickett@users.sourceforge.net Tue Oct 8 18:38:23 2002 From: npickett@users.sourceforge.net (Neale Pickett) Date: Tue, 08 Oct 2002 10:38:23 -0700 Subject: [Spambayes-checkins] spambayes hammiesrv.py,1.4,1.5 Message-ID: Update of /cvsroot/spambayes/spambayes In directory usw-pr-cvs1:/tmp/cvs-serv25401 Modified Files: hammiesrv.py Log Message: * hammiesrv can now handle xmlrpclib.Binary objects, so you can send it 8-bit messages. Index: hammiesrv.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/hammiesrv.py,v retrieving revision 1.4 retrieving revision 1.5 diff -C2 -d -r1.4 -r1.5 *** hammiesrv.py 27 Sep 2002 21:04:06 -0000 1.4 --- hammiesrv.py 8 Oct 2002 17:38:21 -0000 1.5 *************** *** 35,38 **** --- 35,54 ---- DEFAULTDB = hammie.DEFAULTDB + class XMLHammie(hammie.Hammie): + def score(self, msg, **kwargs): + try: + msg = msg.data + except AttributeError: + pass + return hammie.Hammie.score(self, msg, **kwargs) + + def filter(self, msg, **kwargs): + try: + msg = msg.data + except AttributeError: + pass + return hammie.Hammie.filter(self, msg, **kwargs) + + class HammieHandler(SimpleXMLRPCServer.SimpleXMLRPCRequestHandler): def do_POST(self): *************** *** 114,118 **** bayes = hammie.createbayes(pck, usedb) ! h = hammie.Hammie(bayes) server = SimpleXMLRPCServer.SimpleXMLRPCServer((ip, port), HammieHandler) --- 130,134 ---- bayes = hammie.createbayes(pck, usedb) ! 
h = XMLHammie(bayes) server = SimpleXMLRPCServer.SimpleXMLRPCServer((ip, port), HammieHandler) From npickett@users.sourceforge.net Tue Oct 8 19:13:51 2002 From: npickett@users.sourceforge.net (Neale Pickett) Date: Tue, 08 Oct 2002 11:13:51 -0700 Subject: [Spambayes-checkins] spambayes Histogram.py,1.4,1.5 Message-ID: Update of /cvsroot/spambayes/spambayes In directory usw-pr-cvs1:/tmp/cvs-serv9158 Modified Files: Histogram.py Log Message: * Make hist.display work with default args. Index: Histogram.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Histogram.py,v retrieving revision 1.4 retrieving revision 1.5 diff -C2 -d -r1.4 -r1.5 *** Histogram.py 5 Oct 2002 05:48:40 -0000 1.4 --- Histogram.py 8 Oct 2002 18:13:49 -0000 1.5 *************** *** 1,2 **** --- 1,3 ---- + #! /usr/bin/env python import math *************** *** 112,115 **** --- 113,118 ---- # data point is in the collection. def display(self, nbuckets=None, WIDTH=61): + if nbuckets is None: + nbuckets = self.nbuckets if nbuckets <= 0: raise ValueError("nbuckets %g > 0 required" % nbuckets) From tim_one@users.sourceforge.net Wed Oct 9 09:35:26 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Wed, 09 Oct 2002 01:35:26 -0700 Subject: [Spambayes-checkins] spambayes README.txt,1.34,1.35 Message-ID: Update of /cvsroot/spambayes/spambayes In directory usw-pr-cvs1:/tmp/cvs-serv14846 Modified Files: README.txt Log Message: Added clues about the test drivers, snipped from email I sent to the list. Index: README.txt =================================================================== RCS file: /cvsroot/spambayes/spambayes/README.txt,v retrieving revision 1.34 retrieving revision 1.35 diff -C2 -d -r1.34 -r1.35 *** README.txt 6 Oct 2002 05:11:41 -0000 1.34 --- README.txt 9 Oct 2002 08:35:24 -0000 1.35 *************** *** 48,56 **** classifier.py ! An implementation of a Graham-like classifier. 
tokenizer.py An implementation of tokenize() that Tim can't seem to help but keep ! working on <wink>. Tester.py --- 48,57 ---- classifier.py ! The classifier, which is the soul of the method. tokenizer.py An implementation of tokenize() that Tim can't seem to help but keep ! working on <wink>. Generates a token stream from a message, which ! the classifier trains on or predicts against. Tester.py *************** *** 105,117 **** mailbox files rather than the specialized timtest setup. - timtest.py - A concrete test driver like mboxtest.py, but working with "a - standard" test data setup (see below) rather than the specialized - mboxtest setup. This runs an NxN test grid, skipping the diagonal. - timcv.py ! A first stab at an N-fold cross-validating test driver. Assumes ! "a standard" data directory setup (see below). ! Subject to arbitrary change. --- 106,141 ---- mailbox files rather than the specialized timtest setup. timcv.py ! An N-fold cross-validating test driver. Assumes "a standard" data ! directory setup (see below)) rather than the specialized mboxtest ! setup. ! N classifiers are built ! 1 run is done with each classifier. ! Each classifier is trained on N-1 sets, and predicts against the sole ! remaining set (the set not used to train the classifier). ! mboxtest does the same. ! timcv should not be used for central limit tests (timcv does ! incremental learning and unlearning, for efficiency; the central ! limit schemes can't unlearn incrementally, and their incremental ! learning ability is a cheat whose badness isn't yet known). ! This (or mboxtest) is the preferred way to test when possible: it ! makes best use of limited data, and interpreting results is ! straightforward. ! ! timtest.py ! A concrete test driver like mboxtest.py, but working with "a standard" ! test data setup (see below). This runs an NxN test grid, skipping ! the diagonal. ! N classifiers are built. ! N-1 runs are done with each classifier. !
Each classifier is trained on 1 set, and predicts against each of ! the N-1 remaining sets (those not used to train the classifier). ! This is a much harder test than timcv, because it trains on N-1 times ! less data, and makes each classifier predict against N-1 times ! more data than it's been taught about. ! It's harder to interpret the results of timtest (than timcv) correctly, ! because each msg is predicted against N-1 times overall. So, e.g., ! one terribly difficult spam or ham can count against you N-1 times. ! Central limit tests are fine with timtest. From tim_one@users.sourceforge.net Thu Oct 10 01:23:53 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Wed, 09 Oct 2002 17:23:53 -0700 Subject: [Spambayes-checkins] spambayes Options.py,1.43,1.44 classifier.py,1.33,1.34 Message-ID: Update of /cvsroot/spambayes/spambayes In directory usw-pr-cvs1:/tmp/cvs-serv9993 Modified Files: Options.py classifier.py Log Message: New option use_tim_combining. See the mailing list. Index: Options.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Options.py,v retrieving revision 1.43 retrieving revision 1.44 diff -C2 -d -r1.43 -r1.44 *** Options.py 5 Oct 2002 07:18:04 -0000 1.43 --- Options.py 10 Oct 2002 00:23:51 -0000 1.44 *************** *** 207,210 **** --- 207,220 ---- # away, or a bunch of incompatible stuff above may go away. + # For the default scheme, use "tim-combining" of probabilities. This has + # no effect under the central-limit schemes. Tim-combining is a kind of + # cross between Paul Graham's and Gary Robinson's combining schemes. Unlike + # Paul's, it's never crazy-certain, and compared to Gary's, in Tim's tests it + # greatly increased the spread between mean ham-scores and spam-scores, while + # simultaneously decreasing the variance of both. Tim needed a higher + # spam_cutoff value for best results, but spam_cutoff is less touchy + # than under Gary-combining. 
+ use_tim_combining: False + # Use a central-limit approach for scoring. # The number of extremes to use is given by max_discriminators (above). *************** *** 279,282 **** --- 289,294 ---- 'use_central_limit3': boolean_cracker, 'zscore_ratio_cutoff': float_cracker, + + 'use_tim_combining': boolean_cracker, }, } Index: classifier.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/classifier.py,v retrieving revision 1.33 retrieving revision 1.34 diff -C2 -d -r1.33 -r1.34 *** classifier.py 7 Oct 2002 04:36:54 -0000 1.33 --- classifier.py 10 Oct 2002 00:23:51 -0000 1.34 *************** *** 385,388 **** --- 385,452 ---- # to only one of the alternatives surviving. + def tim_spamprob(self, wordstream, evidence=False): + """Return best-guess probability that wordstream is spam. + + wordstream is an iterable object producing words. + The return value is a float in [0.0, 1.0]. + + If optional arg evidence is True, the return value is a pair + probability, evidence + where evidence is a list of (word, probability) pairs. + """ + + from math import frexp + + # The real H = this H times 2**Hexp. Likewise for S. We're + # simulating unbounded dynamic float range by hand. If this pans + # out, *maybe* we should store logarithms in the database instead + # and just add them here. But I like keeping raw counts in the + # database (they're easy to understand, manipulate and combine), + # and there's no evidence that this simulation is a significant + # expense. + # S is a spamminess measure, and is the geometric mean of the + # extreme-word spamprobs. + # H is a hamminess measure, and is the geometric mean of 1 - the + # extreme-word spamprobs. 
+ H = S = 1.0 + Hexp = Sexp = 0 + clues = self._getclues(wordstream) + for prob, word, record in clues: + if record is not None: # else wordinfo doesn't know about it + record.killcount += 1 + S *= prob + H *= 1.0 - prob + if S < 1e-200: # move back into range + S, e = frexp(S) + Sexp += e + if H < 1e-200: # move back into range + H, e = frexp(H) + Hexp += e + + S, e = frexp(S) + Sexp += e + H, e = frexp(H) + Hexp += e + + num_clues = len(clues) + if num_clues: + # (x*2**e)**n = x**n * 2**(e*n). + n = 1.0 / num_clues + S = S**n * 2.0**(Sexp * n) + H = H**n * 2.0**(Hexp * n) + prob = S/(S+H) + else: + prob = 0.5 + + if evidence: + clues = [(w, p) for p, w, r in clues] + clues.sort(lambda a, b: cmp(a[1], b[1])) + return prob, clues + else: + return prob + + if options.use_tim_combining: + spamprob = tim_spamprob + def _add_popstats(self, sum, sumsq, n, is_spam): from math import ldexp From tim_one@users.sourceforge.net Thu Oct 10 05:55:17 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Wed, 09 Oct 2002 21:55:17 -0700 Subject: [Spambayes-checkins] spambayes Options.py,1.44,1.45 timcv.py,1.9,1.10 Message-ID: Update of /cvsroot/spambayes/spambayes In directory usw-pr-cvs1:/tmp/cvs-serv26404 Modified Files: Options.py timcv.py Log Message: Adapted from a patch by T. Alexander Popiel, this adds new option (and in a new section) [CV Driver] build_each_classifier_from_scratch: False When True, a cross-validation driver can be used safely-- but more slowly --with a central-limit test. timcv.py pays attention to this. 
Index: Options.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Options.py,v retrieving revision 1.44 retrieving revision 1.45 diff -C2 -d -r1.44 -r1.45 *** Options.py 10 Oct 2002 00:23:51 -0000 1.44 --- Options.py 10 Oct 2002 04:55:15 -0000 1.45 *************** *** 173,176 **** --- 173,190 ---- ham_directories: Data/Ham/Set%d + [CV Driver] + # A cross-validation driver takes N ham+spam sets, and builds N classifiers, + # training each on N-1 sets, and the predicting against the set not trained + # on. By default, it does this in a clever way, learning *and* unlearning + # sets as it goes along, so that it never needs to train on N-1 sets in one + # gulp after the first time. However, that can't always be done: in + # particular, the central-limit schemes can't unlearn incrementally, and can + # learn incrementally only via a form of cheating whose bad effects overall + # aren't yet known. + # So when desiring to run a central-limit test, set + # build_each_classifier_from_scratch to true. This gives correct results, + # but runs much slower than a CV driver usually runs. + build_each_classifier_from_scratch: False + [Classifier] # The maximum number of extreme words to look at in a msg, where "extreme" *************** *** 280,283 **** --- 294,299 ---- 'best_cutoff_fp_weight': float_cracker, }, + 'CV Driver': {'build_each_classifier_from_scratch': boolean_cracker, + }, 'Classifier': {'max_discriminators': int_cracker, 'robinson_probability_x': float_cracker, Index: timcv.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/timcv.py,v retrieving revision 1.9 retrieving revision 1.10 diff -C2 -d -r1.9 -r1.10 *** timcv.py 24 Sep 2002 05:37:11 -0000 1.9 --- timcv.py 10 Oct 2002 04:55:15 -0000 1.10 *************** *** 68,73 **** if i > 0: ! # Forget this set. ! d.untrain(hamstream, spamstream) # Predict this set. --- 68,88 ---- if i > 0: ! 
if options.build_each_classifier_from_scratch: ! # Build a new classifier from the other sets. ! d.new_classifier() ! ! hname = "%s-%d, except %d" % (hamdirs[0], nsets, i+1) ! h2 = hamdirs[:] ! del h2[i] ! ! sname = "%s-%d, except %d" % (spamdirs[0], nsets, i+1) ! s2 = spamdirs[:] ! del s2[i] ! ! d.train(msgs.HamStream(hname, h2), msgs.SpamStream(sname, s2)) ! ! else: ! # Forget this set. ! d.untrain(hamstream, spamstream) # Predict this set. *************** *** 75,79 **** d.finishtest() ! if i < nsets - 1: # Add this set back in. d.train(hamstream, spamstream) --- 90,94 ---- d.finishtest() ! if i < nsets - 1 and not options.build_each_classifier_from_scratch: # Add this set back in. d.train(hamstream, spamstream) From sjoerd@users.sourceforge.net Thu Oct 10 10:21:57 2002 From: sjoerd@users.sourceforge.net (Sjoerd Mullender) Date: Thu, 10 Oct 2002 02:21:57 -0700 Subject: [Spambayes-checkins] spambayes runtest.sh,1.8,1.9 Message-ID: Update of /cvsroot/spambayes/spambayes In directory usw-pr-cvs1:/tmp/cvs-serv4827 Modified Files: runtest.sh Log Message: Count the number of available sets if no REBAL_SETS in the environment. Also fix the command to dump the available targets. Index: runtest.sh =================================================================== RCS file: /cvsroot/spambayes/spambayes/runtest.sh,v retrieving revision 1.8 retrieving revision 1.9 diff -C2 -d -r1.8 -r1.9 *** runtest.sh 6 Oct 2002 06:50:55 -0000 1.8 --- runtest.sh 10 Oct 2002 09:21:55 -0000 1.9 *************** *** 27,31 **** # Number of sets ! SETS=${REBAL_SETS:-5} if [ -n "$REBAL" ]; then --- 27,44 ---- # Number of sets ! case ${REBAL_SETS:-undefined} in ! undefined) ! # count the number of sets ! i=1 ! while [ -d Data/Ham/Set$i -a -d Data/Spam/Set$i ]; do ! i=`expr $i + 1` ! done ! SETS=`expr $i - 1` ! ;; ! *) ! # use the provided value ! SETS=${REBAL_SETS} ! ;; ! esac if [ -n "$REBAL" ]; then *************** *** 57,61 **** *) echo "Available targets:" ! 
sed -n 's/^\( [a-z0-9|]*\))$/\1/p' $0 ;; esac --- 70,74 ---- *) echo "Available targets:" ! sed -n 's/^\( *[a-z0-9|]*\))$/\1/p' $0 ;; esac From tim_one@users.sourceforge.net Sat Oct 12 01:31:01 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Fri, 11 Oct 2002 17:31:01 -0700 Subject: [Spambayes-checkins] spambayes chi2.py,NONE,1.1 Options.py,1.45,1.46 classifier.py,1.34,1.35 Message-ID: Update of /cvsroot/spambayes/spambayes In directory usw-pr-cvs1:/tmp/cvs-serv10089 Modified Files: Options.py classifier.py Added Files: chi2.py Log Message: New option use_chi_squared_combining. This is so speculative I'm not even going to say what it is <wink>. --- NEW FILE: chi2.py --- import math as _math def chi2Q(x2, v, exp=_math.exp): """Return prob(chisq >= x2, with v degrees of freedom). v must be even. """ assert v & 1 == 0 m = x2 / 2.0 sum = term = exp(-m) for i in range(1, v//2): term *= m / i sum += term return sum def main(): import random from Histogram import Hist import sys class WrappedRandom: # There's no way W-H is equidistributed in 50 dimensions, so use # Marsaglia-wrapping to shuffle it more.
def __init__(self, baserandom=random.random, tabsize=513): self.baserandom = baserandom self.n = tabsize self.tab = [baserandom() for i in range(tabsize)] self.next = baserandom() def random(self): result = self.next i = int(result * self.n) self.next = self.tab[i] self.tab[i] = self.baserandom() return result random = WrappedRandom().random def judge(ps, ln=_math.log): H = S = 0.0 for p in ps: S += ln(1.0 - p) H += ln(p) n = len(ps) S = 1.0 - chi2Q(-2.0 * S, 2*n) H = 1.0 - chi2Q(-2.0 * H, 2*n) return S/(S+H) warp = 0 bias = 0.99 if len(sys.argv) > 1: warp = int(sys.argv[1]) if len(sys.argv) > 2: bias = float(sys.argv[2]) h = Hist(20, lo=0.0, hi=1.0) for i in range(5000): ps = [random() for j in range(50)] p = judge(ps + [bias] * warp) h.add(p) print "Result for random vectors of 50 probs, +", warp, "forced to", bias print h.display() if __name__ == '__main__': main() Index: Options.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Options.py,v retrieving revision 1.45 retrieving revision 1.46 diff -C2 -d -r1.45 -r1.46 *** Options.py 10 Oct 2002 04:55:15 -0000 1.45 --- Options.py 12 Oct 2002 00:30:58 -0000 1.46 *************** *** 231,234 **** --- 231,236 ---- use_tim_combining: False + use_chi_squared_combining: False + # Use a central-limit approach for scoring. # The number of extremes to use is given by max_discriminators (above). 
*************** *** 307,310 **** --- 309,313 ---- 'use_tim_combining': boolean_cracker, + 'use_chi_squared_combining': boolean_cracker, }, } Index: classifier.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/classifier.py,v retrieving revision 1.34 retrieving revision 1.35 diff -C2 -d -r1.34 -r1.35 *** classifier.py 10 Oct 2002 00:23:51 -0000 1.34 --- classifier.py 12 Oct 2002 00:30:58 -0000 1.35 *************** *** 28,31 **** --- 28,33 ---- from Options import options + if options.use_chi_squared_combining: + from chi2 import chi2Q # The maximum number of extreme words to look at in a msg, where "extreme" *************** *** 448,451 **** --- 450,492 ---- if options.use_tim_combining: spamprob = tim_spamprob + + def chi2_spamprob(self, wordstream, evidence=False): + """Return best-guess probability that wordstream is spam. + + wordstream is an iterable object producing words. + The return value is a float in [0.0, 1.0]. + + If optional arg evidence is True, the return value is a pair + probability, evidence + where evidence is a list of (word, probability) pairs. + """ + + from math import log as ln + + H = S = 0.0 + clues = self._getclues(wordstream) + for prob, word, record in clues: + if record is not None: # else wordinfo doesn't know about it + record.killcount += 1 + S += ln(1.0 - prob) + H += ln(prob) + + n = len(clues) + if n: + S = 1.0 - chi2Q(-2.0 * S, 2*n) + H = 1.0 - chi2Q(-2.0 * H, 2*n) + prob = S/(S+H) + else: + prob = 0.5 + + if evidence: + clues = [(w, p) for p, w, r in clues] + clues.sort(lambda a, b: cmp(a[1], b[1])) + return prob, clues + else: + return prob + + if options.use_chi_squared_combining: + spamprob = chi2_spamprob def _add_popstats(self, sum, sumsq, n, is_spam): From hooft@users.sourceforge.net Sun Oct 13 19:01:54 2002 From: hooft@users.sourceforge.net (Rob W.W. 
Hooft) Date: Sun, 13 Oct 2002 11:01:54 -0700 Subject: [Spambayes-checkins] spambayes cvcost.py,NONE,1.1 Message-ID: Update of /cvsroot/spambayes/spambayes In directory usw-pr-cvs1:/tmp/cvs-serv23711 Added Files: cvcost.py Log Message: total cost calculation based on timcv output --- NEW FILE: cvcost.py --- #! /usr/bin/env python """Determine best hamcutoff and spamcutoff values from timcv output. Usage: %(program)s [options] [input filenames] Where options are: -h display this message and exit -u unknown-cost The cost to you of an uncertain message (Default $0.20) -p fp-cost The cost to you of a false positive (Default $10) -n fn-cost The cost to you of a false negative (Default $1) """ import sys # Defaults unknowncost=0.2 fpcost=10 fncost=1 program = sys.argv[0] def usage(code, msg=''): """Print usage message and sys.exit(code).""" if msg: print >> sys.stderr, msg print >> sys.stderr print >> sys.stderr, __doc__ % globals() sys.exit(code) def cost(spamhist,hamhist,hamcut,spamcut): spamok=0 spamunknown=0 spamnok=0 hamok=0 hamunknown=0 hamnok=0 for v,cnt in spamhist: if v Ham scores for all runs'): state=1 elif state==1: if line.startswith('*'): state=2 elif state==2: word=line.split() try: v=float(word[0]) cnt=int(word[1]) hamhist.append((v,cnt)) except IndexError: state=3 elif state==3: if line.startswith('*'): state=4 elif state==4: word=line.split() try: v=float(word[0]) cnt=int(word[1]) spamhist.append((v,cnt)) except ValueError: state=5 besthamcut=50 bestspamcut=80 bestcost=cost(spamhist,hamhist,besthamcut,bestspamcut) for hamcut in range(1,90): sys.stdout.write(".") sys.stdout.flush() for spamcut in range(max(51,hamcut),100): trial=cost(spamhist,hamhist,hamcut,spamcut) if trial<=bestcost: besthamcut=hamcut bestspamcut=spamcut bestcost=trial sys.stdout.write("\n") print "Optimal cost is $%.1f with grey zone between %.1f and %.1f"%(bestcost,besthamcut,bestspamcut) if __name__=="__main__": import getopt try: opts, args = getopt.getopt(sys.argv[1:], 'p:n:u:', 
[]) except getopt.error, msg: usage(1, msg) for opt, arg in opts: if opt == '-h': usage(0) elif opt == '-p': fpcost = float(arg) elif opt == '-n': fncost = float(arg) elif opt == '-u': unknowncost = float(arg) if unknowncost>=fncost or unknowncost>=fpcost: raise ValueError("This program requires that unknowns are cheaper than fp or fn") for fn in args: main(fn) From tim_one@users.sourceforge.net Sun Oct 13 20:00:00 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Sun, 13 Oct 2002 12:00:00 -0700 Subject: [Spambayes-checkins] spambayes Options.py,1.46,1.47 chi2.py,1.2,1.3 classifier.py,1.35,1.36 Message-ID: Update of /cvsroot/spambayes/spambayes In directory usw-pr-cvs1:/tmp/cvs-serv6950 Modified Files: Options.py chi2.py classifier.py Log Message: Documented use_chi_squared_combining in Options.py. Added new option use_z_combining, and supporting code in chi2.py for computing zscore<->prob for the unit normal distribution. Index: Options.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Options.py,v retrieving revision 1.46 retrieving revision 1.47 diff -C2 -d -r1.46 -r1.47 *** Options.py 12 Oct 2002 00:30:58 -0000 1.46 --- Options.py 13 Oct 2002 18:59:56 -0000 1.47 *************** *** 231,236 **** --- 231,255 ---- use_tim_combining: False + # For vectors of random, uniformly distributed probabilities, -2*sum(ln(p_i)) + # follows the chi-squared distribution with 2*n degrees of freedom. That's + # the "provably most-sensitive" test Gary's original scheme was monotonic + # with. Getting closer to the theoretical basis appears to give an excellent + # combining method, usually very extreme in its judgment, yet finding a tiny + # (in # of msgs, spread across a huge range of scores) middle ground where + # lots of the mistakes live. This is the best method so far on Tim's data. + # One systematic benefit is that it's immune to "cancellation disease". 
One + # systematic drawback is that it's sensitive to *any* deviation from a + # uniform distribution, regardless of whether that's actually evidence of + # ham or spam. Rob Hooft may have a pragmatic cure for that (combine the + # final S and H measures via (S-H+1)/2 instead of via S/(S+H)). use_chi_squared_combining: False + # z_combining is a scheme Gary has discussed with me offline. I'll say more + # if it proves promising. In initial tests it was even more extreme than + # chi combining, but not always in a good way -- in particular, it appears + # as vulnerable to "cancellation disease" as Graham-combining, giving one + # spam in my corpus a score of 4.1e-14 (chi combining scored it 0.5). + use_z_combining: False + # Use a central-limit approach for scoring. # The number of extremes to use is given by max_discriminators (above). *************** *** 310,313 **** --- 329,333 ---- 'use_tim_combining': boolean_cracker, 'use_chi_squared_combining': boolean_cracker, + 'use_z_combining': boolean_cracker, }, } Index: chi2.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/chi2.py,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** chi2.py 12 Oct 2002 22:56:17 -0000 1.2 --- chi2.py 13 Oct 2002 18:59:57 -0000 1.3 *************** *** 14,17 **** --- 14,85 ---- return sum + def normZ(z, sqrt2pi=_math.sqrt(2.0*_math.pi), exp=_math.exp): + "Return value of the unit Gaussian at z." + return exp(-z*z/2.0) / sqrt2pi + + def normP(z): + """Return area under the unit Gaussian from -inf to z. + + This is the probability that a zscore is <= z. + """ + + # This is very accurate in a fixed-point sense. For negative z of + # large magnitude (<= -8.3), it returns 0.0, essentially because + # P(-z) is, to machine precision, indistiguishable from 1.0 then. + + # sum <- area from 0 to abs(z). 
+ a = abs(float(z)) + if a >= 8.3: + sum = 0.5 + else: + sum2 = term = a * normZ(a) + z2 = a*a + sum = 0.0 + i = 1.0 + while sum != sum2: + sum = sum2 + i += 2.0 + term *= z2 / i + sum2 += term + + if z >= 0: + result = 0.5 + sum + else: + result = 0.5 - sum + + return result + + def normIQ(p, sqrt=_math.sqrt, ln=_math.log): + """Return z such that the area under the unit Gaussian from z to +inf is p. + + Must have 0.0 <= p <= 1.0. + """ + + assert 0.0 <= p <= 1.0 + # This is a low-accuracy rational approximation from Abramowitz & Stegun. + # The absolute error is bounded by 3e-3. + + flipped = False + if p > 0.5: + flipped = True + p = 1.0 - p + + if p == 0.0: + z = 8.3 + else: + t = sqrt(-2.0 * ln(p)) + z = t - (2.30753 + .27061*t) / (1. + .99229*t + .04481*t**2) + + if flipped: + z = -z + return z + + def normIP(p): + """Return z such that the area under the unit Gaussian from -inf to z is p. + + Must have 0.0 <= p <= 1.0. + """ + return normIQ(1.0 - p) + def main(): from Histogram import Hist *************** *** 58,62 **** h = Hist(20, lo=0.0, hi=1.0) ! for i in range(50000): ps = [random() for j in range(50)] p = judge(ps + [bias] * warp) --- 126,130 ---- h = Hist(20, lo=0.0, hi=1.0) ! 
for i in range(5000): ps = [random() for j in range(50)] p = judge(ps + [bias] * warp) Index: classifier.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/classifier.py,v retrieving revision 1.35 retrieving revision 1.36 diff -C2 -d -r1.35 -r1.36 *** classifier.py 12 Oct 2002 00:30:58 -0000 1.35 --- classifier.py 13 Oct 2002 18:59:57 -0000 1.36 *************** *** 30,33 **** --- 30,35 ---- if options.use_chi_squared_combining: from chi2 import chi2Q + if options.use_z_combining: + from chi2 import normP, normIP # The maximum number of extreme words to look at in a msg, where "extreme" *************** *** 489,492 **** --- 491,534 ---- if options.use_chi_squared_combining: spamprob = chi2_spamprob + + def z_spamprob(self, wordstream, evidence=False): + """Return best-guess probability that wordstream is spam. + + wordstream is an iterable object producing words. + The return value is a float in [0.0, 1.0]. + + If optional arg evidence is True, the return value is a pair + probability, evidence + where evidence is a list of (word, probability) pairs. + """ + + from math import sqrt + + clues = self._getclues(wordstream) + zsum = 0.0 + for prob, word, record in clues: + if record is not None: # else wordinfo doesn't know about it + record.killcount += 1 + zsum += normIP(prob) + + n = len(clues) + if n: + # We've added n zscores from a unit normal distribution. By the + # central limit theorem, their mean is normally distributed with + # mean 0 and sdev 1/sqrt(n). So the zscore of zsum/n is + # (zsum/n - 0)/(1/sqrt(n)) = zsum/n/(1/sqrt(n)) = zsum/sqrt(n). 
+ prob = normP(zsum / sqrt(n)) + else: + prob = 0.5 + + if evidence: + clues = [(w, p) for p, w, r in clues] + clues.sort(lambda a, b: cmp(a[1], b[1])) + return prob, clues + else: + return prob + + if options.use_z_combining: + spamprob = z_spamprob def _add_popstats(self, sum, sumsq, n, is_spam): From tim_one@users.sourceforge.net Sun Oct 13 20:06:10 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Sun, 13 Oct 2002 12:06:10 -0700 Subject: [Spambayes-checkins] spambayes classifier.py,1.36,1.37 Message-ID: Update of /cvsroot/spambayes/spambayes In directory usw-pr-cvs1:/tmp/cvs-serv8962 Modified Files: classifier.py Log Message: chi2_spamprob(): Added '*S*' and '*H*' pseudo-clues, as suggested by Rob Hooft. z_spamprob(): Added '*n*', '*zsum*' and '*zscore*' pseudo-clues. Index: classifier.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/classifier.py,v retrieving revision 1.36 retrieving revision 1.37 diff -C2 -d -r1.36 -r1.37 *** classifier.py 13 Oct 2002 18:59:57 -0000 1.36 --- classifier.py 13 Oct 2002 19:06:08 -0000 1.37 *************** *** 485,488 **** --- 485,490 ---- clues = [(w, p) for p, w, r in clues] clues.sort(lambda a, b: cmp(a[1], b[1])) + clues.insert(0, ('*S*', S)) + clues.insert(0, ('*H*', H)) return prob, clues else: *************** *** 525,528 **** --- 527,533 ---- clues = [(w, p) for p, w, r in clues] clues.sort(lambda a, b: cmp(a[1], b[1])) + clues.insert(0, ('*zsum*', zsum)) + clues.insert(0, ('*n*', n)) + clues.insert(0, ('*zscore*', zsum / sqrt(n or 1))) return prob, clues else: From hooft@users.sourceforge.net Sun Oct 13 20:13:49 2002 From: hooft@users.sourceforge.net (Rob W.W. 
Hooft) Date: Sun, 13 Oct 2002 12:13:49 -0700 Subject: [Spambayes-checkins] spambayes cvcost.py,1.1,1.2 Message-ID: Update of /cvsroot/spambayes/spambayes In directory usw-pr-cvs1:/tmp/cvs-serv11826 Modified Files: cvcost.py Log Message: added filename to output Index: cvcost.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/cvcost.py,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** cvcost.py 13 Oct 2002 18:01:48 -0000 1.1 --- cvcost.py 13 Oct 2002 19:13:47 -0000 1.2 *************** *** 108,112 **** bestcost=trial sys.stdout.write("\n") ! print "Optimal cost is $%.1f with grey zone between %.1f and %.1f"%(bestcost,besthamcut,bestspamcut) if __name__=="__main__": --- 108,112 ---- bestcost=trial sys.stdout.write("\n") ! print "%s: Optimal cost is $%.1f with grey zone between %.1f and %.1f"%(fn,bestcost,besthamcut,bestspamcut) if __name__=="__main__": From hooft@users.sourceforge.net Sun Oct 13 20:17:49 2002 From: hooft@users.sourceforge.net (Rob W.W. Hooft) Date: Sun, 13 Oct 2002 12:17:49 -0700 Subject: [Spambayes-checkins] spambayes README.txt,1.35,1.36 Message-ID: Update of /cvsroot/spambayes/spambayes In directory usw-pr-cvs1:/tmp/cvs-serv12978 Modified Files: README.txt Log Message: added cvcost description Index: README.txt =================================================================== RCS file: /cvsroot/spambayes/spambayes/README.txt,v retrieving revision 1.35 retrieving revision 1.36 diff -C2 -d -r1.35 -r1.36 *** README.txt 9 Oct 2002 08:35:24 -0000 1.35 --- README.txt 13 Oct 2002 19:17:45 -0000 1.36 *************** *** 221,224 **** --- 221,230 ---- only under use_central_limit2 and use_central_limit3. + cvcost.py + A program that analyzes the output of timcv.py (the final histograms) + and optimizes the cost of handling the mail body by defining a "ham" + zone, a "spam" zone and a "grey" zone. 
It can be tuned by choosing + pseudo-realistic costs to handle a fp, a fn and to handle a message + in the grey zone. Standard Test Data Setup From tim_one@users.sourceforge.net Sun Oct 13 20:24:24 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Sun, 13 Oct 2002 12:24:24 -0700 Subject: [Spambayes-checkins] spambayes cvcost.py,1.2,1.3 Message-ID: Update of /cvsroot/spambayes/spambayes In directory usw-pr-cvs1:/tmp/cvs-serv14813 Modified Files: cvcost.py Log Message: Added horizontal whitespace, and split long lines, in accord with the Python style guide. Index: cvcost.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/cvcost.py,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** cvcost.py 13 Oct 2002 19:13:47 -0000 1.2 --- cvcost.py 13 Oct 2002 19:24:22 -0000 1.3 *************** *** 8,12 **** -h display this message and exit ! -u unknown-cost The cost to you of an uncertain message (Default $0.20) --- 8,12 ---- -h display this message and exit ! -u unknown-cost The cost to you of an uncertain message (Default $0.20) *************** *** 24,30 **** # Defaults ! unknowncost=0.2 ! fpcost=10 ! fncost=1 program = sys.argv[0] --- 24,30 ---- # Defaults ! unknowncost = 0.2 ! fpcost = 10 ! fncost = 1 program = sys.argv[0] *************** *** 39,112 **** def cost(spamhist,hamhist,hamcut,spamcut): ! spamok=0 ! spamunknown=0 ! spamnok=0 ! hamok=0 ! hamunknown=0 ! hamnok=0 ! for v,cnt in spamhist: ! if v Ham scores for all runs'): ! state=1 ! elif state==1: if line.startswith('*'): ! state=2 ! elif state==2: ! word=line.split() try: ! v=float(word[0]) ! cnt=int(word[1]) ! hamhist.append((v,cnt)) except IndexError: ! state=3 ! elif state==3: if line.startswith('*'): ! state=4 ! elif state==4: ! word=line.split() try: ! v=float(word[0]) ! cnt=int(word[1]) ! spamhist.append((v,cnt)) except ValueError: ! state=5 ! besthamcut=50 ! bestspamcut=80 ! 
bestcost=cost(spamhist,hamhist,besthamcut,bestspamcut) ! for hamcut in range(1,90): sys.stdout.write(".") sys.stdout.flush() ! for spamcut in range(max(51,hamcut),100): ! trial=cost(spamhist,hamhist,hamcut,spamcut) ! if trial<=bestcost: ! besthamcut=hamcut ! bestspamcut=spamcut ! bestcost=trial sys.stdout.write("\n") ! print "%s: Optimal cost is $%.1f with grey zone between %.1f and %.1f"%(fn,bestcost,besthamcut,bestspamcut) if __name__=="__main__": --- 39,114 ---- def cost(spamhist,hamhist,hamcut,spamcut): ! spamok = 0 ! spamunknown = 0 ! spamnok = 0 ! hamok = 0 ! hamunknown = 0 ! hamnok = 0 ! for v, cnt in spamhist: ! if v < hamcut: ! spamnok += cnt ! elif v < spamcut: ! spamunknown += cnt else: ! spamok += cnt ! for v, cnt in hamhist: ! if v < hamcut: ! hamok += cnt ! elif v < spamcut: ! hamunknown += cnt else: ! hamnok += cnt #print hamok,hamunknown,hamnok #print spamok,spamunknown,spamnok ! _cost = ((spamunknown + hamunknown) * unknowncost + ! fpcost*hamnok + fncost*spamnok) #print "At %.1f, %.1f, cost=%.1f"%(hamcut,spamcut,_cost) return _cost def main(fn): ! state = 0 ! hamhist = [] ! spamhist = [] for line in open(fn): ! if state == 0: if line.startswith('-> Ham scores for all runs'): ! state = 1 ! elif state == 1: if line.startswith('*'): ! state = 2 ! elif state == 2: ! word = line.split() try: ! v = float(word[0]) ! cnt = int(word[1]) ! hamhist.append((v, cnt)) except IndexError: ! state = 3 ! elif state == 3: if line.startswith('*'): ! state = 4 ! elif state == 4: ! word = line.split() try: ! v = float(word[0]) ! cnt = int(word[1]) ! spamhist.append((v, cnt)) except ValueError: ! state = 5 ! besthamcut = 50 ! bestspamcut = 80 ! bestcost = cost(spamhist, hamhist, besthamcut, bestspamcut) ! for hamcut in range(1, 90): sys.stdout.write(".") sys.stdout.flush() ! for spamcut in range(max(51, hamcut), 100): ! trial = cost(spamhist, hamhist, hamcut, spamcut) ! if trial <= bestcost: ! besthamcut = hamcut ! bestspamcut = spamcut ! 
bestcost = trial sys.stdout.write("\n") ! print "%s: Optimal cost is $%.1f with grey zone between %.1f and %.1f" % ( ! fn, bestcost, besthamcut, bestspamcut) if __name__=="__main__": *************** *** 114,119 **** try: ! opts, args = getopt.getopt(sys.argv[1:], 'p:n:u:', ! []) except getopt.error, msg: usage(1, msg) --- 116,120 ---- try: ! opts, args = getopt.getopt(sys.argv[1:], 'p:n:u:', []) except getopt.error, msg: usage(1, msg) *************** *** 129,135 **** unknowncost = float(arg) ! if unknowncost>=fncost or unknowncost>=fpcost: ! raise ValueError("This program requires that unknowns are cheaper than fp or fn") ! for fn in args: main(fn) --- 130,137 ---- unknowncost = float(arg) ! if unknowncost >= fncost or unknowncost >= fpcost: ! raise ValueError("This program requires that unknowns are cheaper " ! "than fp or fn") ! for fn in args: main(fn) From tim_one@users.sourceforge.net Sun Oct 13 20:25:44 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Sun, 13 Oct 2002 12:25:44 -0700 Subject: [Spambayes-checkins] spambayes README.txt,1.36,1.37 Message-ID: Update of /cvsroot/spambayes/spambayes In directory usw-pr-cvs1:/tmp/cvs-serv15409 Modified Files: README.txt Log Message: Mentioned chi2.py. Index: README.txt =================================================================== RCS file: /cvsroot/spambayes/spambayes/README.txt,v retrieving revision 1.36 retrieving revision 1.37 diff -C2 -d -r1.36 -r1.37 *** README.txt 13 Oct 2002 19:17:45 -0000 1.36 --- README.txt 13 Oct 2002 19:25:42 -0000 1.37 *************** *** 70,73 **** --- 70,76 ---- msgs. The test drivers use these. + chi2.py + A collection of statistics functions. + Apps *************** *** 224,228 **** A program that analyzes the output of timcv.py (the final histograms) and optimizes the cost of handling the mail body by defining a "ham" ! zone, a "spam" zone and a "grey" zone. 
It can be tuned by choosing pseudo-realistic costs to handle a fp, a fn and to handle a message in the grey zone. --- 227,231 ---- A program that analyzes the output of timcv.py (the final histograms) and optimizes the cost of handling the mail body by defining a "ham" ! zone, a "spam" zone and a "grey" zone. It can be tuned by choosing pseudo-realistic costs to handle a fp, a fn and to handle a message in the grey zone. From tim_one@users.sourceforge.net Sun Oct 13 20:26:39 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Sun, 13 Oct 2002 12:26:39 -0700 Subject: [Spambayes-checkins] spambayes cvcost.py,1.3,1.4 Message-ID: Update of /cvsroot/spambayes/spambayes In directory usw-pr-cvs1:/tmp/cvs-serv15589 Modified Files: cvcost.py Log Message: More whitespace. Index: cvcost.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/cvcost.py,v retrieving revision 1.3 retrieving revision 1.4 diff -C2 -d -r1.3 -r1.4 *** cvcost.py 13 Oct 2002 19:24:22 -0000 1.3 --- cvcost.py 13 Oct 2002 19:26:31 -0000 1.4 *************** *** 38,42 **** sys.exit(code) ! def cost(spamhist,hamhist,hamcut,spamcut): spamok = 0 spamunknown = 0 --- 38,42 ---- sys.exit(code) ! def cost(spamhist, hamhist, hamcut, spamcut): spamok = 0 spamunknown = 0 From tim_one@users.sourceforge.net Sun Oct 13 20:34:50 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Sun, 13 Oct 2002 12:34:50 -0700 Subject: [Spambayes-checkins] spambayes cvcost.py,1.4,1.5 Message-ID: Update of /cvsroot/spambayes/spambayes In directory usw-pr-cvs1:/tmp/cvs-serv18018 Modified Files: cvcost.py Log Message: Documented the state-machine states, and simplified the state-machine exit. 
Index: cvcost.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/cvcost.py,v retrieving revision 1.4 retrieving revision 1.5 diff -C2 -d -r1.4 -r1.5 *** cvcost.py 13 Oct 2002 19:26:31 -0000 1.4 --- cvcost.py 13 Oct 2002 19:34:48 -0000 1.5 *************** *** 72,81 **** --- 72,86 ---- for line in open(fn): if state == 0: + # Searching for start of 'all runs' ham histogram. if line.startswith('-> Ham scores for all runs'): state = 1 + elif state == 1: + # Searching for first bucket in ham histogram. if line.startswith('*'): state = 2 + elif state == 2: + # Parsing ham histogram bucket line. word = line.split() try: *************** *** 85,92 **** except IndexError: state = 3 elif state == 3: if line.startswith('*'): state = 4 ! elif state == 4: word = line.split() try: --- 90,102 ---- except IndexError: state = 3 + elif state == 3: + # Searchin for first bucket in spam histogram. if line.startswith('*'): state = 4 ! ! else: ! assert state == 4 ! # Parsing spam histogram bucket line. word = line.split() try: *************** *** 95,99 **** spamhist.append((v, cnt)) except ValueError: ! state = 5 besthamcut = 50 bestspamcut = 80 --- 105,111 ---- spamhist.append((v, cnt)) except ValueError: ! break ! ! besthamcut = 50 bestspamcut = 80 From tim_one@users.sourceforge.net Mon Oct 14 03:20:38 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Sun, 13 Oct 2002 19:20:38 -0700 Subject: [Spambayes-checkins] spambayes chi2.py,1.3,1.4 classifier.py,1.37,1.38 Message-ID: Update of /cvsroot/spambayes/spambayes In directory usw-pr-cvs1:/tmp/cvs-serv18141 Modified Files: chi2.py classifier.py Log Message: chi2_spamprob(): This is looking better all the time, so put more effort into it: + Switched to Rob Hooft's (S-H+1)/2 combination, instead of S/(S+H). This needed reflecting in various chi2.py utilities too. + Documented what this function is doing. 
+ Sped it, avoiding all but two calls to log() per invocation (saving up to 298 log() calls per invocation). This should also be more accurate, but in a useless way (low-bit errors just don't matter here). Index: chi2.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/chi2.py,v retrieving revision 1.3 retrieving revision 1.4 diff -C2 -d -r1.3 -r1.4 *** chi2.py 13 Oct 2002 18:59:57 -0000 1.3 --- chi2.py 14 Oct 2002 02:20:35 -0000 1.4 *************** *** 107,119 **** #print random ! def judge(ps, ln=_math.log): ! H = S = 0.0 for p in ps: ! S += ln(1.0 - p) ! H += ln(p) n = len(ps) S = 1.0 - chi2Q(-2.0 * S, 2*n) H = 1.0 - chi2Q(-2.0 * H, 2*n) ! return S/(S+H) warp = 0 --- 107,128 ---- #print random ! def judge(ps, ln=_math.log, ln2=_math.log(2), frexp=_math.frexp): ! H = S = 1.0 ! Hexp = Sexp = 0 for p in ps: ! S *= 1.0 - p ! H *= p ! if S < 1e-200: ! S, e = frexp(S) ! Sexp += e ! if H < 1e-200: ! H, e = frexp(H) ! Hexp += e ! S = ln(S) + Sexp * ln2 ! H = ln(H) + Hexp * ln2 n = len(ps) S = 1.0 - chi2Q(-2.0 * S, 2*n) H = 1.0 - chi2Q(-2.0 * H, 2*n) ! return S, H, (S-H + 1.0) / 2.0 warp = 0 *************** *** 125,143 **** h = Hist(20, lo=0.0, hi=1.0) for i in range(5000): ps = [random() for j in range(50)] ! p = judge(ps + [bias] * warp) ! h.add(p) print "Result for random vectors of 50 probs, +", warp, "forced to", bias print h.display() ! def showscore(ps, ln=_math.log): ! H = S = 0.0 for p in ps: ! S += ln(1.0 - p) ! H += ln(p) n = len(ps) --- 134,178 ---- h = Hist(20, lo=0.0, hi=1.0) + s = Hist(20, lo=0.0, hi=1.0) + score = Hist(20, lo=0.0, hi=1.0) for i in range(5000): ps = [random() for j in range(50)] ! s1, h1, score1 = judge(ps + [bias] * warp) ! s.add(s1) ! h.add(h1) ! score.add(score1) print "Result for random vectors of 50 probs, +", warp, "forced to", bias + + # Should be uniformly distributed on all-random data. print + print 'H', h.display() ! # Should be uniformly distributed on all-random data. ! 
print ! print 'S', ! s.display() ! ! # Distribution doesn't really matter. ! print ! print '(S-H+1)/2', ! score.display() ! ! def showscore(ps, ln=_math.log, ln2=_math.log(2), frexp=_math.frexp): ! H = S = 1.0 ! Hexp = Sexp = 0 for p in ps: ! S *= 1.0 - p ! H *= p ! if S < 1e-200: ! S, e = frexp(S) ! Sexp += e ! if H < 1e-200: ! H, e = frexp(H) ! Hexp += e ! S = ln(S) + Sexp * ln2 ! H = ln(H) + Hexp * ln2 n = len(ps) *************** *** 149,156 **** S = 1.0 - probS H = 1.0 - probH ! score = S/(S+H) print "spam prob", S print " ham prob", H ! print " S/(S+H)", score if __name__ == '__main__': --- 184,191 ---- S = 1.0 - probS H = 1.0 - probH ! score = (S-H + 1.0) / 2.0 print "spam prob", S print " ham prob", H ! print "(S-H+1)/2", score if __name__ == '__main__': Index: classifier.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/classifier.py,v retrieving revision 1.37 retrieving revision 1.38 diff -C2 -d -r1.37 -r1.38 *** classifier.py 13 Oct 2002 19:06:08 -0000 1.37 --- classifier.py 14 Oct 2002 02:20:35 -0000 1.38 *************** *** 24,33 **** --- 24,37 ---- # This implementation is due to Tim Peters et alia. + import math import time from sets import Set from Options import options + if options.use_chi_squared_combining: from chi2 import chi2Q + LN2 = math.log(2) + if options.use_z_combining: from chi2 import normP, normIP *************** *** 453,456 **** --- 457,474 ---- spamprob = tim_spamprob + # Across vectors of length n, containing random uniformly-distributed + # probabilities, -2*sum(ln(p_i)) follows the chi-squared distribution + # with 2*n degrees of freedom. This has been proven (in some + # appropriate sense) to be the most sensitive possible test for + # rejecting the hypothesis that a vector of probabilities is uniformly + # distributed. Gary Robinson's original scheme was monotonic *with* + # this test, but skipped the details. 
Turns out that getting closer + # to the theoretical roots gives a much sharper classification, with + # a very small (in # of msgs), but also very broad (in range of scores), + # "middle ground", where most of the mistakes live. In particular, + # this scheme seems immune to all forms of "cancellation disease": if + # there are many strong ham *and* spam clues, this reliably scores + # close to 0.5. Most other schemes are extremely certain then -- and + # often wrong. def chi2_spamprob(self, wordstream, evidence=False): """Return best-guess probability that wordstream is spam. *************** *** 464,476 **** """ ! from math import log as ln - H = S = 0.0 clues = self._getclues(wordstream) for prob, word, record in clues: if record is not None: # else wordinfo doesn't know about it record.killcount += 1 ! S += ln(1.0 - prob) ! H += ln(prob) n = len(clues) --- 482,519 ---- """ ! from math import frexp, log as ln ! ! # We compute two chi-squared statistics, one for ham and one for ! # spam. The sum-of-the-logs business is more sensitive to probs ! # near 0 than to probs near 1, so the spam measure uses 1-p (so ! # that high-spamprob words have greatest effect), and the ham ! # measure uses p directly (so that lo-spamprob words have greatest ! # effect). ! # ! # For optimization, sum-of-logs == log-of-product, and f.p. ! # multiplication is a lot cheaper than calling ln(). It's easy ! # to underflow to 0.0, though, so we simulate unbounded dynamic ! # range via frexp. The real product H = this H * 2**Hexp, and ! # likewise the real product S = this S * 2**Sexp. ! H = S = 1.0 ! Hexp = Sexp = 0 clues = self._getclues(wordstream) for prob, word, record in clues: if record is not None: # else wordinfo doesn't know about it record.killcount += 1 ! S *= 1.0 - prob ! H *= prob ! if S < 1e-200: # prevent underflow ! S, e = frexp(S) ! Sexp += e ! if H < 1e-200: # prevent underflow ! H, e = frexp(H) ! Hexp += e ! ! # Compute the natural log of the product = sum of the logs: ! 
# ln(x * 2**i) = ln(x) + i * ln(2). ! S = ln(S) + Sexp * LN2 ! H = ln(H) + Hexp * LN2 n = len(clues) *************** *** 478,482 **** S = 1.0 - chi2Q(-2.0 * S, 2*n) H = 1.0 - chi2Q(-2.0 * H, 2*n) ! prob = S/(S+H) else: prob = 0.5 --- 521,533 ---- S = 1.0 - chi2Q(-2.0 * S, 2*n) H = 1.0 - chi2Q(-2.0 * H, 2*n) ! ! # How to combine these into a single spam score? We originally ! # used (S-H)/(S+H) scaled into [0., 1.], which equals S/(S+H). A ! # systematic problem is that we could end up being near-certain ! # a thing was (for example) spam, even if S was small, provided ! # that H was much smaller. ! # Rob Hooft stared at these problems and invented the measure ! # we use now, the simpler S-H, scaled into [0., 1.]. ! prob = (S-H + 1.0) / 2.0 else: prob = 0.5 From tim_one@users.sourceforge.net Mon Oct 14 18:14:07 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Mon, 14 Oct 2002 10:14:07 -0700 Subject: [Spambayes-checkins] spambayes Options.py,1.47,1.48 TestDriver.py,1.20,1.21 Message-ID: Update of /cvsroot/spambayes/spambayes In directory usw-pr-cvs1:/tmp/cvs-serv2819 Modified Files: Options.py TestDriver.py Log Message: Redid the histogram analysis to do the same kind of total-cost analysis done by cvcost.py. Note that the default nbuckets has been boosted to 200. Index: Options.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Options.py,v retrieving revision 1.47 retrieving revision 1.48 diff -C2 -d -r1.47 -r1.48 *** Options.py 13 Oct 2002 18:59:56 -0000 1.47 --- Options.py 14 Oct 2002 17:13:47 -0000 1.48 *************** *** 109,130 **** # Number of buckets in histograms. ! nbuckets: 40 show_histograms: True # After the display of a ham+spam histogram pair, you can get a listing of ! # all the cutoff values (coinciding histogram bucket boundaries) that # minimize # ! 
# best_cutoff_fp_weight * (# false positives) + (# false negatives) # - # By default, best_cutoff_fp_weight is 1, and so the cutoffs that miminize - # the total number of misclassified messages (fp+fn) are shown. If you hate - # fp more than fn, set the weight to something larger than 1. For example, - # if you're willing to endure 100 false negatives to save 1 false positive, - # set it to 100. # Note: You may wish to increase nbuckets, to give this scheme more cutoff # values to analyze. compute_best_cutoffs_from_histograms: True ! best_cutoff_fp_weight: 1 # Display spam when --- 109,139 ---- # Number of buckets in histograms. ! nbuckets: 200 show_histograms: True # After the display of a ham+spam histogram pair, you can get a listing of ! # all the cutoff values (coinciding with histogram bucket boundaries) that # minimize # ! # best_cutoff_fp_weight * (# false positives) + ! # best_cutoff_fn_weight * (# false negatives) + ! # best_cutoff_unsure_weight * (# unsure msgs) ! # ! # This displays two cutoffs: hamc and spamc, where ! # ! # 0.0 <= hamc <= spamc <= 1.0 ! # ! # The idea is that if something scores < hamc, it's called ham; if ! # something scores >= spamc, it's called spam; and everything else is ! # called "I'm not sure" -- the middle ground. ! # ! # Note that cvcost.py does a similar analysis. # # Note: You may wish to increase nbuckets, to give this scheme more cutoff # values to analyze. compute_best_cutoffs_from_histograms: True ! best_cutoff_fp_weight: 10.00 ! best_cutoff_fn_weight: 1.00 ! 
best_cutoff_unsure_weight: 0.20 # Display spam when *************** *** 314,317 **** --- 323,328 ---- 'compute_best_cutoffs_from_histograms': boolean_cracker, 'best_cutoff_fp_weight': float_cracker, + 'best_cutoff_fn_weight': float_cracker, + 'best_cutoff_unsure_weight': float_cracker, }, 'CV Driver': {'build_each_classifier_from_scratch': boolean_cracker, Index: TestDriver.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/TestDriver.py,v retrieving revision 1.20 retrieving revision 1.21 diff -C2 -d -r1.20 -r1.21 *** TestDriver.py 7 Oct 2002 04:41:58 -0000 1.20 --- TestDriver.py 14 Oct 2002 17:13:51 -0000 1.21 *************** *** 45,49 **** return ! # Figure out "the best" spam cutoff point, meaning the one that minimizes # the total number of misclassified msgs (other definitions are # certainly possible!). --- 45,51 ---- return ! # Figure out "the best" ham & spam cutoff points, meaning the ones that ! # minimize ! # num_fp * fp_weight + num_fn * fn_weight + num_unsure * unsure_weight # the total number of misclassified msgs (other definitions are # certainly possible!). *************** *** 52,86 **** # and every ham is a false positive. assert ham.nbuckets == spam.nbuckets ! fpw = options.best_cutoff_fp_weight ! fp = ham.n ! fn = 0 ! best_total = fpw * fp + fn ! bests = [(0, fp, fn)] ! for i in range(nbuckets): ! # When moving the cutoff beyond bucket i, the ham in bucket i ! # are redeemed, and the spam in bucket i become false negatives. ! fp -= ham.buckets[i] ! fn += spam.buckets[i] ! total = fpw * fp + fn ! if total <= best_total: ! if total < best_total: ! best_total = total ! bests = [] ! bests.append((i+1, fp, fn)) ! assert fp == 0 ! assert fn == spam.n ! i, fp, fn = bests.pop(0) ! print '-> best cutoff for', tag, float(i) / nbuckets ! print '-> with weighted total %g*%d fp + %d fn = %g' % ( ! fpw, fp, fn, best_total) ! print '-> fp rate %.3g%% fn rate %.3g%%' % ( ! 
fp * 1e2 / ham.n, fn * 1e2 / spam.n) ! for i, fp, fn in bests: ! print ('-> matched at %g with %d fp & %d fn; ' ! 'fp rate %.3g%%; fn rate %.3g%%' % ( ! float(i) / ham.nbuckets, fp, fn, ! fp * 1e2 / ham.n, fn * 1e2 / spam.n)) def printmsg(msg, prob, clues): --- 54,114 ---- # and every ham is a false positive. assert ham.nbuckets == spam.nbuckets ! n = ham.nbuckets ! FPW = options.best_cutoff_fp_weight ! FNW = options.best_cutoff_fn_weight ! UNW = options.best_cutoff_unsure_weight ! # Get running totals: {h,s}total[i] is # of ham/spam below bucket i ! htotal = [0] * (n+1) ! stotal = [0] * (n+1) ! for i in range(1, n+1): ! htotal[i] = htotal[i-1] + ham.buckets[i-1] ! stotal[i] = stotal[i-1] + spam.buckets[i-1] ! assert htotal[-1] == ham.n ! assert stotal[-1] == spam.n ! ! best_cost = 1e200 # infinity ! bests = [] # best h and s cutoffs ! ! for h in range(n+1): ! num_fn = stotal[h] ! fn_cost = num_fn * FNW ! for s in xrange(h, n+1): ! # ham 0:h correct ! # h:s unsure ! # s: FP ! # spam 0:h FN ! # h:s unsure ! # s: correct ! num_fp = htotal[-1] - htotal[s] ! num_un = htotal[s] - htotal[h] + stotal[s] - stotal[h] ! cost = num_fp * FPW + fn_cost + num_un * UNW ! if cost <= best_cost: ! if cost < best_cost: ! best_cost = cost ! bests = [] ! bests.append((h, s)) ! ! print '-> best cost $%.2f' % best_cost ! print '-> per-fp cost $%.2f; per-fn cost $%.2f; per-unsure cost $%.2f' % ( ! FPW, FNW, UNW) ! ! if len(bests) > 1: ! print '-> achieved at', len(bests), 'cutoff pairs' ! info = [('smallest ham & spam cutoffs', bests[0]), ! ('largest ham & spam cutoffs', bests[-1])] ! else: ! 
info = [('achieved at ham & spam cutoffs', bests[0])] + for tag, (h, s) in info: + print '-> %s %g & %g' % (tag, float(h)/n, float(s)/n) + num_fn = stotal[h] + num_fp = htotal[-1] - htotal[s] + num_unh = htotal[s] - htotal[h] + num_uns = stotal[s] - stotal[h] + print '-> fp %d; fn %d; unsure ham %d; unsure spam %d' % ( + num_fp, num_fn, num_unh, num_uns) + print '-> fp rate %.3g%%; fn rate %.3g%%' % ( + num_fp*1e2 / ham.n, num_fn*1e2 / spam.n) def printmsg(msg, prob, clues): From tim_one@users.sourceforge.net Mon Oct 14 18:40:44 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Mon, 14 Oct 2002 10:40:44 -0700 Subject: [Spambayes-checkins] spambayes TestDriver.py,1.21,1.22 Message-ID: Update of /cvsroot/spambayes/spambayes In directory usw-pr-cvs1:/tmp/cvs-serv12667 Modified Files: TestDriver.py Log Message: Add %-unsure rate to histogram analysis output. Index: TestDriver.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/TestDriver.py,v retrieving revision 1.21 retrieving revision 1.22 diff -C2 -d -r1.21 -r1.22 *** TestDriver.py 14 Oct 2002 17:13:51 -0000 1.21 --- TestDriver.py 14 Oct 2002 17:40:39 -0000 1.22 *************** *** 109,114 **** print '-> fp %d; fn %d; unsure ham %d; unsure spam %d' % ( num_fp, num_fn, num_unh, num_uns) ! print '-> fp rate %.3g%%; fn rate %.3g%%' % ( ! num_fp*1e2 / ham.n, num_fn*1e2 / spam.n) def printmsg(msg, prob, clues): --- 109,115 ---- print '-> fp %d; fn %d; unsure ham %d; unsure spam %d' % ( num_fp, num_fn, num_unh, num_uns) ! print '-> fp rate %.3g%%; fn rate %.3g%%; unsure rate %.3g%%' % ( ! num_fp*1e2 / ham.n, num_fn*1e2 / spam.n, ! 
(num_unh + num_uns)*1e2 / (ham.n + spam.n)) def printmsg(msg, prob, clues): From tim_one@users.sourceforge.net Mon Oct 14 19:04:59 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Mon, 14 Oct 2002 11:04:59 -0700 Subject: [Spambayes-checkins] spambayes TestDriver.py,1.22,1.23 Message-ID: Update of /cvsroot/spambayes/spambayes In directory usw-pr-cvs1:/tmp/cvs-serv21633 Modified Files: TestDriver.py Log Message: Add tags ('for all runs', etc) to best_cost output lines. Index: TestDriver.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/TestDriver.py,v retrieving revision 1.22 retrieving revision 1.23 diff -C2 -d -r1.22 -r1.23 *** TestDriver.py 14 Oct 2002 17:40:39 -0000 1.22 --- TestDriver.py 14 Oct 2002 18:04:56 -0000 1.23 *************** *** 90,94 **** bests.append((h, s)) ! print '-> best cost $%.2f' % best_cost print '-> per-fp cost $%.2f; per-fn cost $%.2f; per-unsure cost $%.2f' % ( FPW, FNW, UNW) --- 90,94 ---- bests.append((h, s)) ! print '-> best cost for %s $%.2f' % (tag, best_cost) print '-> per-fp cost $%.2f; per-fn cost $%.2f; per-unsure cost $%.2f' % ( FPW, FNW, UNW) From tim_one@users.sourceforge.net Tue Oct 15 01:16:57 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Mon, 14 Oct 2002 17:16:57 -0700 Subject: [Spambayes-checkins] spambayes chi2.py,1.4,1.5 Message-ID: Update of /cvsroot/spambayes/spambayes In directory usw-pr-cvs1:/tmp/cvs-serv473 Modified Files: chi2.py Log Message: normIP(): approximately double the number of accurate digits. Index: chi2.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/chi2.py,v retrieving revision 1.4 retrieving revision 1.5 diff -C2 -d -r1.4 -r1.5 *** chi2.py 14 Oct 2002 02:20:35 -0000 1.4 --- chi2.py 15 Oct 2002 00:16:55 -0000 1.5 *************** *** 80,84 **** Must have 0.0 <= p <= 1.0. """ ! return normIQ(1.0 - p) def main(): --- 80,86 ---- Must have 0.0 <= p <= 1.0. """ ! 
z = normIQ(1.0 - p) ! # One Newton step should double the # of good digits. ! return z + (p - normP(z)) / normZ(z) def main(): From tim_one@users.sourceforge.net Wed Oct 16 22:07:07 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Wed, 16 Oct 2002 14:07:07 -0700 Subject: [Spambayes-checkins] spambayes chi2.py,1.5,1.6 Message-ID: Update of /cvsroot/spambayes/spambayes In directory usw-pr-cvs1:/tmp/cvs-serv5421 Modified Files: chi2.py Log Message: chi2Q(): Slight error in the platform exp(), and accumulated roundoff errors, could cause the result of this to spill a few ULP above 1.0, for large x2 and large v. This can in turn lead to "slightly negative" final spam scores (like -1.2e-15). Repaired that; negative final scores should never happen now. Index: chi2.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/chi2.py,v retrieving revision 1.5 retrieving revision 1.6 diff -C2 -d -r1.5 -r1.6 *** chi2.py 15 Oct 2002 00:16:55 -0000 1.5 --- chi2.py 16 Oct 2002 21:07:05 -0000 1.6 *************** *** 1,5 **** import math as _math ! def chi2Q(x2, v, exp=_math.exp): """Return prob(chisq >= x2, with v degrees of freedom). --- 1,5 ---- import math as _math ! def chi2Q(x2, v, exp=_math.exp, min=min): """Return prob(chisq >= x2, with v degrees of freedom). *************** *** 7,10 **** --- 7,11 ---- """ assert v & 1 == 0 + # XXX Is x2 is very large, exp(-m) will underflow to 0. m = x2 / 2.0 sum = term = exp(-m) *************** *** 12,16 **** term *= m / i sum += term ! return sum def normZ(z, sqrt2pi=_math.sqrt(2.0*_math.pi), exp=_math.exp): --- 13,21 ---- term *= m / i sum += term ! # With small x2 and large v, accumulated roundoff error, plus error in ! # the platform exp(), can cause this to spill a few ULP above 1.0. For ! # example, chi2Q(100, 300) on my box has sum == 1.0 + 2.0**-52 at this ! # point. Returning a value even a teensy bit over 1.0 is no good. ! 
return min(sum, 1.0) def normZ(z, sqrt2pi=_math.sqrt(2.0*_math.pi), exp=_math.exp): From tim_one@users.sourceforge.net Wed Oct 16 22:31:21 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Wed, 16 Oct 2002 14:31:21 -0700 Subject: [Spambayes-checkins] spambayes chi2.py,1.6,1.7 Message-ID: Update of /cvsroot/spambayes/spambayes In directory usw-pr-cvs1:/tmp/cvs-serv27929 Modified Files: chi2.py Log Message: Typo repair in a comment. Index: chi2.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/chi2.py,v retrieving revision 1.6 retrieving revision 1.7 diff -C2 -d -r1.6 -r1.7 *** chi2.py 16 Oct 2002 21:07:05 -0000 1.6 --- chi2.py 16 Oct 2002 21:31:19 -0000 1.7 *************** *** 7,11 **** """ assert v & 1 == 0 ! # XXX Is x2 is very large, exp(-m) will underflow to 0. m = x2 / 2.0 sum = term = exp(-m) --- 7,11 ---- """ assert v & 1 == 0 ! # XXX If x2 is very large, exp(-m) will underflow to 0. m = x2 / 2.0 sum = term = exp(-m) From tim_one@users.sourceforge.net Thu Oct 17 07:23:16 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Wed, 16 Oct 2002 23:23:16 -0700 Subject: [Spambayes-checkins] spambayes Options.py,1.48,1.49 TestDriver.py,1.23,1.24 Tester.py,1.5,1.6 Message-ID: Update of /cvsroot/spambayes/spambayes In directory usw-pr-cvs1:/tmp/cvs-serv1588 Modified Files: Options.py TestDriver.py Tester.py Log Message: Adapted from a patch by T. Alexander Popiel, teaching Tester and TestDriver about middle grounds. Note that there's a new option ham_cutoff! The range ham_cutoff:spam_cutoff defines the middle ground. Also repaired Tester's doctest, which started failing when we stopped counting words multiple times per msg in training. 
Index: Options.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Options.py,v retrieving revision 1.48 retrieving revision 1.49 diff -C2 -d -r1.48 -r1.49 *** Options.py 14 Oct 2002 17:13:47 -0000 1.48 --- Options.py 17 Oct 2002 06:23:13 -0000 1.49 *************** *** 103,109 **** # These control various displays in class TestDriver.Driver, and Tester.Test. ! # A message is considered spam iff it scores greater than spam_cutoff. ! # This is corpus-dependent, and values into the .600's have been known # to work best on some data. spam_cutoff: 0.560 --- 103,120 ---- # These control various displays in class TestDriver.Driver, and Tester.Test. ! # spam_cutoff and ham_cutoff are used in Python slice sense: ! # A msg is considered ham if its score is in 0:ham_cutoff ! # A msg is considered unsure if its score is in ham_cutoff:spam_cutoff ! # A msg is considered spam if its score is in spam_cutoff: ! # ! # So it's unsure iff ham_cutoff <= score < spam_cutoff. ! # For a binary classifier, make ham_cutoff == spam_cutoff. ! # ham_cutoff > spam_cutoff doesn't make sense. ! # ! # The defaults are for the all-default Robinson scheme, which makes a ! # binary decision with no middle ground. The precise value that works ! # best is corpus-dependent, and values into the .600's have been known # to work best on some data. 
+ ham_cutoff: 0.560 spam_cutoff: 0.560 *************** *** 147,150 **** --- 158,162 ---- show_false_positives: True show_false_negatives: False + show_unsure: False # Near the end of Driver.test(), you can get a listing of the 'best *************** *** 312,315 **** --- 324,328 ---- 'show_false_positives': boolean_cracker, 'show_false_negatives': boolean_cracker, + 'show_unsure': boolean_cracker, 'show_histograms': boolean_cracker, 'show_best_discriminators': int_cracker, *************** *** 318,321 **** --- 331,335 ---- 'pickle_basename': string_cracker, 'show_charlimit': int_cracker, + 'ham_cutoff': float_cracker, 'spam_cutoff': float_cracker, 'spam_directories': string_cracker, Index: TestDriver.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/TestDriver.py,v retrieving revision 1.23 retrieving revision 1.24 diff -C2 -d -r1.23 -r1.24 *** TestDriver.py 14 Oct 2002 18:04:56 -0000 1.23 --- TestDriver.py 17 Oct 2002 06:23:13 -0000 1.24 *************** *** 129,132 **** --- 129,133 ---- self.falsepos = Set() self.falseneg = Set() + self.unsure = Set() self.global_ham_hist = Hist() self.global_spam_hist = Hist() *************** *** 187,190 **** --- 188,196 ---- if options.show_histograms: printhist("all runs:", self.global_ham_hist, self.global_spam_hist) + + print "-> cost for all runs: $%.2f" % ( + len(self.falsepos) * options.best_cutoff_fp_weight + + len(self.falseneg) * options.best_cutoff_fn_weight + + len(self.unsure) * options.best_cutoff_unsure_weight) if options.save_histogram_pickles: *************** *** 230,233 **** --- 236,245 ---- print "-> false positive %:", t.false_positive_rate() print "-> false negative %:", t.false_negative_rate() + print "-> unsure %:", t.unsure_rate() + print "-> cost: $%.2f" % ( + t.nham_wrong * options.best_cutoff_fp_weight + + t.nspam_wrong * options.best_cutoff_fn_weight + + (t.nham_unsure + t.nspam_unsure) * + options.best_cutoff_unsure_weight) newfpos = 
Set(t.false_positives()) - self.falsepos *************** *** 251,254 **** --- 263,278 ---- newfneg = () for e in newfneg: + print '*' * 78 + prob, clues = c.spamprob(e, True) + printmsg(e, prob, clues) + + newunsure = Set(t.unsures()) - self.unsure + self.unsure |= newunsure + print "-> %d new unsure" % len(newunsure) + if newunsure: + print " new unsure:", [e.tag for e in newunsure] + if not options.show_unsure: + newunsure = () + for e in newunsure: print '*' * 78 prob, clues = c.spamprob(e, True) Index: Tester.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Tester.py,v retrieving revision 1.5 retrieving revision 1.6 diff -C2 -d -r1.5 -r1.6 *** Tester.py 27 Sep 2002 21:18:18 -0000 1.5 --- Tester.py 17 Oct 2002 06:23:13 -0000 1.6 *************** *** 36,45 **** --- 36,48 ---- self.nham_right = 0 self.nham_wrong = 0 + self.nham_unsure = 0; self.nspam_right = 0 self.nspam_wrong = 0 + self.nspam_unsure = 0; # Lists of bad predictions. self.ham_wrong_examples = [] # False positives: ham called spam. self.spam_wrong_examples = [] # False negatives: spam called ham. + self.unsure_examples = [] # ham and spam in middle ground # Train the classifier on streams of ham and spam. Updates probabilities *************** *** 85,107 **** if callback: callback(example, prob) ! is_spam_guessed = prob > options.spam_cutoff ! correct = is_spam_guessed == is_spam if is_spam: self.nspam_tested += 1 ! if correct: self.nspam_right += 1 ! else: self.nspam_wrong += 1 self.spam_wrong_examples.append(example) else: self.nham_tested += 1 ! if correct: self.nham_right += 1 ! else: self.nham_wrong += 1 self.ham_wrong_examples.append(example) ! assert self.nham_right + self.nham_wrong == self.nham_tested ! assert self.nspam_right + self.nspam_wrong == self.nspam_tested def false_positive_rate(self): --- 88,118 ---- if callback: callback(example, prob) ! is_ham_guessed = prob < options.ham_cutoff ! 
is_spam_guessed = prob >= options.spam_cutoff if is_spam: self.nspam_tested += 1 ! if is_spam_guessed: self.nspam_right += 1 ! elif is_ham_guessed: self.nspam_wrong += 1 self.spam_wrong_examples.append(example) + else: + self.nspam_unsure += 1 + self.unsure_examples.append(example) else: self.nham_tested += 1 ! if is_ham_guessed: self.nham_right += 1 ! elif is_spam_guessed: self.nham_wrong += 1 self.ham_wrong_examples.append(example) + else: + self.nham_unsure += 1 + self.unsure_examples.append(example) ! assert (self.nham_right + self.nham_wrong + self.nham_unsure == ! self.nham_tested) ! assert (self.nspam_right + self.nspam_wrong + self.nspam_unsure == ! self.nspam_tested) def false_positive_rate(self): *************** *** 113,116 **** --- 124,131 ---- return self.nspam_wrong * 1e2 / self.nspam_tested + def unsure_rate(self): + return ((self.nham_unsure + self.nspam_unsure) * 1e2 / + (self.nham_tested + self.nspam_tested)) + def false_positives(self): return self.ham_wrong_examples *************** *** 119,122 **** --- 134,139 ---- return self.spam_wrong_examples + def unsures(self): + return self.unsure_examples class _Example: *************** *** 129,146 **** _easy_test = """ >>> from classifier import Bayes ! >>> good1 = _Example('', ['a', 'b', 'c'] * 10) ! >>> good2 = _Example('', ['a', 'b'] * 10) ! >>> bad1 = _Example('', ['d'] * 10) >>> t = Test(Bayes()) >>> t.train([good1, good2], [bad1]) >>> t.predict([_Example('goodham', ['a', 'b']), ! ... _Example('badham', ['d']) ... ], False) ! >>> t.predict([_Example('goodspam', ['d', 'd']), ! ... _Example('badspam1', ['c']), ! ... _Example('badspam2', ['a'] * 15 + ['d'] * 1000), ! ... _Example('badspam3', ['d', 'a', 'b', 'c']) ... ], True) --- 146,165 ---- _easy_test = """ >>> from classifier import Bayes + >>> from Options import options + >>> options.ham_cutoff = options.spam_cutoff = 0.5 ! >>> good1 = _Example('', ['a', 'b', 'c']) ! >>> good2 = _Example('', ['a', 'b']) ! 
>>> bad1 = _Example('', ['c', 'd']) >>> t = Test(Bayes()) >>> t.train([good1, good2], [bad1]) >>> t.predict([_Example('goodham', ['a', 'b']), ! ... _Example('badham', ['d']) # FP ... ], False) ! >>> t.predict([_Example('goodspam', ['d']), ! ... _Example('badspam1', ['a']), # FN ! ... _Example('badspam2', ['a', 'b']), # FN ! ... _Example('badspam3', ['d', 'a', 'b']) # FN ... ], True) *************** *** 162,165 **** --- 181,189 ---- >>> [e.name for e in t.false_negatives()] ['badspam1', 'badspam2', 'badspam3'] + + >>> [e.name for e in t.unsures()] + [] + >>> t.unsure_rate() + 0.0 """ From npickett@users.sourceforge.net Thu Oct 17 19:19:44 2002 From: npickett@users.sourceforge.net (Neale Pickett) Date: Thu, 17 Oct 2002 11:19:44 -0700 Subject: [Spambayes-checkins] spambayes hammiecli.py,NONE,1.1 hammiesrv.py,1.5,1.6 Message-ID: Update of /cvsroot/spambayes/spambayes In directory usw-pr-cvs1:/tmp/cvs-serv10132 Modified Files: hammiesrv.py Added Files: hammiecli.py Log Message: * hammiesrv actually works now :) * hammiecli uses it --- NEW FILE: hammiecli.py --- #! /usr/bin/env python """A client for hammiesrv. Just feed it your mail on stdin, and it spits out the same message with a new X-Hammie-Disposition header. """ import xmlrpclib import sys RPCBASE="http://localhost:65000" def main(): msg = sys.stdin.read() try: x = xmlrpclib.ServerProxy(RPCBASE) m = xmlrpclib.Binary(msg) out = x.filter(m) print out except: if __debug__: import traceback traceback.print_exc() print msg if __name__ == "__main__": main() Index: hammiesrv.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/hammiesrv.py,v retrieving revision 1.5 retrieving revision 1.6 diff -C2 -d -r1.5 -r1.6 *** hammiesrv.py 8 Oct 2002 17:38:21 -0000 1.5 --- hammiesrv.py 17 Oct 2002 18:19:41 -0000 1.6 *************** *** 36,52 **** class XMLHammie(hammie.Hammie): ! def score(self, msg, **kwargs): try: msg = msg.data except AttributeError: pass ! 
return hammie.Hammie.score(self, msg, **kwargs) ! def filter(self, msg, **kwargs): try: msg = msg.data except AttributeError: pass ! return hammie.Hammie.filter(self, msg, **kwargs) --- 36,66 ---- class XMLHammie(hammie.Hammie): ! def score(self, msg, *extra): try: msg = msg.data except AttributeError: pass ! score = hammie.Hammie.score ! if len(extra) == 0: ! return score(self, msg) ! elif len(extra) == 1: ! return score(self, msg, extra[0]) ! else: ! raise TypeError("Wrong number of arguments") ! def filter(self, msg, *extra): try: msg = msg.data except AttributeError: pass ! filter = hammie.Hammie.filter ! if len(extra) == 0: ! return filter(self, msg) ! elif len(extra) == 1: ! return filter(self, msg, extra[0]) ! elif len(extra) == 2: ! return filter(self, msg, extra[0], extra[1]) ! else: ! raise TypeError("Wrong number of arguments") *************** *** 72,75 **** --- 86,90 ---- response = (response,) except: + traceback.print_exc() # report exception back to server response = xmlrpclib.dumps( From npickett@users.sourceforge.net Thu Oct 17 22:30:16 2002 From: npickett@users.sourceforge.net (Neale Pickett) Date: Thu, 17 Oct 2002 14:30:16 -0700 Subject: [Spambayes-checkins] spambayes hammiesrv.py,1.6,1.7 Message-ID: Update of /cvsroot/spambayes/spambayes In directory usw-pr-cvs1:/tmp/cvs-serv24789 Modified Files: hammiesrv.py Log Message: * Changed icky optional argument checking to nice *extra syntax (thanks, Guido!) Index: hammiesrv.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/hammiesrv.py,v retrieving revision 1.6 retrieving revision 1.7 diff -C2 -d -r1.6 -r1.7 *** hammiesrv.py 17 Oct 2002 18:19:41 -0000 1.6 --- hammiesrv.py 17 Oct 2002 21:30:13 -0000 1.7 *************** *** 41,51 **** except AttributeError: pass ! score = hammie.Hammie.score ! if len(extra) == 0: ! return score(self, msg) ! elif len(extra) == 1: ! return score(self, msg, extra[0]) ! else: ! 
raise TypeError("Wrong number of arguments") def filter(self, msg, *extra): --- 41,45 ---- except AttributeError: pass ! return hammie.Hammie.score(self, msg, *extra) def filter(self, msg, *extra): *************** *** 54,66 **** except AttributeError: pass ! filter = hammie.Hammie.filter ! if len(extra) == 0: ! return filter(self, msg) ! elif len(extra) == 1: ! return filter(self, msg, extra[0]) ! elif len(extra) == 2: ! return filter(self, msg, extra[0], extra[1]) ! else: ! raise TypeError("Wrong number of arguments") --- 48,52 ---- except AttributeError: pass ! return hammie.Hammie.filter(self, msg, *extra) From tim_one@users.sourceforge.net Fri Oct 18 06:04:49 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Thu, 17 Oct 2002 22:04:49 -0700 Subject: [Spambayes-checkins] spambayes TestDriver.py,1.24,1.25 Message-ID: Update of /cvsroot/spambayes/spambayes In directory usw-pr-cvs1:/tmp/cvs-serv16567 Modified Files: TestDriver.py Log Message: alldone(): More code from T. Alexander Popiel, to display lines for all-runs {fn, fp, unsure} counts and rates. Index: TestDriver.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/TestDriver.py,v retrieving revision 1.24 retrieving revision 1.25 diff -C2 -d -r1.24 -r1.25 *** TestDriver.py 17 Oct 2002 06:23:13 -0000 1.24 --- TestDriver.py 18 Oct 2002 05:04:46 -0000 1.25 *************** *** 188,196 **** if options.show_histograms: printhist("all runs:", self.global_ham_hist, self.global_spam_hist) ! ! print "-> cost for all runs: $%.2f" % ( ! len(self.falsepos) * options.best_cutoff_fp_weight + ! len(self.falseneg) * options.best_cutoff_fn_weight + ! len(self.unsure) * options.best_cutoff_unsure_weight) if options.save_histogram_pickles: --- 188,207 ---- if options.show_histograms: printhist("all runs:", self.global_ham_hist, self.global_spam_hist) ! ! nham = self.global_ham_hist.n ! nspam = self.global_spam_hist.n ! nfp = len(self.falsepos) ! 
nfn = len(self.falseneg) ! nun = len(self.unsure) ! print "-> all runs false positives:", nfp ! print "-> all runs false negatives:", nfn ! print "-> all runs unsure:", nun ! print "-> all runs false positive %:", (nfp * 1e2 / nham) ! print "-> all runs false negative %:", (nfn * 1e2 / nspam) ! print "-> all runs unsure %:", (nun * 1e2 / (nham + nspam)) ! print "-> all runs cost: $%.2f" % ( ! nfp * options.best_cutoff_fp_weight + ! nfn * options.best_cutoff_fn_weight + ! nun * options.best_cutoff_unsure_weight) if options.save_histogram_pickles: From tim_one@users.sourceforge.net Fri Oct 18 06:44:07 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Thu, 17 Oct 2002 22:44:07 -0700 Subject: [Spambayes-checkins] spambayes Options.py,1.49,1.50 README.txt,1.37,1.38 TestDriver.py,1.25,1.26 classifier.py,1.38,1.39 clgen.py,1.1,NONE clpik.py,1.1,NONE rmspik.py,1.4,NONE Message-ID: Update of /cvsroot/spambayes/spambayes In directory usw-pr-cvs1:/tmp/cvs-serv25258 Modified Files: Options.py README.txt TestDriver.py classifier.py Removed Files: clgen.py clpik.py rmspik.py Log Message: Removed 4 combining schemes: use_central_limit use_central_limit2 use_central_limit3 use_z_combining The central limit schemes aimed at getting a useful middle ground, but chi-combining has proved to work better for that. The chi scheme doesn't require the troublesome "third training pass" either. z-combining was more like chi-combining, and worked well, but not as well as chi- combining; z-combining proved vulnerable to "cancellation disease", to which chi-combining seems all but immune. Removed supporting option zscore_ratio_cutoff. Removed various data attributes of class Bayes, unique to the central limit schemes. __getstate__ and __setstate__ had never been updated to save or restore them, so old pickles will still work fine. Removed method Bayes.compute_population_stats(), which constituted "the third training pass" unique to the central limit schemes. 
There's scant chance this will ever be needed again, since it was never clear how to make the 3-pass schemes practical over time. Gave the still-default combining scheme's method the name gary_spamprob, and made spamprob an alias for that by default. This allows to name each combining scheme explicitly in case you want to test using more than one (the others are named tim_spamprob and chi2_spamprob). In gary_spamprob, simplified the scaling of (P-Q)/(P+Q) into 0 .. 1, replacing the whole shebang with P/(P+Q). Same result, but a little faster. Removed files clgen.py, clpik.py, and rmspik.py. These were data generation and analysis tools unique to the central limit schemes. Index: Options.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Options.py,v retrieving revision 1.49 retrieving revision 1.50 diff -C2 -d -r1.49 -r1.50 *** Options.py 17 Oct 2002 06:23:13 -0000 1.49 --- Options.py 18 Oct 2002 05:44:04 -0000 1.50 *************** *** 199,209 **** # on. By default, it does this in a clever way, learning *and* unlearning # sets as it goes along, so that it never needs to train on N-1 sets in one ! # gulp after the first time. However, that can't always be done: in ! # particular, the central-limit schemes can't unlearn incrementally, and can ! # learn incrementally only via a form of cheating whose bad effects overall ! # aren't yet known. ! # So when desiring to run a central-limit test, set ! # build_each_classifier_from_scratch to true. This gives correct results, ! # but runs much slower than a CV driver usually runs. build_each_classifier_from_scratch: False --- 199,205 ---- # on. By default, it does this in a clever way, learning *and* unlearning # sets as it goes along, so that it never needs to train on N-1 sets in one ! # gulp after the first time. Setting this option true forces "one gulp ! # from-scratch" training every time. There used to be a set of combining ! 
# schemes that needed this, but now it's just in case you're paranoid . build_each_classifier_from_scratch: False *************** *** 238,253 **** robinson_minimum_prob_strength: 0.1 ! ########################################################################### ! # Speculative options for Gary Robinson's central-limit ideas. These may go ! # away, or a bunch of incompatible stuff above may go away. ! ! # For the default scheme, use "tim-combining" of probabilities. This has ! # no effect under the central-limit schemes. Tim-combining is a kind of ! # cross between Paul Graham's and Gary Robinson's combining schemes. Unlike ! # Paul's, it's never crazy-certain, and compared to Gary's, in Tim's tests it ! # greatly increased the spread between mean ham-scores and spam-scores, while ! # simultaneously decreasing the variance of both. Tim needed a higher ! # spam_cutoff value for best results, but spam_cutoff is less touchy ! # than under Gary-combining. use_tim_combining: False --- 234,244 ---- robinson_minimum_prob_strength: 0.1 ! # For the default scheme, use "tim-combining" of probabilities. Tim- ! # combining is a kind of cross between Paul Graham's and Gary Robinson's ! # combining schemes. Unlike Paul's, it's never crazy-certain, and compared ! # to Gary's, in Tim's tests it greatly increased the spread between mean ! # ham-scores and spam-scores, while simultaneously decreasing the variance ! # of both. Tim needed a higher spam_cutoff value for best results, but ! # spam_cutoff is less touchy than under Gary-combining. use_tim_combining: False *************** *** 262,300 **** # systematic drawback is that it's sensitive to *any* deviation from a # uniform distribution, regardless of whether that's actually evidence of ! # ham or spam. Rob Hooft may have a pragmatic cure for that (combine the ! # final S and H measures via (S-H+1)/2 instead of via S/(S+H)). use_chi_squared_combining: False - - # z_combining is a scheme Gary has discussed with me offline. 
I'll say more - # if it proves promising. In initial tests it was even more extreme than - # chi combining, but not always in a good way -- in particular, it appears - # as vulnerable to "cancellation disease" as Graham-combining, giving one - # spam in my corpus a score of 4.1e-14 (chi combining scored it 0.5). - use_z_combining: False - - # Use a central-limit approach for scoring. - # The number of extremes to use is given by max_discriminators (above). - # spam_cutoff should almost certainly be exactly 0.5 when using this approach. - # DO NOT run cross-validation tests when this is enabled! They'll deliver - # nonense, or, if you're lucky, will blow up with division by 0 or negative - # square roots. An NxN test grid should work fine. - use_central_limit: False - - # Same as use_central_limit, except takes logarithms of probabilities and - # probability complements (p and 1-p) instead. - use_central_limit2: False - use_central_limit3: False - - # For now, a central-limit scheme considers its decision "certain" if the - # ratio of the zscore with larger magnitude to the zscore with smaller - # magnitude exceeds zscore_ratio_cutoff. The value here is seat-of-the- - # pants for use_central_limit2; nothing is known about use_central_limit wrt - # this. - # For now, a central-limit scheme delivers just one of 4 scores: - # 0.00 -- certain it's ham - # 0.49 -- guesses ham but is unsure - # 0.51 -- guesses spam but is unsure - # 1.00 -- certain it's spam - zscore_ratio_cutoff: 1.9 """ --- 253,262 ---- # systematic drawback is that it's sensitive to *any* deviation from a # uniform distribution, regardless of whether that's actually evidence of ! # ham or spam. Rob Hooft alleviated that by combining the final S and H ! # measures via (S-H+1)/2 instead of via S/(S+H)). ! # In practice, it appears that setting ham_cutoff=0.05, and spam_cutoff=0.95, ! # does well across test sets; while these cutoffs are rarely optimal, they ! # get close to optimal. 
use_chi_squared_combining: False """ *************** *** 346,358 **** 'robinson_probability_s': float_cracker, 'robinson_minimum_prob_strength': float_cracker, - - 'use_central_limit': boolean_cracker, - 'use_central_limit2': boolean_cracker, - 'use_central_limit3': boolean_cracker, - 'zscore_ratio_cutoff': float_cracker, - 'use_tim_combining': boolean_cracker, 'use_chi_squared_combining': boolean_cracker, - 'use_z_combining': boolean_cracker, }, } --- 308,313 ---- Index: README.txt =================================================================== RCS file: /cvsroot/spambayes/spambayes/README.txt,v retrieving revision 1.37 retrieving revision 1.38 diff -C2 -d -r1.37 -r1.38 *** README.txt 13 Oct 2002 19:25:42 -0000 1.37 --- README.txt 18 Oct 2002 05:44:04 -0000 1.38 *************** *** 118,125 **** remaining set (the set not used to train the classifier). mboxtest does the same. - timcv should not be used for central limit tests (timcv does - incremental learning and unlearning, for efficiency; the central - limit schemes can't unlearn incrementally, and their incremental - learning ability is a cheat whose badness isn't yet known). This (or mboxtest) is the preferred way to test when possible: it makes best use of limited data, and interpreting results is --- 118,121 ---- *************** *** 140,144 **** because each msg is predicted against N-1 times overall. So, e.g., one terribly difficult spam or ham can count against you N-1 times. - Central limit tests are fine with timtest. --- 136,139 ---- *************** *** 205,227 **** Experimental Files ================== - clgen.py - A test driver only for use with one of the speculative central-limit - schemes. Its purpose is to generate a binary pickle containing - internal information about every prediction made. This will go - away someday. - - clpik.py - An example analysis program showing how to access the pickles - produced by clgen.py, and how to generate potentially interesting - histograms from them. 
- - rmspik.py - A program that analyzes a clgen-produced pickle, and tells you what - would happen if we had used Rob Hooft's "RMS ZScore" scheme for - deciding certainty instead. - CAUTION: This doesn't work as intended for plain use_central_limit. - The chance() function seems to make an assumption that's true - only under use_central_limit2 and use_central_limit3. - cvcost.py A program that analyzes the output of timcv.py (the final histograms) --- 200,203 ---- *************** *** 230,233 **** --- 206,210 ---- pseudo-realistic costs to handle a fp, a fn and to handle a message in the grey zone. + Standard Test Data Setup Index: TestDriver.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/TestDriver.py,v retrieving revision 1.25 retrieving revision 1.26 diff -C2 -d -r1.25 -r1.26 *** TestDriver.py 18 Oct 2002 05:04:46 -0000 1.25 --- TestDriver.py 18 Oct 2002 05:44:05 -0000 1.26 *************** *** 141,152 **** self.trained_spam_hist = Hist() - # CAUTION: When options.use_central_limit{,2,3} is in effect, this - # adds the new population statistics to the existing population statistics - # (if any), but the existing population statistics are no longer correct - # due to the new data we just added (which can change spamprobs, and - # even the *set* of extreme words). There's no thoroughly correct way - # to repair this short of recomputing the population statistics for - # every msg *ever* trained on. It's currently unknown how badly this - # cheat may affect results. def train(self, ham, spam): print "-> Training on", ham, "&", spam, "...", --- 141,144 ---- *************** *** 155,163 **** self.tester.train(ham, spam) print c.nham - nham, "hams &", c.nspam- nspam, "spams" - c.compute_population_stats(ham, False) - c.compute_population_stats(spam, True) - # CAUTION: this doesn't work at all for incrememental training when - # options.use_central_limit{,2,3} is in effect. 
def untrain(self, ham, spam): print "-> Forgetting", ham, "&", spam, "...", --- 147,151 ---- Index: classifier.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/classifier.py,v retrieving revision 1.38 retrieving revision 1.39 diff -C2 -d -r1.38 -r1.39 *** classifier.py 14 Oct 2002 02:20:35 -0000 1.38 --- classifier.py 18 Oct 2002 05:44:05 -0000 1.39 *************** *** 34,40 **** LN2 = math.log(2) - if options.use_z_combining: - from chi2 import normP, normIP - # The maximum number of extreme words to look at in a msg, where "extreme" # means with spamprob farthest away from 0.5. --- 34,37 ---- *************** *** 86,113 **** 'nspam', # number of spam messages learn() has seen 'nham', # number of non-spam messages learn() has seen - - # The rest is unique to the central-limit code. - # n is the # of data points in the population. - # sum is the sum of the probabilities, and is a long scaled - # by 2**64. - # sumsq is the sum of the squares of the probabilities, and - # is a long scaled by 2**128. - # mean is the mean probability of the population, as an - # unscaled float. - # var is the variance of the population, as unscaled float. - # There's one set of these for the spam population, and - # another for the ham population. - # XXX If this code survives, clean it up. - 'spamn', - 'spamsum', - 'spamsumsq', - 'spammean', - 'spamvar', - - 'hamn', - 'hamsum', - 'hamsumsq', - 'hammean', - 'hamvar', ) --- 83,86 ---- *************** *** 115,121 **** self.wordinfo = {} self.nspam = self.nham = 0 - self.spamn = self.hamn = 0 - self.spamsum = self.spamsumsq = 0 - self.hamsum = self.hamsumsq = 0 def __getstate__(self): --- 88,91 ---- *************** *** 127,131 **** self.wordinfo, self.nspam, self.nham = t[1:] ! def spamprob(self, wordstream, evidence=False): """Return best-guess probability that wordstream is spam. --- 97,101 ---- self.wordinfo, self.nspam, self.nham = t[1:] ! 
def gary_spamprob(self, wordstream, evidence=False): """Return best-guess probability that wordstream is spam. *************** *** 180,185 **** Q = 1.0 - Q**n * 2.0**(Qexp * n) ! prob = (P-Q)/(P+Q) # in -1 .. 1 ! prob = 0.5 + prob/2 # shift to 0 .. 1 else: prob = 0.5 --- 150,159 ---- Q = 1.0 - Q**n * 2.0**(Qexp * n) ! # (P-Q)/(P+Q) is in -1 .. 1; scaling into 0 .. 1 gives ! # ((P-Q)/(P+Q)+1)/2 = ! # ((P-Q+P+Q)/(P+Q))/2 = ! # (2*P/(P+Q))/2 = ! # P/(P+Q) ! prob = P/(P+Q) else: prob = 0.5 *************** *** 192,195 **** --- 166,171 ---- return prob + spamprob = gary_spamprob # may be replaced later + def learn(self, wordstream, is_spam, update_probabilities=True): """Teach the classifier by example. *************** *** 357,363 **** del self.wordinfo[word] - def compute_population_stats(self, msgstream, is_spam): - pass - def _getclues(self, wordstream): mindist = options.robinson_minimum_prob_strength --- 333,336 ---- *************** *** 544,803 **** if options.use_chi_squared_combining: spamprob = chi2_spamprob - - def z_spamprob(self, wordstream, evidence=False): - """Return best-guess probability that wordstream is spam. - - wordstream is an iterable object producing words. - The return value is a float in [0.0, 1.0]. - - If optional arg evidence is True, the return value is a pair - probability, evidence - where evidence is a list of (word, probability) pairs. - """ - - from math import sqrt - - clues = self._getclues(wordstream) - zsum = 0.0 - for prob, word, record in clues: - if record is not None: # else wordinfo doesn't know about it - record.killcount += 1 - zsum += normIP(prob) - - n = len(clues) - if n: - # We've added n zscores from a unit normal distribution. By the - # central limit theorem, their mean is normally distributed with - # mean 0 and sdev 1/sqrt(n). So the zscore of zsum/n is - # (zsum/n - 0)/(1/sqrt(n)) = zsum/n/(1/sqrt(n)) = zsum/sqrt(n). 
- prob = normP(zsum / sqrt(n)) - else: - prob = 0.5 - - if evidence: - clues = [(w, p) for p, w, r in clues] - clues.sort(lambda a, b: cmp(a[1], b[1])) - clues.insert(0, ('*zsum*', zsum)) - clues.insert(0, ('*n*', n)) - clues.insert(0, ('*zscore*', zsum / sqrt(n or 1))) - return prob, clues - else: - return prob - - if options.use_z_combining: - spamprob = z_spamprob - - def _add_popstats(self, sum, sumsq, n, is_spam): - from math import ldexp - - if is_spam: - sum += self.spamsum - sumsq += self.spamsumsq - n += self.spamn - self.spamsum, self.spamsumsq, self.spamn = sum, sumsq, n - else: - sum += self.hamsum - sumsq += self.hamsumsq - n += self.hamn - self.hamsum, self.hamsumsq, self.hamn = sum, sumsq, n - - mean = ldexp(sum, -64) / n - var = sumsq * n - sum**2 - var = ldexp(var, -128) / n**2 - - if is_spam: - self.spammean, self.spamvar = mean, var - else: - self.hammean, self.hamvar = mean, var - - def central_limit_compute_population_stats(self, msgstream, is_spam): - from math import ldexp - - sum = sumsq = 0 - seen = {} - for msg in msgstream: - for prob, word, record in self._getclues(msg): - if word in seen: - continue - seen[word] = 1 - prob = long(ldexp(prob, 64)) - sum += prob - sumsq += prob * prob - - self._add_popstats(sum, sumsq, len(seen), is_spam) - - if options.use_central_limit: - compute_population_stats = central_limit_compute_population_stats - - def central_limit_spamprob(self, wordstream, evidence=False): - """Return best-guess probability that wordstream is spam. - - wordstream is an iterable object producing words. - The return value is a float in [0.0, 1.0]. - - If optional arg evidence is True, the return value is a pair - probability, evidence - where evidence is a list of (word, probability) pairs. 
- """ - - from math import sqrt - - clues = self._getclues(wordstream) - sum = 0.0 - for prob, word, record in clues: - sum += prob - if record is not None: - record.killcount += 1 - n = len(clues) - if n == 0: - return 0.5 - mean = sum / n - - # If this sample is drawn from the spam population, its mean is - # distributed around spammean with variance spamvar/n. Likewise - # for if it's drawn from the ham population. Compute a normalized - # z-score (how many stddevs is it away from the population mean?) - # against both populations, and then it's ham or spam depending - # on which population it matches better. - zham = (mean - self.hammean) / sqrt(self.hamvar / n) - zspam = (mean - self.spammean) / sqrt(self.spamvar / n) - delta = abs(zham) - abs(zspam) # > 0 for spam, < 0 for ham - - azham, azspam = abs(zham), abs(zspam) - if azham < azspam: - ratio = azspam / max(azham, 1e-10) # guard against 0 division - else: - ratio = azham / max(azspam, 1e-10) # guard against 0 division - certain = ratio > options.zscore_ratio_cutoff - - if certain: - score = delta > 0.0 and 1.0 or 0.0 - else: - score = delta > 0.0 and 0.51 or 0.49 - - if evidence: - clues = [(word, prob) for prob, word, record in clues] - clues.sort(lambda a, b: cmp(a[1], b[1])) - extra = [('*zham*', zham), - ('*zspam*', zspam), - ('*hmean*', mean), - ('*smean*', mean), - ('*n*', n), - ] - clues[0:0] = extra - return score, clues - else: - return score - - if options.use_central_limit: - spamprob = central_limit_spamprob - - def central_limit_compute_population_stats2(self, msgstream, is_spam): - from math import ldexp, log - - sum = sumsq = 0 - seen = {} - for msg in msgstream: - for prob, word, record in self._getclues(msg): - if word in seen: - continue - seen[word] = 1 - if is_spam: - prob = log(prob) - else: - prob = log(1.0 - prob) - prob = long(ldexp(prob, 64)) - sum += prob - sumsq += prob * prob - - self._add_popstats(sum, sumsq, len(seen), is_spam) - - if options.use_central_limit2: - 
compute_population_stats = central_limit_compute_population_stats2 - - def central_limit_spamprob2(self, wordstream, evidence=False): - """Return best-guess probability that wordstream is spam. - - wordstream is an iterable object producing words. - The return value is a float in [0.0, 1.0]. - - If optional arg evidence is True, the return value is a pair - probability, evidence - where evidence is a list of (word, probability) pairs. - """ - - from math import sqrt, log - - clues = self._getclues(wordstream) - hsum = ssum = 0.0 - for prob, word, record in clues: - ssum += log(prob) - hsum += log(1.0 - prob) - if record is not None: - record.killcount += 1 - n = len(clues) - if n == 0: - return 0.5 - hmean = hsum / n - smean = ssum / n - - # If this sample is drawn from the spam population, its mean is - # distributed around spammean with variance spamvar/n. Likewise - # for if it's drawn from the ham population. Compute a normalized - # z-score (how many stddevs is it away from the population mean?) - # against both populations, and then it's ham or spam depending - # on which population it matches better. 
- zham = (hmean - self.hammean) / sqrt(self.hamvar / n) - zspam = (smean - self.spammean) / sqrt(self.spamvar / n) - delta = abs(zham) - abs(zspam) # > 0 for spam, < 0 for ham - - azham, azspam = abs(zham), abs(zspam) - if azham < azspam: - ratio = azspam / max(azham, 1e-10) # guard against 0 division - else: - ratio = azham / max(azspam, 1e-10) # guard against 0 division - certain = ratio > options.zscore_ratio_cutoff - - if certain: - score = delta > 0.0 and 1.0 or 0.0 - else: - score = delta > 0.0 and 0.51 or 0.49 - - if evidence: - clues = [(word, prob) for prob, word, record in clues] - clues.sort(lambda a, b: cmp(a[1], b[1])) - extra = [('*zham*', zham), - ('*zspam*', zspam), - ('*hmean*', hmean), - ('*smean*', smean), - ('*n*', n), - ] - clues[0:0] = extra - return score, clues - else: - return score - - if options.use_central_limit2 or options.use_central_limit3: - spamprob = central_limit_spamprob2 - - def central_limit_compute_population_stats3(self, msgstream, is_spam): - from math import ldexp, log - - sum = sumsq = n = 0 - for msg in msgstream: - n += 1 - probsum = 0.0 - clues = self._getclues(msg) - for prob, word, record in clues: - if is_spam: - probsum += log(prob) - else: - probsum += log(1.0 - prob) - mean = long(ldexp(probsum / len(clues), 64)) - sum += mean - sumsq += mean * mean - - self._add_popstats(sum, sumsq, n, is_spam) - - if options.use_central_limit3: - compute_population_stats = central_limit_compute_population_stats3 --- 517,518 ---- --- clgen.py DELETED --- --- clpik.py DELETED --- --- rmspik.py DELETED --- From tim_one@users.sourceforge.net Fri Oct 18 07:58:57 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Thu, 17 Oct 2002 23:58:57 -0700 Subject: [Spambayes-checkins] spambayes Histogram.py,1.5,1.6 Options.py,1.50,1.51 Message-ID: Update of /cvsroot/spambayes/spambayes In directory usw-pr-cvs1:/tmp/cvs-serv14705 Modified Files: Histogram.py Options.py Log Message: Patch inspired by Rob Hooft: new option "percentiles", 
giving a list of percentile points to compute and display with histograms. Index: Histogram.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Histogram.py,v retrieving revision 1.5 retrieving revision 1.6 diff -C2 -d -r1.5 -r1.6 *** Histogram.py 8 Oct 2002 18:13:49 -0000 1.5 --- Histogram.py 18 Oct 2002 06:58:55 -0000 1.6 *************** *** 30,33 **** --- 30,34 ---- # median midpoint # mean + # pct list of (percentile, score) pairs # var variance # sdev population standard deviation (sqrt(variance)) *************** *** 66,69 **** --- 67,88 ---- self.var = var / n self.sdev = math.sqrt(self.var) + # Compute percentiles. + self.pct = pct = [] + for p in options.percentiles: + assert 0.0 <= p <= 100.0 + # In going from data index 0 to index n-1, we move n-1 times. + # p% of that is (n-1)*p/100. + i = (n-1)*p/1e2 + if i < 0: + # Just return the smallest. + score = data[0] + else: + whole = int(i) + frac = i - whole + score = data[whole] + if whole < n-1 and frac: + # Move frac of the way from this score to the next. + score += frac * (data[whole + 1] - score) + pct.append((p, score)) # Merge other into self. *************** *** 125,128 **** --- 144,150 ---- self.median, self.max) + pcts = ['%g%% %g' % x for x in self.pct] + print "-> percentiles:", '; '.join(pcts) + lo, hi = self.get_lo_hi() if lo > hi: Index: Options.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Options.py,v retrieving revision 1.50 retrieving revision 1.51 diff -C2 -d -r1.50 -r1.51 *** Options.py 18 Oct 2002 05:44:04 -0000 1.50 --- Options.py 18 Oct 2002 06:58:55 -0000 1.51 *************** *** 148,151 **** --- 148,157 ---- best_cutoff_unsure_weight: 0.20 + # Histogram analysis also displays percentiles. For each percentile p + # in the list, the score S such that p% of all scores are <= S is given. 
+ # Note that percentile 50 is the median, and is displayed (along with the + # min score and max score) independent of this option. + percentiles: 5 25 75 95 + # Display spam when # show_spam_lo <= spamprob <= show_spam_hi *************** *** 288,291 **** --- 294,298 ---- 'show_unsure': boolean_cracker, 'show_histograms': boolean_cracker, + 'percentiles': ('get', lambda s: map(float, s.split())), 'show_best_discriminators': int_cracker, 'save_trained_pickles': boolean_cracker, From tim_one@users.sourceforge.net Fri Oct 18 19:30:41 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Fri, 18 Oct 2002 11:30:41 -0700 Subject: [Spambayes-checkins] spambayes Options.py,1.51,1.52 classifier.py,1.39,1.40 Message-ID: Update of /cvsroot/spambayes/spambayes In directory usw-pr-cvs1:/tmp/cvs-serv29796 Modified Files: Options.py classifier.py Log Message: New options to play with melding gary- and chi- combining; see mailing list; promising! Index: Options.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Options.py,v retrieving revision 1.51 retrieving revision 1.52 diff -C2 -d -r1.51 -r1.52 *** Options.py 18 Oct 2002 06:58:55 -0000 1.51 --- Options.py 18 Oct 2002 18:30:38 -0000 1.52 *************** *** 265,268 **** --- 265,272 ---- # get close to optimal. use_chi_squared_combining: False + + # Use a weighted average of chi-combining and gary-combining. 
+ use_mixed_combining: False + mixed_combining_chi_weight: 0.9 """ *************** *** 317,320 **** --- 321,327 ---- 'use_tim_combining': boolean_cracker, 'use_chi_squared_combining': boolean_cracker, + + 'use_mixed_combining': boolean_cracker, + 'mixed_combining_chi_weight': float_cracker, }, } Index: classifier.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/classifier.py,v retrieving revision 1.39 retrieving revision 1.40 diff -C2 -d -r1.39 -r1.40 *** classifier.py 18 Oct 2002 05:44:05 -0000 1.39 --- classifier.py 18 Oct 2002 18:30:38 -0000 1.40 *************** *** 30,34 **** from Options import options ! if options.use_chi_squared_combining: from chi2 import chi2Q LN2 = math.log(2) --- 30,34 ---- from Options import options ! if options.use_chi_squared_combining or options.use_mixed_combining: from chi2 import chi2Q LN2 = math.log(2) *************** *** 517,518 **** --- 517,610 ---- if options.use_chi_squared_combining: spamprob = chi2_spamprob + + def mixed_spamprob(self, wordstream, evidence=False): + """Return best-guess probability that wordstream is spam. + + wordstream is an iterable object producing words. + The return value is a float in [0.0, 1.0]. + + If optional arg evidence is True, the return value is a pair + probability, evidence + where evidence is a list of (word, probability) pairs. + """ + + from math import frexp, log as ln + + # We compute two chi-squared statistics, one for ham and one for + # spam. The sum-of-the-logs business is more sensitive to probs + # near 0 than to probs near 1, so the spam measure uses 1-p (so + # that high-spamprob words have greatest effect), and the ham + # measure uses p directly (so that lo-spamprob words have greatest + # effect). + # + # For optimization, sum-of-logs == log-of-product, and f.p. + # multiplication is a lot cheaper than calling ln(). It's easy + # to underflow to 0.0, though, so we simulate unbounded dynamic + # range via frexp. 
The real product H = this H * 2**Hexp, and + # likewise the real product S = this S * 2**Sexp. + H = S = 1.0 + Hexp = Sexp = 0 + + clues = self._getclues(wordstream) + for prob, word, record in clues: + if record is not None: # else wordinfo doesn't know about it + record.killcount += 1 + S *= 1.0 - prob + H *= prob + if S < 1e-200: # prevent underflow + S, e = frexp(S) + Sexp += e + if H < 1e-200: # prevent underflow + H, e = frexp(H) + Hexp += e + + n = len(clues) + if n: + #P = 1.0 - P**(1./num_clues) + #Q = 1.0 - Q**(1./num_clues) + # + # (x*2**e)**n = x**n * 2**(e*n) + nrecip = 1.0 / n + P = 1.0 - S**nrecip * 2.0**(Sexp * nrecip) + Q = 1.0 - H**nrecip * 2.0**(Hexp * nrecip) + + # Compute the natural log of the product = sum of the logs: + # ln(x * 2**i) = ln(x) + i * ln(2). + S = ln(S) + Sexp * LN2 + H = ln(H) + Hexp * LN2 + + S = 1.0 - chi2Q(-2.0 * S, 2*n) + H = 1.0 - chi2Q(-2.0 * H, 2*n) + + else: + P = Q = S = H = 1.0 + + gary_score = P/(P+Q) + + # How to combine these into a single spam score? We originally + # used (S-H)/(S+H) scaled into [0., 1.], which equals S/(S+H). A + # systematic problem is that we could end up being near-certain + # a thing was (for example) spam, even if S was small, provided + # that H was much smaller. + # Rob Hooft stared at these problems and invented the measure + # we use now, the simpler S-H, scaled into [0., 1.]. 
+ chi_score = (S-H + 1.0) / 2.0 + + x = options.mixed_combining_chi_weight + prob = x * chi_score + (1.0 - x) * gary_score + + if evidence: + clues = [(w, p) for p, w, r in clues] + clues.sort(lambda a, b: cmp(a[1], b[1])) + clues.insert(0, ('*P*', P)) + clues.insert(0, ('*Q*', Q)) + clues.insert(0, ('*S*', S)) + clues.insert(0, ('*H*', H)) + clues.insert(0, ('*chi_score*', chi_score)) + clues.insert(0, ('*gary_score*', gary_score)) + return prob, clues + else: + return prob + + if options.use_mixed_combining: + spamprob = mixed_spamprob From tim_one@users.sourceforge.net Fri Oct 18 22:38:18 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Fri, 18 Oct 2002 14:38:18 -0700 Subject: [Spambayes-checkins] spambayes Options.py,1.52,1.53 classifier.py,1.40,1.41 Message-ID: Update of /cvsroot/spambayes/spambayes In directory usw-pr-cvs1:/tmp/cvs-serv29331 Modified Files: Options.py classifier.py Log Message: Dropped tim_combining. Rearranged the remaining combining schemes to get the code close together. Added '*n*' pseudo-clue to mixed_combining clue list. Index: Options.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Options.py,v retrieving revision 1.52 retrieving revision 1.53 diff -C2 -d -r1.52 -r1.53 *** Options.py 18 Oct 2002 18:30:38 -0000 1.52 --- Options.py 18 Oct 2002 21:38:16 -0000 1.53 *************** *** 240,252 **** robinson_minimum_prob_strength: 0.1 - # For the default scheme, use "tim-combining" of probabilities. Tim- - # combining is a kind of cross between Paul Graham's and Gary Robinson's - # combining schemes. Unlike Paul's, it's never crazy-certain, and compared - # to Gary's, in Tim's tests it greatly increased the spread between mean - # ham-scores and spam-scores, while simultaneously decreasing the variance - # of both. Tim needed a higher spam_cutoff value for best results, but - # spam_cutoff is less touchy than under Gary-combining. 
- use_tim_combining: False - # For vectors of random, uniformly distributed probabilities, -2*sum(ln(p_i)) # follows the chi-squared distribution with 2*n degrees of freedom. That's --- 240,243 ---- *************** *** 319,323 **** 'robinson_probability_s': float_cracker, 'robinson_minimum_prob_strength': float_cracker, - 'use_tim_combining': boolean_cracker, 'use_chi_squared_combining': boolean_cracker, --- 310,313 ---- Index: classifier.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/classifier.py,v retrieving revision 1.40 retrieving revision 1.41 diff -C2 -d -r1.40 -r1.41 *** classifier.py 18 Oct 2002 18:30:38 -0000 1.40 --- classifier.py 18 Oct 2002 21:38:16 -0000 1.41 *************** *** 97,100 **** --- 97,103 ---- self.wordinfo, self.nspam, self.nham = t[1:] + # spamprob() implementations. One of the following is aliased to + # spamprob, depending on option settings. + def gary_spamprob(self, wordstream, evidence=False): """Return best-guess probability that wordstream is spam. *************** *** 166,170 **** return prob ! spamprob = gary_spamprob # may be replaced later def learn(self, wordstream, is_spam, update_probabilities=True): --- 169,332 ---- return prob ! spamprob = gary_spamprob # may be replaced by one of the next ones ! ! # Across vectors of length n, containing random uniformly-distributed ! # probabilities, -2*sum(ln(p_i)) follows the chi-squared distribution ! # with 2*n degrees of freedom. This has been proven (in some ! # appropriate sense) to be the most sensitive possible test for ! # rejecting the hypothesis that a vector of probabilities is uniformly ! # distributed. Gary Robinson's original scheme was monotonic *with* ! # this test, but skipped the details. Turns out that getting closer ! # to the theoretical roots gives a much sharper classification, with ! # a very small (in # of msgs), but also very broad (in range of scores), ! 
# "middle ground", where most of the mistakes live. In particular, ! # this scheme seems immune to all forms of "cancellation disease": if ! # there are many strong ham *and* spam clues, this reliably scores ! # close to 0.5. Most other schemes are extremely certain then -- and ! # often wrong. ! def chi2_spamprob(self, wordstream, evidence=False): ! """Return best-guess probability that wordstream is spam. ! ! wordstream is an iterable object producing words. ! The return value is a float in [0.0, 1.0]. ! ! If optional arg evidence is True, the return value is a pair ! probability, evidence ! where evidence is a list of (word, probability) pairs. ! """ ! ! from math import frexp, log as ln ! ! # We compute two chi-squared statistics, one for ham and one for ! # spam. The sum-of-the-logs business is more sensitive to probs ! # near 0 than to probs near 1, so the spam measure uses 1-p (so ! # that high-spamprob words have greatest effect), and the ham ! # measure uses p directly (so that lo-spamprob words have greatest ! # effect). ! # ! # For optimization, sum-of-logs == log-of-product, and f.p. ! # multiplication is a lot cheaper than calling ln(). It's easy ! # to underflow to 0.0, though, so we simulate unbounded dynamic ! # range via frexp. The real product H = this H * 2**Hexp, and ! # likewise the real product S = this S * 2**Sexp. ! H = S = 1.0 ! Hexp = Sexp = 0 ! ! clues = self._getclues(wordstream) ! for prob, word, record in clues: ! if record is not None: # else wordinfo doesn't know about it ! record.killcount += 1 ! S *= 1.0 - prob ! H *= prob ! if S < 1e-200: # prevent underflow ! S, e = frexp(S) ! Sexp += e ! if H < 1e-200: # prevent underflow ! H, e = frexp(H) ! Hexp += e ! ! # Compute the natural log of the product = sum of the logs: ! # ln(x * 2**i) = ln(x) + i * ln(2). ! S = ln(S) + Sexp * LN2 ! H = ln(H) + Hexp * LN2 ! ! n = len(clues) ! if n: ! S = 1.0 - chi2Q(-2.0 * S, 2*n) ! H = 1.0 - chi2Q(-2.0 * H, 2*n) ! ! 
# How to combine these into a single spam score? We originally ! # used (S-H)/(S+H) scaled into [0., 1.], which equals S/(S+H). A ! # systematic problem is that we could end up being near-certain ! # a thing was (for example) spam, even if S was small, provided ! # that H was much smaller. ! # Rob Hooft stared at these problems and invented the measure ! # we use now, the simpler S-H, scaled into [0., 1.]. ! prob = (S-H + 1.0) / 2.0 ! else: ! prob = 0.5 ! ! if evidence: ! clues = [(w, p) for p, w, r in clues] ! clues.sort(lambda a, b: cmp(a[1], b[1])) ! clues.insert(0, ('*S*', S)) ! clues.insert(0, ('*H*', H)) ! return prob, clues ! else: ! return prob ! ! if options.use_chi_squared_combining: ! spamprob = chi2_spamprob ! ! # This is a weighted average of the other two. In extreme cases, they ! # often seem to disagree on how "certain" they are. Mixing softens ! # the extremes, pushing even some very hard cases into the middle ground. ! def mixed_spamprob(self, wordstream, evidence=False): ! """Return best-guess probability that wordstream is spam. ! ! wordstream is an iterable object producing words. ! The return value is a float in [0.0, 1.0]. ! ! If optional arg evidence is True, the return value is a pair ! probability, evidence ! where evidence is a list of (word, probability) pairs. ! """ ! ! from math import frexp, log as ln ! ! H = S = 1.0 ! Hexp = Sexp = 0 ! ! clues = self._getclues(wordstream) ! for prob, word, record in clues: ! if record is not None: # else wordinfo doesn't know about it ! record.killcount += 1 ! S *= 1.0 - prob ! H *= prob ! if S < 1e-200: # prevent underflow ! S, e = frexp(S) ! Sexp += e ! if H < 1e-200: # prevent underflow ! H, e = frexp(H) ! Hexp += e ! ! n = len(clues) ! if n: ! nrecip = 1.0 / n ! P = 1.0 - S**nrecip * 2.0**(Sexp * nrecip) ! Q = 1.0 - H**nrecip * 2.0**(Hexp * nrecip) ! ! S = ln(S) + Sexp * LN2 ! H = ln(H) + Hexp * LN2 ! S = 1.0 - chi2Q(-2.0 * S, 2*n) ! H = 1.0 - chi2Q(-2.0 * H, 2*n) ! ! else: ! 
P = Q = S = H = 1.0 ! ! gary_score = P/(P+Q) ! chi_score = (S-H + 1.0) / 2.0 ! ! w = options.mixed_combining_chi_weight ! prob = w * chi_score + (1.0 - w) * gary_score ! ! if evidence: ! clues = [(w, p) for p, w, r in clues] ! clues.sort(lambda a, b: cmp(a[1], b[1])) ! extra = [('*chi_score*', chi_score), ! ('*gary_score*', gary_score), ! ('*S*', S), ! ('*H*', H), ! ('*P*', P), ! ('*Q*', Q), ! ('*n*', n), ! ] ! clues[0:0] = extra ! return prob, clues ! else: ! return prob ! ! if options.use_mixed_combining: ! spamprob = mixed_spamprob def learn(self, wordstream, is_spam, update_probabilities=True): *************** *** 358,610 **** # Return (prob, word, record). return [t[1:] for t in clues] - - #************************************************************************ - # Some options change so much behavior that it's better to write a - # different method. - # CAUTION: These end up overwriting methods of the same name above. - # A subclass would be cleaner, but experiments will soon enough lead - # to only one of the alternatives surviving. - - def tim_spamprob(self, wordstream, evidence=False): - """Return best-guess probability that wordstream is spam. - - wordstream is an iterable object producing words. - The return value is a float in [0.0, 1.0]. - - If optional arg evidence is True, the return value is a pair - probability, evidence - where evidence is a list of (word, probability) pairs. - """ - - from math import frexp - - # The real H = this H times 2**Hexp. Likewise for S. We're - # simulating unbounded dynamic float range by hand. If this pans - # out, *maybe* we should store logarithms in the database instead - # and just add them here. But I like keeping raw counts in the - # database (they're easy to understand, manipulate and combine), - # and there's no evidence that this simulation is a significant - # expense. - # S is a spamminess measure, and is the geometric mean of the - # extreme-word spamprobs. 
- # H is a hamminess measure, and is the geometric mean of 1 - the - # extreme-word spamprobs. - H = S = 1.0 - Hexp = Sexp = 0 - clues = self._getclues(wordstream) - for prob, word, record in clues: - if record is not None: # else wordinfo doesn't know about it - record.killcount += 1 - S *= prob - H *= 1.0 - prob - if S < 1e-200: # move back into range - S, e = frexp(S) - Sexp += e - if H < 1e-200: # move back into range - H, e = frexp(H) - Hexp += e - - S, e = frexp(S) - Sexp += e - H, e = frexp(H) - Hexp += e - - num_clues = len(clues) - if num_clues: - # (x*2**e)**n = x**n * 2**(e*n). - n = 1.0 / num_clues - S = S**n * 2.0**(Sexp * n) - H = H**n * 2.0**(Hexp * n) - prob = S/(S+H) - else: - prob = 0.5 - - if evidence: - clues = [(w, p) for p, w, r in clues] - clues.sort(lambda a, b: cmp(a[1], b[1])) - return prob, clues - else: - return prob - - if options.use_tim_combining: - spamprob = tim_spamprob - - # Across vectors of length n, containing random uniformly-distributed - # probabilities, -2*sum(ln(p_i)) follows the chi-squared distribution - # with 2*n degrees of freedom. This has been proven (in some - # appropriate sense) to be the most sensitive possible test for - # rejecting the hypothesis that a vector of probabilities is uniformly - # distributed. Gary Robinson's original scheme was monotonic *with* - # this test, but skipped the details. Turns out that getting closer - # to the theoretical roots gives a much sharper classification, with - # a very small (in # of msgs), but also very broad (in range of scores), - # "middle ground", where most of the mistakes live. In particular, - # this scheme seems immune to all forms of "cancellation disease": if - # there are many strong ham *and* spam clues, this reliably scores - # close to 0.5. Most other schemes are extremely certain then -- and - # often wrong. - def chi2_spamprob(self, wordstream, evidence=False): - """Return best-guess probability that wordstream is spam. 
- - wordstream is an iterable object producing words. - The return value is a float in [0.0, 1.0]. - - If optional arg evidence is True, the return value is a pair - probability, evidence - where evidence is a list of (word, probability) pairs. - """ - - from math import frexp, log as ln - - # We compute two chi-squared statistics, one for ham and one for - # spam. The sum-of-the-logs business is more sensitive to probs - # near 0 than to probs near 1, so the spam measure uses 1-p (so - # that high-spamprob words have greatest effect), and the ham - # measure uses p directly (so that lo-spamprob words have greatest - # effect). - # - # For optimization, sum-of-logs == log-of-product, and f.p. - # multiplication is a lot cheaper than calling ln(). It's easy - # to underflow to 0.0, though, so we simulate unbounded dynamic - # range via frexp. The real product H = this H * 2**Hexp, and - # likewise the real product S = this S * 2**Sexp. - H = S = 1.0 - Hexp = Sexp = 0 - - clues = self._getclues(wordstream) - for prob, word, record in clues: - if record is not None: # else wordinfo doesn't know about it - record.killcount += 1 - S *= 1.0 - prob - H *= prob - if S < 1e-200: # prevent underflow - S, e = frexp(S) - Sexp += e - if H < 1e-200: # prevent underflow - H, e = frexp(H) - Hexp += e - - # Compute the natural log of the product = sum of the logs: - # ln(x * 2**i) = ln(x) + i * ln(2). - S = ln(S) + Sexp * LN2 - H = ln(H) + Hexp * LN2 - - n = len(clues) - if n: - S = 1.0 - chi2Q(-2.0 * S, 2*n) - H = 1.0 - chi2Q(-2.0 * H, 2*n) - - # How to combine these into a single spam score? We originally - # used (S-H)/(S+H) scaled into [0., 1.], which equals S/(S+H). A - # systematic problem is that we could end up being near-certain - # a thing was (for example) spam, even if S was small, provided - # that H was much smaller. - # Rob Hooft stared at these problems and invented the measure - # we use now, the simpler S-H, scaled into [0., 1.]. 
- prob = (S-H + 1.0) / 2.0 - else: - prob = 0.5 - - if evidence: - clues = [(w, p) for p, w, r in clues] - clues.sort(lambda a, b: cmp(a[1], b[1])) - clues.insert(0, ('*S*', S)) - clues.insert(0, ('*H*', H)) - return prob, clues - else: - return prob - - if options.use_chi_squared_combining: - spamprob = chi2_spamprob - - def mixed_spamprob(self, wordstream, evidence=False): - """Return best-guess probability that wordstream is spam. - - wordstream is an iterable object producing words. - The return value is a float in [0.0, 1.0]. - - If optional arg evidence is True, the return value is a pair - probability, evidence - where evidence is a list of (word, probability) pairs. - """ - - from math import frexp, log as ln - - # We compute two chi-squared statistics, one for ham and one for - # spam. The sum-of-the-logs business is more sensitive to probs - # near 0 than to probs near 1, so the spam measure uses 1-p (so - # that high-spamprob words have greatest effect), and the ham - # measure uses p directly (so that lo-spamprob words have greatest - # effect). - # - # For optimization, sum-of-logs == log-of-product, and f.p. - # multiplication is a lot cheaper than calling ln(). It's easy - # to underflow to 0.0, though, so we simulate unbounded dynamic - # range via frexp. The real product H = this H * 2**Hexp, and - # likewise the real product S = this S * 2**Sexp. 
- H = S = 1.0 - Hexp = Sexp = 0 - - clues = self._getclues(wordstream) - for prob, word, record in clues: - if record is not None: # else wordinfo doesn't know about it - record.killcount += 1 - S *= 1.0 - prob - H *= prob - if S < 1e-200: # prevent underflow - S, e = frexp(S) - Sexp += e - if H < 1e-200: # prevent underflow - H, e = frexp(H) - Hexp += e - - n = len(clues) - if n: - #P = 1.0 - P**(1./num_clues) - #Q = 1.0 - Q**(1./num_clues) - # - # (x*2**e)**n = x**n * 2**(e*n) - nrecip = 1.0 / n - P = 1.0 - S**nrecip * 2.0**(Sexp * nrecip) - Q = 1.0 - H**nrecip * 2.0**(Hexp * nrecip) - - # Compute the natural log of the product = sum of the logs: - # ln(x * 2**i) = ln(x) + i * ln(2). - S = ln(S) + Sexp * LN2 - H = ln(H) + Hexp * LN2 - - S = 1.0 - chi2Q(-2.0 * S, 2*n) - H = 1.0 - chi2Q(-2.0 * H, 2*n) - - else: - P = Q = S = H = 1.0 - - gary_score = P/(P+Q) - - # How to combine these into a single spam score? We originally - # used (S-H)/(S+H) scaled into [0., 1.], which equals S/(S+H). A - # systematic problem is that we could end up being near-certain - # a thing was (for example) spam, even if S was small, provided - # that H was much smaller. - # Rob Hooft stared at these problems and invented the measure - # we use now, the simpler S-H, scaled into [0., 1.]. 
- chi_score = (S-H + 1.0) / 2.0 - - x = options.mixed_combining_chi_weight - prob = x * chi_score + (1.0 - x) * gary_score - - if evidence: - clues = [(w, p) for p, w, r in clues] - clues.sort(lambda a, b: cmp(a[1], b[1])) - clues.insert(0, ('*P*', P)) - clues.insert(0, ('*Q*', Q)) - clues.insert(0, ('*S*', S)) - clues.insert(0, ('*H*', H)) - clues.insert(0, ('*chi_score*', chi_score)) - clues.insert(0, ('*gary_score*', gary_score)) - return prob, clues - else: - return prob - - if options.use_mixed_combining: - spamprob = mixed_spamprob --- 520,521 ---- From mhammond@users.sourceforge.net Sat Oct 19 06:36:57 2002 From: mhammond@users.sourceforge.net (Mark Hammond) Date: Fri, 18 Oct 2002 22:36:57 -0700 Subject: [Spambayes-checkins] spambayes/Outlook2000/dialogs - New directory Message-ID: Update of /cvsroot/spambayes/spambayes/Outlook2000/dialogs In directory usw-pr-cvs1:/tmp/cvs-serv8855/dialogs Log Message: Directory /cvsroot/spambayes/spambayes/Outlook2000/dialogs added to the repository From mhammond@users.sourceforge.net Sat Oct 19 17:23:39 2002 From: mhammond@users.sourceforge.net (Mark Hammond) Date: Sat, 19 Oct 2002 09:23:39 -0700 Subject: [Spambayes-checkins] spambayes/Outlook2000 classify.py,NONE,1.1 manager.py,NONE,1.1 rule.py,NONE,1.1 filter.py,1.1,1.2 train.py,1.1,1.2README.txt,1.1,1.2 Message-ID: Update of /cvsroot/spambayes/spambayes/Outlook2000 In directory usw-pr-cvs1:/tmp/cvs-serv19979 Modified Files: filter.py train.py README.txt Added Files: classify.py manager.py rule.py Log Message: New version of the outlook addin code with a fancy GUI. --- NEW FILE: classify.py --- # Train a classifier from Outlook Mail folders # Author: Sean D. 
True, WebReply.Com # October, 2002 # Copyright PSF, license under the PSF license import sys, os, os.path, getopt, cPickle, string from win32com.client import Dispatch, constants import pythoncom import win32con import classifier from tokenizer import tokenize from hammie import createbayes, Hammie def classify_folder( f, mgr, config, progress): hammie = Hammie(mgr.bayes) messages = f.Messages pythoncom.CoInitialize() # We are called on a different thread. # We must get outlook in this thread - can't use the main thread :( outlook_ns = mgr.GetOutlookForCurrentThread().GetNamespace("MAPI") if not messages: progress.warning("Can't find messages in folder '%s'" % (f.Name,)) return message = messages.GetFirst() while not progress.stop_requested() and message: try: progress.tick() headers = message.Fields[0x7D001E].Value headers = headers.encode('ascii', 'replace') body = message.Text.encode('ascii', 'replace') text = headers + body prob, clues = hammie.score(text, evidence=1) added_prop = False try: if outlook_ns is not None: outlookItem = outlook_ns.GetItemFromID(message.ID) format = 4 # 4=2 decimal, 3=1 decimal - index in "field chooser" combo when type=Number. prop = outlookItem.UserProperties.Add(config.field_name, constants.olNumber, True, format) prop.Value = prob outlookItem.Save() added_prop = True except "foo": # pythoncom.com_error, d: # Hrm - don't seem able to use outlook - use MAPI - but this # means the field doesn't automatically appear in the outlook "Field Chooser" # Tried explicity adding the field to the folder but still no go. added_prop = False if not added_prop: message.Fields.Add(config.field_name, 5, prob) message.Update() except pythoncom.com_error, d: progress.warning("Failed to get a message: %s" % (str(d),) ) message = messages.GetNext() # Called back from the dialog to do the actual training. 
def classifier(mgr, progress): session = mgr.mapi config = mgr.config.classify if not config.folder_ids: progress.error("You must specify at least one folder") return progress.set_status("Counting messages") folders = mgr.BuildFolderList(config.folder_ids, config.include_sub) num_msgs = 0 for f in folders: num_msgs += f.Messages.Count + 1 progress.set_max_ticks(num_msgs+3) for f in folders: progress.set_status("Processing folder '%s'" % (f.Name.encode("ascii", "replace"),)) classify_folder(f, mgr, config, progress) if progress.stop_requested(): return def main(): import manager mgr = manager.GetManager() import dialogs.ClassifyDialog d = dialogs.ClassifyDialog.ClassifyDialog(mgr, classifier) d.DoModal() mgr.Save() mgr.Close() if __name__ == "__main__": main() --- NEW FILE: manager.py --- import cPickle import os import sys import thread import classifier from tokenizer import tokenize import win32com.client import win32com.client.gencache import pythoncom # Suck in CDO type lib win32com.client.gencache.EnsureModule('{3FA7DEA7-6438-101B-ACC1-00AA00423326}', 0, 1, 21, bForDemand = True) try: this_filename = __file__ except NameError: this_filename = sys.argv[0] class ManagerError(Exception): pass class BayesManager: def __init__(self, config_base = "default", outlook = None, verbose = 1): self.verbose = verbose if not os.path.isabs(config_base): config_base = os.path.join( os.path.dirname(this_filename), config_base) config_base = os.path.abspath(config_base) self.ini_filename = config_base + "_bayes_customize.ini" self.bayes_filename = config_base + "_bayes_database.pck" self.config_filename = config_base + "_configuration.pck" # First read the configuration file. 
path = os.path.split(this_filename)[0] self.config = self.LoadConfig() cwd = os.getcwd() self.mapi = win32com.client.Dispatch("MAPI.Session") self.mapi.Logon(None, None, False, False) self._tls = {thread.get_ident(): {"outlook": outlook} } self.outlook = outlook os.chdir(cwd) self.LoadBayes() # Outlook gives us thread grief :( def WorkerThreadStarting(self): pythoncom.CoInitialize() self._tls[thread.get_ident()] = {} def WorkerThreadEnding(self): assert self._tls.has_key(thread.get_ident()), "WorkerThreadStarting hasn't been called for this thread" del self._tls[thread.get_ident()] pythoncom.CoUninitialize() def GetOutlookForCurrentThread(self): assert self._tls.has_key(thread.get_ident()), "WorkerThreadStarting hasn't been called for this thread" existing = self._tls[thread.get_ident()].get("outlook") if not existing: existing = win32com.client.Dispatch("Outlook.Application") self._tls[thread.get_ident()]["outlook"] = existing return existing def LoadBayes(self): if not os.path.exists(self.ini_filename): raise ManagerError("The file '%s' must exist before the database '%s' can be opened or created" % (self.ini_filename, self.bayes_filename)) bayes = None try: bayes = cPickle.load(open(self.bayes_filename,'rb')) print "Loaded bayes database from '%s'" % (self.bayes_filename,) except IOError: pass # ignore file-not-found except: print "Failed to load bayes database" import traceback traceback.print_exc() if bayes is None: self.InitNewBayes() bayes = self.bayes if self.verbose: print "Bayes database initialized with %d spam and %d good messages" % (bayes.nspam, bayes.nham) self.bayes = bayes self.bayes_dirty = False def LoadConfig(self): try: ret = cPickle.load(open(self.config_filename,'rb')) if self.verbose > 1: print "Loaded configuration from '%s':" % (self.config_filename,) ret._dump() except (AttributeError, ImportError): ret = _ConfigurationRoot() if self.verbose > 1: print "FAILED to load configuration from '%s - using default:" % (self.config_filename,) 
import traceback traceback.print_exc() return ret def InitNewBayes(self): os.environ["BAYESCUSTOMIZE"]=self.ini_filename self.bayes = classifier.Bayes() self.bayes_dirty = True def SaveBayes(self): bayes = self.bayes if self.verbose: print "Saving bayes database with %d spam and %d good messages" % (bayes.nspam, bayes.nham) print " ->", self.bayes_filename cPickle.dump(bayes, open(self.bayes_filename,"wb"), 1) def SaveConfig(self): if self.verbose > 1: print "Saving configuration:" self.config._dump() print " ->", self.config_filename cPickle.dump(self.config, open(self.config_filename,"wb"), 1) def Save(self): self.SaveConfig() if self.bayes_dirty: self.SaveBayes() self.bayes_dirty = False else: print "Bayes database is not dirty - not writing" def Close(self): if self.mapi is not None: self.mapi.Logoff() self.mapi = None if self.bayes_dirty and self.bayes: print "Warning: BayesManager closed while Bayes database dirty" self.bayes = None self.config = None self._tls = None def BuildFolderList(self, folder_ids, include_sub): ret = {} for id in folder_ids: subs = [] try: f = self.mapi.GetFolder(id) if include_sub: sub_ids = [] subs = f.Folders for i in range(1, subs.Count): sub_ids.append(subs.Item(i).ID) subs = self.BuildFolderList(sub_ids, True) except pythoncom.error: continue ret[id] = f for sub in subs: ret[sub.ID] = sub return ret.values() def YieldMessageList(self, folder): messages = folder.Messages if not messages: print "Can't find messages in folder '%s'" % (folder.Name,) return message = messages.GetFirst() while message is not None: yield message message = messages.GetNext() # configuration stuff we persist. 
class _ConfigurationContainer: def __init__(self, **kw): self.__dict__.update(kw) def __setstate__(self, state): self.__init__() # ensure any new/default values setup self.__dict__.update(state) def _dump(self, thisname="", level=0): import pprint prefix = " " * level print "%s%s:" % (prefix, thisname) for name, ob in self.__dict__.items(): d = getattr(ob, "_dump", None) if d is None: print "%s %s: %s" % (prefix, name, pprint.pformat(ob)) else: d(name, level+1) class _ConfigurationRoot(_ConfigurationContainer): def __init__(self): self.training = _ConfigurationContainer( ham_folder_ids = [], ham_include_sub = False, spam_folder_ids = [], spam_include_sub = False, ) self.classify = _ConfigurationContainer( folder_ids = [], include_sub = False, field_name = "SpamProb", ) self.filter = _ConfigurationContainer( folder_ids = [], include_sub = False, ) self.filter_now = _ConfigurationContainer( folder_ids = [], include_sub = False, only_unread = False, ) self.rules = [] _mgr = None def GetManager(): global _mgr if _mgr is None: _mgr = BayesManager() return _mgr if __name__=='__main__': try: mgr = BayesManager() except ManagerError, d: print "Error initializing Bayes manager" print d --- NEW FILE: rule.py --- import pythoncom from win32com.client import constants import time class Rule: def __init__(self): self.name = "New Rule" self.enabled = True self.min = 0.0 self.max = 0.9 self.action = "None" self.flag_message = True self.write_field = True self.write_field_name = "SpamProb" self.folder_id = "" def __repr__(self): bits = ["Rule at 0x%x:\n" % (id(self),)] for name, ob in self.__dict__.items(): bits.append(" rule.%s: %r\n" % (name, ob)) return "".join(bits) def GetProblem(self, mgr): if self.min > self.max: return "The maximum value must be greater than the minimum" if self.action != "None": if not self.folder_id: return "You must specify a folder for 'Move' or 'Copy'" if self._GetFolder(mgr) is None: return "Can not locate the destination folder" if self.write_field 
and not self.write_field_name: return "You must specify the field name to create" def _GetFolder(self, mgr): try: return mgr.mapi.GetFolder(self.folder_id) except pythoncom.com_error: return None def Act(self, mgr, msg, prob): if mgr.verbose > 1: print "Rule '%s': %.2f->%.2f (%.2f) (%s)" % (self.name, self.min, self.max, prob, msg.Subject[:20].encode("ascii", "replace")) if prob < self.min or prob > self.max: return False # Do mods before we move. outlook_ns = mgr.GetOutlookForCurrentThread().GetNamespace("MAPI") outlook_message = outlook_ns.GetItemFromID(msg.ID) if self.flag_message: outlook_message.FlagRequest = "Check Spam" outlook_message.FlagStatus = constants.olFlagMarked outlook_message.Save() if self.write_field: format = 4 # 4=2 decimal, 3=1 decimal - index in "field chooser" combo when type=Number. prop = outlook_message.UserProperties.Add(self.write_field_name, constants.olNumber, True, format) prop.Value = prob outlook_message.Save() if self.action == "None": pass elif self.action == "Copy": outlook_message.Copy(outlook_ns.GetFolderFromID(self.folder_id)) elif self.action == "Move": print "moving", self.flag_message outlook_message.Move(outlook_ns.GetFolderFromID(self.folder_id)) else: print "Eeek - bad action", self.action return True Index: filter.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/filter.py,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** filter.py 4 Oct 2002 22:28:47 -0000 1.1 --- filter.py 19 Oct 2002 16:23:37 -0000 1.2 *************** *** 4,123 **** # Copyright PSF, license under the PSF license ! # Make py2exe happy ! import dbhash, anydbm ! ! import sys, os, os.path, cPickle, string, getopt ! import win32com.client ! ! import email ! import email.Parser ! from hammie import createbayes, Hammie ! import classifier ! ! ! def findFolder(f, findName, name=""): ! folders = f.Folders ! folder = folders.GetFirst() ! while folder: ! 
nm = "%s/%s" % (name, folder.Name) ! nm = nm.encode('ascii', 'replace') ! if nm == findName: ! return folder ! try: ! f = findFolder(folder, findName, nm) ! if f: ! return f ! except: ! pass ! folder = folders.GetNext() ! return None ! from tokenizer import tokenize ! def filter(bayes, rootFolder, folderName, targetName=None, over=None, ! under=None, detail=None): ! hammie = Hammie(bayes) ! n = nover = nunder = 0 ! f = findFolder(rootFolder, folderName) ! targetf = None ! if targetName: ! targetf = findFolder(rootFolder, targetName) ! if not targetf: ! print "Can't find folder %s to move messages to" % targetName ! return ! messages = f.Messages ! message = messages.GetFirst() ! while message: try: ! headers = "%s" % message.fields[0x7D001E] headers = headers.encode('ascii', 'replace') body = message.Text.encode('ascii', 'replace') ! n = n + 1 ! except: ! message = messages.GetNext() continue ! text = headers + body prob, clues = hammie.score(text, evidence=1) ! if over <> None and prob >= over: ! nover = nover + 1 ! if detail: ! print "***Over threshold", prob, over ! for i in range(1, message.recipients.Count+1): ! print message.Recipients[i].Address, ! print message.Subject.encode('ascii','replace') ! print hammie.formatclues(clues) ! if targetf: ! message.MoveTo(targetf.ID) ! if under <> None and prob <= under: ! nunder = nunder + 1 ! if detail: ! print "***Under threshold", prob, under ! for i in range(1, message.recipients.Count+1): ! print message.Recipients[i].Address, ! print message.Subject.encode('ascii','replace') ! print hammie.formatclues(clues) ! if targetf: ! message.MoveTo(targetf.ID) ! message = messages.GetNext() ! print "Total %d, over %d under %d" % (n, nover, nunder) ! def usage(): ! print "Usage: filter.py --bayes=bayes.pck --from=folder,folder,folder [--to=folder] [--detail] [--over=float|--under=float]" ! print """Example: python filter.py --from=/Personal/Hotmail,/Personal/ExJunk ! --over=.35 --detail --to=/SpamMaybe""" def main(): ! 
from hammie import createbayes ! db_name = 'bayes.pck' ! folders = [] ! options = ["over=", "under=", "bayes=", "to=", "from=", "detail"] ! dodetail=targetName=to=over=under= None ! opts,args = getopt.getopt(sys.argv[1:], None, options) ! if args: ! usage() ! sys.exit(1) ! for opt, arg in opts: ! if opt == "--under": under = float(arg) ! elif opt == "--over": over = float(arg) ! elif opt == "--bayes": db_name = arg ! elif opt == "--to": targetName = arg ! elif opt == "--from": folders = string.split(arg, ",") ! elif opt == "--detail": dodetail = 1 ! if not (over or under) or not folders: ! usage() ! sys.exit(1) ! bayes = cPickle.load(open(db_name,'rb')) ! cwd = os.getcwd() ! session = win32com.client.Dispatch("MAPI.Session") ! session.Logon() ! personalFolders = findFolder(session.GetFolder(''), ! '/Top of Personal Folders') ! for folder in folders: ! print "Filtering %s, over: %s under %s" % (arg, over, under) ! filter(bayes, personalFolders, folder, targetName, over=over, ! under=under, detail=dodetail) ! session.Logoff() ! session = None ! print 'Done' if __name__ == "__main__": --- 4,78 ---- # Copyright PSF, license under the PSF license ! import sys, os ! from win32com.client import Dispatch, constants ! import pythoncom ! import rule + from hammie import Hammie ! def filter_folder(f, mgr, progress, filter): ! only_unread = filter.only_unread ! hammie = Hammie(mgr.bayes) ! num_messages = 0 ! for message in mgr.YieldMessageList(f): ! if progress.stop_requested(): ! break ! progress.tick() ! if only_unread and not message.Unread: ! continue ! try: ! headers = message.Fields[0x7D001E].Value headers = headers.encode('ascii', 'replace') body = message.Text.encode('ascii', 'replace') ! text = headers + body ! except pythoncom.com_error, d: ! progress.warning("Failed to get a message: %s" % (str(d),) ) continue ! prob, clues = hammie.score(text, evidence=1) ! did_this_message = False ! for rule in mgr.config.rules: ! if rule.enabled: ! try: ! 
if rule.Act(mgr, message, prob): ! did_this_message = True ! except: ! print "Rule failed!" ! import traceback ! traceback.print_exc() ! if did_this_message: ! num_messages += 1 ! return num_messages ! ! def filterer(mgr, progress, filter): ! if not filter.folder_ids: ! progress.error("You must specify at least one folder") ! return ! ! progress.set_status("Counting messages") ! folders = mgr.BuildFolderList(filter.folder_ids, filter.include_sub) ! num_msgs = 0 ! for f in folders: ! num_msgs += f.Messages.Count + 1 ! progress.set_max_ticks(num_msgs+3) ! num = 0 ! for f in folders: ! progress.set_status("Filtering folder '%s'" % (f.Name.encode("ascii", "replace"),)) ! num += filter_folder(f, mgr, progress, filter) ! if progress.stop_requested(): ! return ! progress.set_status("Filter acted upon %d messages" % (num,)) def main(): ! import manager ! mgr = manager.GetManager() ! ! import dialogs.FilterDialog ! d = dialogs.FilterDialog.FilterArrivalsDialog(mgr, rule.Rule, filterer) ! d.DoModal() ! mgr.Save() ! mgr.Close() if __name__ == "__main__": Index: train.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/train.py,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** train.py 4 Oct 2002 22:28:47 -0000 1.1 --- train.py 19 Oct 2002 16:23:37 -0000 1.2 *************** *** 6,92 **** import sys, os, os.path, getopt, cPickle, string import win32com.client import classifier from tokenizer import tokenize ! def findFolder(f,findName, name=""): ! folders = f.Folders ! folder = folders.GetFirst() ! while folder: ! nm = "%s/%s" % (name, folder.Name) ! nm = nm.encode('ascii','replace') ! if nm == findName: ! return folder ! try: ! f = findFolder(folder, findName, nm) ! if f: return f ! except: ! pass ! folder = folders.GetNext() ! return None ! ! def train( bayes, rootFolder,folderName, isspam): ! f = findFolder(rootFolder, folderName) ! if not f: ! 
print "Can't find folder", folderName ! return ! messages = f.Messages ! if not messages: ! print "Can't find messages in folder", folderName ! return ! message = messages.GetFirst() ! while message: try: ! headers = "%s" % message.fields[0x7D001E] headers = headers.encode('ascii', 'replace') body = message.Text.encode('ascii', 'replace') ! text = headers + body ! bayes.learn(tokenize(text), isspam, False) ! except: ! pass ! message = messages.GetNext() ! def usage(): ! print "Usage: train.py --bayes=bayes.pck --spam=folder,folder,folder --ham=folder,folder,folder" ! print """Example: python train.py --bayes=bayes.pck --spam=/JunkMail,/Personal/Hotmail,/Personal/Spam --ham="/Dragon People,/WebReply,/House,/Tenberry,/Receipts and coupons,/Rational and MIT,/Lists/List-mod_python,/Lists/List-other,/List-Webware,/Microsoft,/Fishing,/Ebusiness,/Amazon" """ ! def main(): ! db_name = 'bayes.pck' ! spam = [] ! ham = [] ! options = ["ham=", "spam=", "bayes="] ! opts,args = getopt.getopt(sys.argv[1:], None, options) ! if args: ! usage() ! sys.exit(1) ! for opt,arg in opts: ! if opt == "--spam": spam = string.split(arg, ',') ! elif opt == "--ham": ham = string.split(arg,',') ! elif opt == "--bayes": db_name = arg ! if not spam and not ham: ! usage() ! sys.exit(1) ! cwd = os.getcwd() ! session = win32com.client.Dispatch("MAPI.Session") ! session.Logon() ! personalFolders = findFolder(session.GetFolder(''), ! '/Top of Personal Folders') ! bayes = classifier.Bayes() ! for folder in spam: ! print "Training with %s as spam" % folder ! train(bayes, personalFolders,folder, 1) ! for folder in ham: ! print "Training with %s as ham" % folder ! train(bayes, personalFolders,folder, 0) ! session.Logoff() ! session = None ! print 'Updating probabilities...' bayes.update_probabilities() ! print ("Done with training %s, built with %d examples and %d counter " ! "examples" % (db_name, bayes.nspam, bayes.nham)) ! db_name = os.path.join(cwd, db_name) ! print 'Writing DB...' ! 
cPickle.dump(bayes, open(db_name,"wb"), 1) if __name__ == "__main__": --- 6,82 ---- import sys, os, os.path, getopt, cPickle, string import win32com.client + import pythoncom + import win32con + import classifier from tokenizer import tokenize ! def train_folder( f, isspam, mgr, progress): ! for message in mgr.YieldMessageList(f): ! if progress.stop_requested(): ! break ! progress.tick() try: ! # work with MAPI until we work out how to get headers from outlook ! message = mgr.mapi.GetMessage(message.ID) ! headers = message.Fields[0x7D001E].Value headers = headers.encode('ascii', 'replace') body = message.Text.encode('ascii', 'replace') ! except pythoncom.com_error: ! progress.warning("failed to get a message") ! continue ! text = headers + body ! mgr.bayes.learn(tokenize(text), isspam, False) ! # Called back from the dialog to do the actual training. ! def trainer(mgr, progress): ! pythoncom.CoInitialize() ! config = mgr.config ! mgr.InitNewBayes() ! bayes = mgr.bayes ! session = mgr.mapi ! ! if not config.training.ham_folder_ids or not config.training.spam_folder_ids: ! progress.error("You must specify at least one spam, and one good folder") ! return ! progress.set_status("Counting messages") ! ham_folders = mgr.BuildFolderList(config.training.ham_folder_ids, config.training.ham_include_sub) ! spam_folders = mgr.BuildFolderList(config.training.spam_folder_ids, config.training.ham_include_sub) ! num_msgs = 0 ! for f in ham_folders + spam_folders: ! num_msgs += f.Messages.Count + 1 ! progress.set_max_ticks(num_msgs+3) + for f in ham_folders: + progress.set_status("Processing good folder '%s'" % (f.Name.encode("ascii", "replace"),)) + train_folder(f, 0, mgr, progress) + if progress.stop_requested(): + return + for f in spam_folders: + progress.set_status("Processing spam folder '%s'" % (f.Name.encode("ascii", "replace"),)) + train_folder(f, 1, mgr, progress) + if progress.stop_requested(): + return ! progress.tick() ! 
progress.set_status('Updating probabilities...') bayes.update_probabilities() ! progress.tick() ! if progress.stop_requested(): ! return ! mgr.bayes_dirty = True ! progress.set_status("Completed training with %d spam and %d good messages" % (bayes.nspam, bayes.nham)) ! ! def main(): ! import manager ! mgr = manager.GetManager() ! ! import dialogs.TrainingDialog ! d = dialogs.TrainingDialog.TrainingDialog(mgr, trainer) ! d.DoModal() ! ! mgr.Save() ! mgr.Close() if __name__ == "__main__": Index: README.txt =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/README.txt,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** README.txt 4 Oct 2002 22:28:47 -0000 1.1 --- README.txt 19 Oct 2002 16:23:37 -0000 1.2 *************** *** 1,5 **** This directory contains tools for using the classifier with Microsoft ! Outlook 2000, courtesy of Sean True. Note that you need Python's win32com ! extensions. train.py --- 1,10 ---- This directory contains tools for using the classifier with Microsoft ! Outlook 2000, courtesy of Sean True and Mark Hammond. Note that you need ! Python's win32com extensions (http://starship.python.net/crew/mhammond) ! ! ** NOTE ** - You also need CDO installed. This comes with Outlook 2k, but is ! not installed by default. You may need to find your Office 2000 CD, select ! Add/Remove components, and find CDO under Outlook. If you see a COM error ! complaining about "MAPI.Session", this is your problem. train.py *************** *** 7,17 **** filter.py ! Moves msgs among Outlook Mail folders, based on classifier score. ! ! spam.py ! Dump Outlook Mail folders into the spam reservoir. ! ! Comments from Sean: --- 12,24 ---- filter.py ! Moves and modifies msgs among Outlook Mail folders, based on classifier ! score. + classify.py + Creates a field in each message with the classifier score.
Once run, + the Outlook Field Chooser can be used to display, sort etc the field, + or used to change formatting of these messages. The field will appear + in "user defined fields" + Comments from Sean: *************** *** 24,58 **** filter. Closing and reopening Outlook always seems to restore things, with no fuss. Your mileage may vary. Buyer beware. Worth what you paid. ! ! Brad Morgan comments that in an environment with multiple InfoStores ! (message stores?), my simple folder finder does not work. He uses this ! work around: ! ! =============== ! # This didn't work: ! # personalFolders = findFolder(folder, 'Personal Folders') ! # ! # The following was required: ! # (Note: I have two infostores and I've hard-coded the index of ! # 'Personal Folders') ! ! infostores = session.InfoStores ! print "There are %d infostores" % infostores.Count ! infostore = infostores[1] ! print "Infostore = ", infostore.Name ! personalFolders = infostore.RootFolder ! ================= ! ! It deserves an option to select the infostore wanted by name. ! ! Enjoy. Copyright transferred to PSF from Sean D. True and WebReply.com. Licensed under PSF, see Tim Peters for IANAL interpretation. Ask me technical questions, and if your mail doesn't get eaten by a broken spam filter, I'll try to help. - - -- Sean seant@iname.com --- 31,48 ---- filter. Closing and reopening Outlook always seems to restore things, with no fuss. Your mileage may vary. Buyer beware. Worth what you paid. ! (Mark hasn't seen this) Copyright transferred to PSF from Sean D. True and WebReply.com. Licensed under PSF, see Tim Peters for IANAL interpretation. + Copyright transferred to PSF from Mark Hammond. + Licensed under PSF, see Tim Peters for IANAL interpretation. + Ask me technical questions, and if your mail doesn't get eaten by a broken spam filter, I'll try to help. 
-- Sean seant@iname.com + + Ask Sean all the technical questions + -- Mark + mhammond@skippinet.com.au From mhammond@users.sourceforge.net Sat Oct 19 17:24:15 2002 From: mhammond@users.sourceforge.net (Mark Hammond) Date: Sat, 19 Oct 2002 09:24:15 -0700 Subject: [Spambayes-checkins] spambayes/Outlook2000/dialogs AsyncDialog.py,NONE,1.1 ClassifyDialog.py,NONE,1.1 DialogGlobals.py,NONE,1.1 FilterDialog.py,NONE,1.1 FolderSelector.py,NONE,1.1 RuleDialog.py,NONE,1.1 TrainingDialog.py,NONE,1.1__init__.py,NONE,1.1 Message-ID: Update of /cvsroot/spambayes/spambayes/Outlook2000/dialogs In directory usw-pr-cvs1:/tmp/cvs-serv20352 Added Files: AsyncDialog.py ClassifyDialog.py DialogGlobals.py FilterDialog.py FolderSelector.py RuleDialog.py TrainingDialog.py __init__.py Log Message: First cut of the GUI itself. --- NEW FILE: AsyncDialog.py --- # Base class for an "async" dialog. from pywin.mfc import dialog import win32con import commctrl import win32ui import win32api IDC_START = 1100 IDC_PROGRESS = 1101 IDC_PROGRESS_TEXT = 1102 def MAKELPARAM(low, high): return ((0x0000FFFF & high) << 16) | (0x0000FFFF & low) MYWM_SETSTATUS = win32con.WM_USER+11 MYWM_SETWARNING = win32con.WM_USER+12 MYWM_SETERROR = win32con.WM_USER+13 MYWM_FINISHED = win32con.WM_USER+14 # This is called from another thread - hence we need to jump through hoops! 
class _Progress:
    """Progress reporter handed to the worker thread.

    The methods are called from a thread other than the dialog's, so nothing
    here touches a control directly: each method either posts a window
    message, or stores text on the dialog and posts one of the MYWM_*
    notifications so the dialog can pick the text up itself.
    """

    def __init__(self, dlg):
        progress_bar = dlg.GetDlgItem(IDC_PROGRESS)
        self.hprogress = progress_bar.GetSafeHwnd()
        self.hdlg = dlg.GetSafeHwnd()
        self.dlg = dlg
        self.stopping = False

    def _post_to_bar(self, message, wparam, lparam):
        # Every progress-bar update goes through PostMessage.
        win32api.PostMessage(self.hprogress, message, wparam, lparam)

    def _announce(self, attr_name, text, message):
        # Park the text on the dialog, then notify the dialog about it.
        setattr(self.dlg, attr_name, text)
        win32api.PostMessage(self.hdlg, message)

    def set_max_ticks(self, m):
        """Post a new range (packed as MAKELPARAM(0, m)), a step of 1 and
        position 0 to the progress bar."""
        self._post_to_bar(commctrl.PBM_SETRANGE, 0, MAKELPARAM(0, m))
        self._post_to_bar(commctrl.PBM_SETSTEP, 1, 0)
        self._post_to_bar(commctrl.PBM_SETPOS, 0, 0)

    def tick(self):
        """Advance the progress bar by one step."""
        self._post_to_bar(commctrl.PBM_STEPIT, 0, 0)

    def set_status(self, text):
        """Store *text* as the dialog's status and post MYWM_SETSTATUS."""
        self._announce("progress_status", text, MYWM_SETSTATUS)

    def warning(self, text):
        """Store *text* as the dialog's warning and post MYWM_SETWARNING."""
        self._announce("progress_warning", text, MYWM_SETWARNING)

    def error(self, text):
        """Store *text* as the dialog's error and post MYWM_SETERROR."""
        self._announce("progress_error", text, MYWM_SETERROR)

    def request_stop(self):
        """Record that the worker has been asked to stop."""
        self.stopping = True

    def stop_requested(self):
        """Return the flag set by request_stop()."""
        return self.stopping
def StartProcess(self):
    """Start the background process, or ask a running one to stop.

    When a process is already running, only a stop request is recorded on
    its progress object.  Otherwise the controls listed in
    ``disable_while_running_ids`` are disabled, the start button gets the
    "stop" caption, the progress bar is shown and ``_DoProcess`` is run on
    a new thread.  When that thread ends it posts MYWM_FINISHED to the
    dialog, with the stop-requested flag as the message's wParam.
    """
    if self.running:
        self.progress.request_stop()
        return

    for ctrl_id in self.disable_while_running_ids:
        self.GetDlgItem(ctrl_id).EnableWindow(0)
    self.SetDlgItemText(IDC_START, self.process_stop_text)
    self.SetDlgItemText(IDC_PROGRESS_TEXT, "")
    self.GetDlgItem(IDC_PROGRESS).ShowWindow(win32con.SW_SHOW)

    hwnd = self.GetSafeHwnd()
    progress = _Progress(self)

    # Publish the "running" state here, on the calling thread, *before* the
    # worker exists.  Previously the worker set these itself, so a second
    # click arriving before the worker was scheduled saw running == False
    # (starting a second worker) or found no self.progress to stop.
    self.progress = progress
    self.seen_finished = False
    self.running = True

    # Local function for the thread target that notifies us when finished.
    def thread_target(h, progress):
        try:
            self._DoProcess()
        finally:
            # Use the local progress object: self.progress is cleared below.
            win32api.PostMessage(h, MYWM_FINISHED, progress.stop_requested())
            self.running = False
            self.progress = None

    import threading
    t = threading.Thread(target=thread_target, args=(hwnd, progress))
    t.start()
def UpdateStatus(self):
    """Show the names of the configured folders in the folders label.

    Each id in ``self.config.folder_ids`` is looked up through
    ``self.mapi``; a folder whose lookup fails with a COM error is shown as
    an empty name.  The names are joined with "; ".
    """
    # ClassifyDialog.py has no module-level "import pythoncom", so without
    # this local import the except clause below raised NameError as soon as
    # a folder lookup actually failed.
    import pythoncom
    names = []
    for eid in self.config.folder_ids:
        try:
            name = self.mapi.GetFolder(eid).Name.encode("ascii", "replace")
        except pythoncom.com_error:
            name = ""
        names.append(name)
    self.SetDlgItemText(IDC_STATIC_FOLDERS, "; ".join(names))
def INDEXTOSTATEIMAGEMASK(i):
    """Shift state-image index *i* left by 12 bits.

    Python rendering of the commctrl.h macro of the same name.
    """
    state_image_shift = 12
    return i << state_image_shift
def Refresh(self, selIndex = None):
    """Rebuild the tree from ``self.rules``.

    The current checkbox states are first copied into the rules, then every
    item is deleted and re-inserted with a checked or unchecked state image
    according to ``rule.enabled``.  The item whose position equals
    *selIndex* is selected; when *selIndex* is None the currently selected
    rule index is used.
    """
    if selIndex is None:
        selIndex = self.GetSelectedRuleIndex()
    self.SyncEnabledStates()

    tree = self.list
    tree.DeleteAllItems()
    position = 0
    for rule in self.rules:
        image_index = IIL_UNCHECKED
        if rule.enabled:
            image_index = IIL_CHECKED
        state_bits = INDEXTOSTATEIMAGEMASK(image_index)
        state_mask = commctrl.TVIS_STATEIMAGEMASK
        bitmap = 5
        # The item's position in self.rules travels as the last element.
        item = (None, state_bits, state_mask, rule.name,
                bitmap, bitmap, 0, position)
        handle = tree.InsertItem(commctrl.TVI_ROOT, 0, item)
        if position == selIndex:
            tree.SelectItem(handle)
        position += 1
def SyncEnabledStates(self):
    """Copy each tree item's checkbox state into its rule's ``enabled``.

    There is no reliable notification when a checkbox changes, so callers
    invoke this whenever they need ``rule.enabled`` to be current.
    """
    unchecked_bits = INDEXTOSTATEIMAGEMASK(IIL_UNCHECKED)
    checked_bits = INDEXTOSTATEIMAGEMASK(IIL_CHECKED)
    state_mask = unchecked_bits | checked_bits
    for handle, _position, rule in self._YieldItems():
        image_index = self.list.GetItemState(handle, state_mask) >> 12
        # Image indexes are 1-based (IIL_UNCHECKED == 1, IIL_CHECKED == 2).
        rule.enabled = image_index - 1
def OnButEdit(self, id, code):
    """Edit the selected rule in a modal RuleDialog.

    The dialog works on a shallow copy of the rule; the copy replaces the
    original in ``self.rules`` only when the dialog is closed with OK.
    Nothing happens when no rule is selected.
    """
    if code == win32con.BN_CLICKED:
        self.SyncEnabledStates()
        index = self.GetSelectedRuleIndex()
        if index is None:
            # GetSelectedRuleIndex() returns None when nothing is selected.
            # This handler is also reached from the tree's double-click
            # notification, and self.rules[None] would raise TypeError.
            return
        rule = copy.copy(self.rules[index])
        d = RuleDialog.RuleDialog(rule, self.parent.mgr)
        if d.DoModal() == win32con.IDOK:
            self.rules[index] = rule
            self.Refresh()
def OnOK(self):
    """Close the dialog, first saving the rule checkboxes.

    The rule list is not notified when a checkbox is toggled, so the
    checkbox states must be copied into the rules explicitly.  Without this
    call, a rule enabled or disabled just before pressing Close kept its
    old ``enabled`` value.
    """
    self.list.SyncEnabledStates()
    return dialog.Dialog.OnOK(self)
def OnButFilterNow(self, id, code):
    """Run the modal "Filter Now" dialog.

    The rules' enabled states are saved before the dialog is shown and
    restored afterwards, then the rule list is refreshed.
    """
    if code == win32con.BN_CLICKED:
        self.list.SyncEnabledStates()
        self.list.PushEnabledStates()
        try:
            d = FilterNowDialog(self.mgr, self.rule_factory, self.filterer)
            d.DoModal()
        finally:
            # Restore the saved states even if the dialog raised; otherwise
            # whatever was toggled in the Filter Now dialog would stay in
            # the rules.
            self.list.PopEnabledStates()
            self.list.Refresh()
def UpdateFolderNames(self):
    """Show the names of the "filter now" folders in the folder label.

    A folder whose lookup fails with a COM error contributes an empty
    name; the names are joined with "; ".
    """
    display_names = []
    for folder_id in self.mgr.config.filter_now.folder_ids:
        try:
            folder = self.mgr.mapi.GetFolder(folder_id)
            display_name = folder.Name.encode("ascii", "replace")
        except pythoncom.com_error:
            display_name = ""
        display_names.append(display_name)
    label_text = "; ".join(display_names)
    self.SetDlgItemText(IDC_FOLDER_NAMES, label_text)
def _DoProcess(self):
    """Worker body: run the filterer between the manager's thread hooks.

    With no filterer (the dialog's self-test mode) only a test line is
    printed.  Otherwise ``mgr.WorkerThreadStarting()`` is called, the
    filterer runs with the manager, the progress object and the
    ``filter_now`` configuration, and ``mgr.WorkerThreadEnding()`` is called
    afterwards whether or not the filterer raised.
    """
    run_filter = self.filterer
    if run_filter is None:
        print("Testing, testing, 1...2...3...")
        return

    manager = self.mgr
    manager.WorkerThreadStarting()
    try:
        run_filter(manager, self.progress, manager.config.filter_now)
    finally:
        manager.WorkerThreadEnding()
def BuildFolderTree(session):
    """Return a FolderSpec tree covering every infostore of *session*.

    The returned root is a FolderSpec named "root" with no folder of its
    own.  It has one child per infostore, named after the infostore, whose
    children are built from the infostore's root folder.
    """
    stores = session.InfoStores
    tree = FolderSpec(None, "root")
    # The InfoStores collection is indexed 1..Count here.
    for position in range(1, stores.Count + 1):
        store = stores[position]
        top_folder = store.RootFolder
        subfolders = top_folder.Folders
        store_spec = FolderSpec(top_folder, store.Name)
        store_spec.children = _BuildFolders(subfolders)
        tree.children.append(store_spec)
    return tree
def _MakeItemParam(self, item):
    """Register *item* under a fresh integer id and return that id.

    Ids are handed out sequentially from ``self.next_item_id``, and
    ``self.item_map`` maps each id back to its item.
    """
    allocated = self.next_item_id
    self.next_item_id = allocated + 1
    self.item_map[allocated] = item
    return allocated
def _YieldChildren(self, h):
    """Generate ``(info, spec)`` for every item below tree item *h*.

    Items come out depth-first: each child is followed by its own
    descendants before its next sibling.  ``info`` is what
    ``self.list.GetItem`` returns for the item and ``spec`` is the object
    registered in ``self.item_map`` under ``info[7]``.
    """
    def next_handle(handle, relation):
        # A win32ui.error from GetNextItem is treated as "no such item".
        try:
            return self.list.GetNextItem(handle, relation)
        except win32ui.error:
            return None

    current = next_handle(h, commctrl.TVGN_CHILD)
    while current is not None:
        item_info = self.list.GetItem(current)
        yield item_info, self.item_map[item_info[7]]
        # Descend before moving on to the next sibling.
        for descendant in self._YieldChildren(current):
            yield descendant
        current = next_handle(current, commctrl.TVGN_NEXT)
for info, spec in self._YieldAllChildren(): checked = (info[1] >> 12) - 1 if checked: yield info, spec def OnInitDialog (self): caption = "%s folder" % (self.select_desc_noun,) if not self.single_select: caption += "(s)" self.SetWindowText(caption) self.SetDlgItemText(IDC_BUTTON_SEARCHSUB, self.checkbox_text) if self.checkbox_state is None: self.GetDlgItem(IDC_BUTTON_SEARCHSUB).ShowWindow(win32con.SW_HIDE) else: self.GetDlgItem(IDC_BUTTON_SEARCHSUB).SetCheck(self.checkbox_state) self.list = self.GetDlgItem(win32ui.IDC_LIST1) self.HookNotify(self.OnTreeItemExpanding, commctrl.TVN_ITEMEXPANDING) self.HookNotify(self.OnTreeItemSelChanged, commctrl.TVN_SELCHANGED) self.HookNotify(self.OnTreeItemClick, commctrl.NM_CLICK) self.HookNotify(self.OnTreeItemDoubleClick, commctrl.NM_DBLCLK) self.HookCommand(self.OnClearAll, IDC_BUTTON_CLEARALL) bitmapID = win32ui.IDB_HIERFOLDERS bitmapMask = win32api.RGB(0,0,255) self.imageList = win32ui.CreateImageList(bitmapID, 16, 0, bitmapMask) self.list.SetImageList(self.imageList, commctrl.LVSIL_NORMAL) if self.single_select: # Remove the checkbox style from the list for single-selection style = win32api.GetWindowLong(self.list.GetSafeHwnd(), win32con.GWL_STYLE) style = style & ~commctrl.TVS_CHECKBOXES win32api.SetWindowLong(self.list.GetSafeHwnd(), win32con.GWL_STYLE, style) # Hide "clear all" self.GetDlgItem(IDC_BUTTON_CLEARALL).ShowWindow(win32con.SW_HIDE) tree = BuildFolderTree(self.mapi) self._InsertSubFolders(0, tree) self.selected_ids = [] # wipe this out while we are alive. 
def _DoUpdateStatus(self, id, timeval):
    """Timer callback: refresh both status lines, then kill the timer.

    The first line shows the number of selected folders and the second
    lists at most the first 20 of their names, joined with "; ".
    """
    try:
        shown_names = []
        checked_total = 0
        for info, spec in self._YieldCheckedChildren():
            checked_total += 1
            # Every folder is counted, but only the first 20 are named.
            if checked_total <= 20:
                shown_names.append(info[3])
        summary = "%s%s %d folder" % (self.select_desc_noun,
                                      self.select_desc_noun_suffix,
                                      checked_total)
        if checked_total != 1:
            summary = summary + "s"
        self.SetDlgItemText(IDC_STATUS1, summary)
        self.SetDlgItemText(IDC_STATUS2, "; ".join(shown_names))
    finally:
        # Kill the timer that invoked us, even if the update failed.
        import timer
        timer.kill_timer(id)
def TestWithMAPI():
    """Manual smoke test: show a single-select FolderSelector on a live
    MAPI session and print what GetSelectedIDs() returns afterwards."""
    from win32com.client import Dispatch
    session = Dispatch("MAPI.Session")
    session.Logon("", "", False, False)
    # Hard-coded folder id passed as the initial selection.
    preselected = [u'0000000071C4408983B0B24F8863EE66A8F79AFF82800000']
    selector = FolderSelector(session, preselected, single_select = True)
    selector.DoModal()
    print(selector.GetSelectedIDs())
["Define Rule", (0, 0, 249, 199), style, None, (8, "MS Sans Serif")], # Children [STATIC, "Enter a name for the filter", -1, ( 7, 6, 94, 11), cs], [EDIT, "", IDC_RULE_NAME, (120, 6, 118, 14), csts | win32con.ES_AUTOHSCROLL | win32con.WS_BORDER], [STATIC, "When the spam rating is between", -1, ( 7, 23, 107, 10), cs], ["msctls_trackbar32", "", IDC_SLIDER_LOW, ( 7, 38, 112, 8), cs | commctrl.TBS_BOTH | commctrl.TBS_NOTICKS], [EDIT, "", IDC_EDIT_LOW, (120, 34, 59, 14), csts | win32con.ES_AUTOHSCROLL | win32con.WS_BORDER], [STATIC, "and", -1, ( 7, 46, 107, 10), cs], ["msctls_trackbar32", "", IDC_SLIDER_HIGH, ( 7, 57, 112, 8), cs | commctrl.TBS_BOTH | commctrl.TBS_NOTICKS], [EDIT, "", IDC_EDIT_HIGH, (120, 54, 59, 14), csts | win32con.ES_AUTOHSCROLL | win32con.WS_BORDER], [STATIC, "Take the following actions", -1, ( 7, 72, 107, 10), cs], [BUTTON, "Copy/Move message", -1, ( 7, 86, 235, 35), cs | win32con.BS_GROUPBOX], [COMBOBOX, "", IDC_ACTION, ( 14, 97, 55, 40), csts | win32con.CBS_DROPDOWNLIST | win32con.WS_VSCROLL], [STATIC, "to folder", -1, ( 79, 99, 31, 10), cs], [STATIC, "", IDC_FOLDER_NAME, ( 120, 97, 59, 14), cs | win32con.SS_LEFTNOWORDWRAP | win32con.SS_CENTERIMAGE | win32con.SS_SUNKEN], [BUTTON, '&Browse', IDC_BROWSE, (186, 97, 50, 14), csts | win32con.BS_PUSHBUTTON], [BUTTON, "Modify message", -1, ( 7, 129, 235, 45), cs | win32con.BS_GROUPBOX], [BUTTON, "Create a flag on the message", IDC_FLAG, ( 11, 137, 109, 16), csts | win32con.BS_AUTOCHECKBOX], [BUTTON, "Write spam score to field", IDC_WRITE_FIELD, (11,151,108, 15), csts | win32con.BS_AUTOCHECKBOX], [EDIT, "", IDC_FIELD_NAME, (120, 152, 59, 14), csts | win32con.ES_AUTOHSCROLL | win32con.WS_BORDER], [BUTTON, 'OK', win32con.IDOK, (129, 178, 50, 14), csts | win32con.BS_DEFPUSHBUTTON], [BUTTON, 'Cancel', win32con.IDCANCEL, (192, 178, 50, 14), csts | win32con.BS_PUSHBUTTON], ] def __init__(self, rule, mgr = None): self.rule = rule self.mgr = mgr self.folder_id = rule.folder_id dialog.Dialog.__init__ (self, 
def _UpdateFolderName(self):
    """Show the name of the rule's target folder in the folder label.

    The label is left empty when no folder id is set, when there is no
    manager or the manager has no MAPI session, or when the folder lookup
    fails with a COM error.
    """
    name = ""
    # The constructor accepts mgr=None, so check the manager before
    # touching self.mgr.mapi; previously that case raised AttributeError.
    if self.folder_id and self.mgr is not None and self.mgr.mapi is not None:
        try:
            folder = self.mgr.mapi.GetFolder(self.folder_id)
            name = folder.Name.encode("ascii", "replace")
        except pythoncom.com_error:
            name = ""
    self.SetDlgItemText(IDC_FOLDER_NAME, name)
def OnButWriteField(self, id, code): if code == win32con.BN_CLICKED: edit = self.GetDlgItem(IDC_FIELD_NAME) edit.EnableWindow( self.GetDlgItem(IDC_WRITE_FIELD).GetCheck() ) return 1 def OnButBrowse(self, id, code): if code == win32con.BN_CLICKED: import FolderSelector ids = [self.folder_id] d = FolderSelector.FolderSelector(self.mgr.mapi, ids,single_select=True,checkbox_state=None)#, allow_multi=False) if d.DoModal()==win32con.IDOK: new_ids, cb_state = d.GetSelectedIDs() if new_ids: self.folder_id = new_ids[0] self._UpdateFolderName() return 1 def OnSlider(self, params): lParam = params[3] slider = self.GetDlgItem(IDC_SLIDER_HIGH) if slider.GetSafeHwnd() == lParam: idc_edit = IDC_EDIT_HIGH else: slider = self.GetDlgItem(IDC_SLIDER_LOW) assert slider.GetSafeHwnd() == lParam idc_edit = IDC_EDIT_LOW self.SetDlgItemText(idc_edit, "%.2f" % (slider.GetPos() / 100.0)) def _InitSlider(self, idc_slider, idc_edit): slider = self.GetDlgItem(idc_slider) slider.SetRange(0, 100, 0) slider.SetLineSize(1) slider.SetPageSize(5) self._AdjustSliderToEdit(idc_slider, idc_edit) def _AdjustSliderToEdit(self, idc_slider, idc_edit): slider = self.GetDlgItem(idc_slider) edit = self.GetDlgItem(idc_edit) try: fval = float(edit.GetWindowText()) except ValueError: return slider.SetPos(int(fval*100)) def _CheckEdit(self, idc, rule, attr): try: val = float(self.GetDlgItemText(idc)) if val < 0 or val > 1.0: raise ValueError except ValueError: self.MessageBox("Please enter a number between 0 and 1") self.GetDlgItem(idc).SetFocus() return False setattr(rule, attr, val) return True def OnOK(self): rule = self.rule if not self._CheckEdit(IDC_EDIT_HIGH, rule, "max") or \ not self._CheckEdit(IDC_EDIT_LOW, rule, "min"): return 1 combo = self.GetDlgItem(IDC_ACTION) rule.name = self.GetDlgItemText(IDC_RULE_NAME) rule.action = combo.GetLBText(combo.GetCurSel()) rule.flag_message = self.GetDlgItem(IDC_FLAG).GetCheck() rule.write_field = self.GetDlgItem(IDC_WRITE_FIELD).GetCheck() rule.write_field_name = 
self.GetDlgItemText(IDC_FIELD_NAME) rule.folder_id = self.folder_id problem = rule.GetProblem(self.mgr) if problem is not None: self.MessageBox(problem) return 1 return self._obj_.OnOK() def OnCancel(self): return self._obj_.OnCancel() if __name__=='__main__': from win32com.client import Dispatch try: mapi = Dispatch("MAPI.Session") mapi.Logon() except pythoncom.com_error: mapi = None class Rule: def __init__(self): self.name = "My Rule" self.min = 0.1 self.max = 0.9 self.action = "Move" self.flag_message = True self.write_field = True self.write_field_name = "SpamProb" self.folder_id = "" def GetProblem(self, mgr): if self.min > self.max: return "max must be > min" class Manager: pass mgr = Manager() mgr.mapi = mapi rule = Rule() d = RuleDialog(rule, mgr) if d.DoModal() == win32con.IDOK: print "Name:", rule.name print "min,max:", rule.min, rule.max print "Action:", rule.action print "Write Field:", rule.write_field, ", to:", rule.write_field_name print "Flag message:", rule.flag_message --- NEW FILE: TrainingDialog.py --- from pywin.mfc import dialog import win32con import commctrl import win32ui import win32api #these are the atom numbers defined by Windows for basic dialog controls BUTTON = 0x80 EDIT = 0x81 STATIC = 0x82 LISTBOX = 0x83 SCROLLBAR = 0x84 COMBOBOX = 0x85 IDC_STATIC_HAM = 1001 IDC_BROWSE_HAM = 1002 IDC_STATIC_SPAM = 1003 IDC_BROWSE_SPAM = 1004 from AsyncDialog import IDC_START, IDC_PROGRESS, IDC_PROGRESS_TEXT, AsyncDialogBase class TrainingDialog(AsyncDialogBase): style = win32con.DS_MODALFRAME | win32con.WS_POPUP | win32con.WS_VISIBLE | win32con.WS_CAPTION | win32con.WS_SYSMENU | win32con.DS_SETFONT cs = win32con.WS_CHILD | win32con.WS_VISIBLE ham_title = "Folders with known good messages" spam_title = "Folders with known spam or other junk messages" process_start_text = "&Train now" process_stop_text = "Stop &training" dt = [ # Dialog itself. 
["Training", (0, 0, 241, 118), style, None, (8, "MS Sans Serif")], # Children [STATIC, ham_title, -1, ( 7, 6, 131, 11), cs ], [STATIC, "", IDC_STATIC_HAM, ( 7, 17, 167, 12), cs | win32con.SS_SUNKEN | win32con.SS_LEFTNOWORDWRAP | win32con.SS_CENTERIMAGE], [BUTTON, '&Browse', IDC_BROWSE_HAM, (184, 17, 50, 14), cs | win32con.BS_PUSHBUTTON | win32con.WS_TABSTOP], [STATIC, spam_title, -1, ( 7, 36, 171, 9), cs ], [STATIC, "", IDC_STATIC_SPAM, ( 7, 47, 167, 12), cs | win32con.SS_SUNKEN | win32con.SS_LEFTNOWORDWRAP | win32con.SS_CENTERIMAGE], [BUTTON, 'Brow&se', IDC_BROWSE_SPAM, (184, 47, 50, 14), cs | win32con.BS_PUSHBUTTON | win32con.WS_TABSTOP], [BUTTON, process_start_text, IDC_START, ( 7, 97, 50, 14), cs | win32con.BS_PUSHBUTTON | win32con.WS_TABSTOP], ["msctls_progress32", '', IDC_PROGRESS, ( 7, 68, 166, 11), cs | win32con.WS_BORDER], [STATIC, '', IDC_PROGRESS_TEXT, ( 7, 84, 227, 10), cs ], [BUTTON, 'Close', win32con.IDOK, (184, 97, 50, 14), cs | win32con.BS_DEFPUSHBUTTON | win32con.WS_TABSTOP], ] disable_while_running_ids = [IDC_BROWSE_HAM, IDC_BROWSE_SPAM, win32con.IDOK] def __init__ (self, mgr, trainer): self.mgr = mgr self.trainer = trainer self.config = mgr.config.training self.mapi = mgr.mapi AsyncDialogBase.__init__ (self, self.dt) def OnInitDialog(self): self.HookCommand(self.OnBrowse, IDC_BROWSE_SPAM) self.HookCommand(self.OnBrowse, IDC_BROWSE_HAM) self.UpdateStatus() return AsyncDialogBase.OnInitDialog (self) def UpdateStatus(self): names = [] for eid in self.config.ham_folder_ids: try: name = self.mapi.GetFolder(eid).Name.encode("ascii", "replace") except pythoncom.com_error: name = "" names.append(name) self.SetDlgItemText(IDC_STATIC_HAM, "; ".join(names)) names = [] for eid in self.config.spam_folder_ids: try: name = self.mapi.GetFolder(eid).Name.encode("ascii", "replace") except pythoncom.com_error: name = "" names.append(name) self.SetDlgItemText(IDC_STATIC_SPAM, "; ".join(names)) def OnBrowse(self, id, code): if code == win32con.BN_CLICKED: import 
FolderSelector if id==IDC_BROWSE_SPAM: l = self.config.spam_folder_ids sub_attr = "spam_include_sub" else: l = self.config.ham_folder_ids sub_attr = "ham_include_sub" include_sub = getattr(self.config, sub_attr) d = FolderSelector.FolderSelector(self.mapi, l, checkbox_state=include_sub) if d.DoModal()==win32con.IDOK: l[:], include_sub = d.GetSelectedIDs()[:] setattr(self.config, sub_attr, include_sub) self.UpdateStatus() def _DoProcess(self): self.mgr.WorkerThreadStarting() try: self.trainer(self.mgr, self.progress) finally: self.mgr.WorkerThreadEnding() ##if __name__=='__main__': ## d=TrainingDialog(None) ## d.DoModal() --- NEW FILE: __init__.py --- # This package defines dialog boxes used by the main # SpamBayes Outlook 2k integration code. From mhammond@users.sourceforge.net Sat Oct 19 17:24:44 2002 From: mhammond@users.sourceforge.net (Mark Hammond) Date: Sat, 19 Oct 2002 09:24:44 -0700 Subject: [Spambayes-checkins] spambayes/Outlook2000 default_bayes_customize.ini,NONE,1.1 Message-ID: Update of /cvsroot/spambayes/spambayes/Outlook2000 In directory usw-pr-cvs1:/tmp/cvs-serv20555 Added Files: default_bayes_customize.ini Log Message: Default ini file. --- NEW FILE: default_bayes_customize.ini --- # This is the default INI file for the Outlook addin Bayes database # This file must exist, or the addin considers itself confused. # As we decide default options, we can add them! 
From mhammond@users.sourceforge.net Sat Oct 19 17:25:32 2002 From: mhammond@users.sourceforge.net (Mark Hammond) Date: Sat, 19 Oct 2002 09:25:32 -0700 Subject: [Spambayes-checkins] spambayes/Outlook2000 spam.py,1.1,NONE Message-ID: Update of /cvsroot/spambayes/spambayes/Outlook2000 In directory usw-pr-cvs1:/tmp/cvs-serv20789 Removed Files: spam.py Log Message: Functionality replaced by filter.py --- spam.py DELETED --- From tim_one@users.sourceforge.net Sat Oct 19 18:20:12 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Sat, 19 Oct 2002 10:20:12 -0700 Subject: [Spambayes-checkins] spambayes/Outlook2000 manager.py,1.1,1.2 Message-ID: Update of /cvsroot/spambayes/spambayes/Outlook2000 In directory usw-pr-cvs1:/tmp/cvs-serv9173 Modified Files: manager.py Log Message: Get generators from the future, so this can work with 2.2.2. Index: manager.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/manager.py,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** manager.py 19 Oct 2002 16:23:37 -0000 1.1 --- manager.py 19 Oct 2002 17:20:07 -0000 1.2 *************** *** 1,2 **** --- 1,4 ---- + from __future__ import generators + import cPickle import os *************** *** 220,222 **** print "Error initializing Bayes manager" print d ! \ No newline at end of file --- 222,224 ---- print "Error initializing Bayes manager" print d ! From tim_one@users.sourceforge.net Sat Oct 19 19:14:03 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Sat, 19 Oct 2002 11:14:03 -0700 Subject: [Spambayes-checkins] spambayes/Outlook2000 classify.py,1.1,1.2 filter.py,1.2,1.3 manager.py,1.2,1.3 rule.py,1.1,1.2 train.py,1.2,1.3 Message-ID: Update of /cvsroot/spambayes/spambayes/Outlook2000 In directory usw-pr-cvs1:/tmp/cvs-serv29154 Modified Files: classify.py filter.py manager.py rule.py train.py Log Message: Whitespace normalization. 
Index: classify.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/classify.py,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** classify.py 19 Oct 2002 16:23:36 -0000 1.1 --- classify.py 19 Oct 2002 18:14:01 -0000 1.2 *************** *** 48,52 **** if not added_prop: message.Fields.Add(config.field_name, 5, prob) ! message.Update() except pythoncom.com_error, d: --- 48,52 ---- if not added_prop: message.Fields.Add(config.field_name, 5, prob) ! message.Update() except pythoncom.com_error, d: *************** *** 78,82 **** import manager mgr = manager.GetManager() ! import dialogs.ClassifyDialog d = dialogs.ClassifyDialog.ClassifyDialog(mgr, classifier) --- 78,82 ---- import manager mgr = manager.GetManager() ! import dialogs.ClassifyDialog d = dialogs.ClassifyDialog.ClassifyDialog(mgr, classifier) Index: filter.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/filter.py,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** filter.py 19 Oct 2002 16:23:37 -0000 1.2 --- filter.py 19 Oct 2002 18:14:01 -0000 1.3 *************** *** 21,25 **** if only_unread and not message.Unread: continue ! try: headers = message.Fields[0x7D001E].Value --- 21,25 ---- if only_unread and not message.Unread: continue ! try: headers = message.Fields[0x7D001E].Value *************** *** 46,50 **** return num_messages ! def filterer(mgr, progress, filter): if not filter.folder_ids: --- 46,50 ---- return num_messages ! def filterer(mgr, progress, filter): if not filter.folder_ids: *************** *** 69,73 **** import manager mgr = manager.GetManager() ! import dialogs.FilterDialog d = dialogs.FilterDialog.FilterArrivalsDialog(mgr, rule.Rule, filterer) --- 69,73 ---- import manager mgr = manager.GetManager() ! 
import dialogs.FilterDialog d = dialogs.FilterDialog.FilterArrivalsDialog(mgr, rule.Rule, filterer) Index: manager.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/manager.py,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** manager.py 19 Oct 2002 17:20:07 -0000 1.2 --- manager.py 19 Oct 2002 18:14:01 -0000 1.3 *************** *** 43,47 **** self.outlook = outlook os.chdir(cwd) ! self.LoadBayes() --- 43,47 ---- self.outlook = outlook os.chdir(cwd) ! self.LoadBayes() *************** *** 63,67 **** self._tls[thread.get_ident()]["outlook"] = existing return existing ! def LoadBayes(self): if not os.path.exists(self.ini_filename): --- 63,67 ---- self._tls[thread.get_ident()]["outlook"] = existing return existing ! def LoadBayes(self): if not os.path.exists(self.ini_filename): *************** *** 100,106 **** def InitNewBayes(self): ! os.environ["BAYESCUSTOMIZE"]=self.ini_filename ! self.bayes = classifier.Bayes() ! self.bayes_dirty = True def SaveBayes(self): --- 100,106 ---- def InitNewBayes(self): ! os.environ["BAYESCUSTOMIZE"]=self.ini_filename ! self.bayes = classifier.Bayes() ! self.bayes_dirty = True def SaveBayes(self): *************** *** 182,186 **** else: d(name, level+1) ! class _ConfigurationRoot(_ConfigurationContainer): def __init__(self): --- 182,186 ---- else: d(name, level+1) ! class _ConfigurationRoot(_ConfigurationContainer): def __init__(self): *************** *** 206,210 **** ) self.rules = [] ! _mgr = None --- 206,210 ---- ) self.rules = [] ! 
_mgr = None *************** *** 222,224 **** print "Error initializing Bayes manager" print d - --- 222,223 ---- Index: rule.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/rule.py,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** rule.py 19 Oct 2002 16:23:37 -0000 1.1 --- rule.py 19 Oct 2002 18:14:01 -0000 1.2 *************** *** 50,59 **** outlook_message.FlagStatus = constants.olFlagMarked outlook_message.Save() ! if self.write_field: format = 4 # 4=2 decimal, 3=1 decimal - index in "field chooser" combo when type=Number. prop = outlook_message.UserProperties.Add(self.write_field_name, constants.olNumber, True, format) prop.Value = prob outlook_message.Save() ! if self.action == "None": pass --- 50,59 ---- outlook_message.FlagStatus = constants.olFlagMarked outlook_message.Save() ! if self.write_field: format = 4 # 4=2 decimal, 3=1 decimal - index in "field chooser" combo when type=Number. prop = outlook_message.UserProperties.Add(self.write_field_name, constants.olNumber, True, format) prop.Value = prob outlook_message.Save() ! if self.action == "None": pass *************** *** 67,69 **** return True - --- 67,68 ---- Index: train.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/train.py,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** train.py 19 Oct 2002 16:23:37 -0000 1.2 --- train.py 19 Oct 2002 18:14:01 -0000 1.3 *************** *** 77,81 **** d.DoModal() ! mgr.Save() mgr.Close() --- 77,81 ---- d.DoModal() ! 
mgr.Save() mgr.Close() From tim_one@users.sourceforge.net Sat Oct 19 19:14:04 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Sat, 19 Oct 2002 11:14:04 -0700 Subject: [Spambayes-checkins] spambayes/Outlook2000/dialogs AsyncDialog.py,1.1,1.2 ClassifyDialog.py,1.1,1.2 FilterDialog.py,1.1,1.2 FolderSelector.py,1.1,1.2 RuleDialog.py,1.1,1.2 TrainingDialog.py,1.1,1.2 __init__.py,1.1,1.2 Message-ID: Update of /cvsroot/spambayes/spambayes/Outlook2000/dialogs In directory usw-pr-cvs1:/tmp/cvs-serv29154/dialogs Modified Files: AsyncDialog.py ClassifyDialog.py FilterDialog.py FolderSelector.py RuleDialog.py TrainingDialog.py __init__.py Log Message: Whitespace normalization. Index: AsyncDialog.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/dialogs/AsyncDialog.py,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** AsyncDialog.py 19 Oct 2002 16:24:13 -0000 1.1 --- AsyncDialog.py 19 Oct 2002 18:14:01 -0000 1.2 *************** *** 93,118 **** def StartProcess(self): ! if self.running: ! self.progress.request_stop() ! else: ! for id in self.disable_while_running_ids: ! self.GetDlgItem(id).EnableWindow(0) ! self.SetDlgItemText(IDC_START, self.process_stop_text) ! self.SetDlgItemText(IDC_PROGRESS_TEXT, "") ! self.GetDlgItem(IDC_PROGRESS).ShowWindow(win32con.SW_SHOW) ! # Local function for the thread target that notifies us when finished. ! def thread_target(h, progress): ! try: ! self.progress = progress ! self.seen_finished = False ! self.running = True ! self._DoProcess() ! finally: ! win32api.PostMessage(h, MYWM_FINISHED, self.progress.stop_requested()) ! self.running = False ! self.progress = None ! # back to the program :) ! import threading ! t = threading.Thread(target=thread_target, args =(self.GetSafeHwnd(), _Progress(self))) ! t.start() --- 93,118 ---- def StartProcess(self): ! if self.running: ! self.progress.request_stop() ! else: ! 
for id in self.disable_while_running_ids: ! self.GetDlgItem(id).EnableWindow(0) ! self.SetDlgItemText(IDC_START, self.process_stop_text) ! self.SetDlgItemText(IDC_PROGRESS_TEXT, "") ! self.GetDlgItem(IDC_PROGRESS).ShowWindow(win32con.SW_SHOW) ! # Local function for the thread target that notifies us when finished. ! def thread_target(h, progress): ! try: ! self.progress = progress ! self.seen_finished = False ! self.running = True ! self._DoProcess() ! finally: ! win32api.PostMessage(h, MYWM_FINISHED, self.progress.stop_requested()) ! self.running = False ! self.progress = None ! # back to the program :) ! import threading ! t = threading.Thread(target=thread_target, args =(self.GetSafeHwnd(), _Progress(self))) ! t.start() Index: ClassifyDialog.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/dialogs/ClassifyDialog.py,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** ClassifyDialog.py 19 Oct 2002 16:24:13 -0000 1.1 --- ClassifyDialog.py 19 Oct 2002 18:14:01 -0000 1.2 *************** *** 32,36 **** [STATIC, info_text, -1, ( 7, 6, 227, 16), cs ], [STATIC, classify_text, -1, ( 7, 29, 131, 11), cs ], ! [STATIC, "", IDC_STATIC_FOLDERS, ( 7, 40, 167, 12), cs | win32con.SS_SUNKEN | win32con.SS_LEFTNOWORDWRAP | win32con.SS_CENTERIMAGE], [BUTTON, '&Browse', IDC_BROWSE, (184, 40, 50, 14), cs | win32con.BS_PUSHBUTTON | win32con.WS_TABSTOP], --- 32,36 ---- [STATIC, info_text, -1, ( 7, 6, 227, 16), cs ], [STATIC, classify_text, -1, ( 7, 29, 131, 11), cs ], ! [STATIC, "", IDC_STATIC_FOLDERS, ( 7, 40, 167, 12), cs | win32con.SS_SUNKEN | win32con.SS_LEFTNOWORDWRAP | win32con.SS_CENTERIMAGE], [BUTTON, '&Browse', IDC_BROWSE, (184, 40, 50, 14), cs | win32con.BS_PUSHBUTTON | win32con.WS_TABSTOP], *************** *** 44,48 **** [BUTTON, 'Close', win32con.IDOK, (184, 109, 50, 14), cs | win32con.BS_DEFPUSHBUTTON | win32con.WS_TABSTOP], ! 
] disable_while_running_ids = [IDC_FIELDNAME, IDC_BROWSE, win32con.IDOK] --- 44,48 ---- [BUTTON, 'Close', win32con.IDOK, (184, 109, 50, 14), cs | win32con.BS_DEFPUSHBUTTON | win32con.WS_TABSTOP], ! ] disable_while_running_ids = [IDC_FIELDNAME, IDC_BROWSE, win32con.IDOK] Index: FilterDialog.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/dialogs/FilterDialog.py,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** FilterDialog.py 19 Oct 2002 16:24:13 -0000 1.1 --- FilterDialog.py 19 Oct 2002 18:14:01 -0000 1.2 *************** *** 31,35 **** parent.HookCommand(self.OnButAdd, idc_add) self.butAdd = parent.GetDlgItem(idc_add) ! if idc_remove is None: self.butRemove = None else: --- 31,35 ---- parent.HookCommand(self.OnButAdd, idc_add) self.butAdd = parent.GetDlgItem(idc_add) ! if idc_remove is None: self.butRemove = None else: *************** *** 40,44 **** parent.HookCommand(self.OnButEdit, idc_edit) self.butEdit = parent.GetDlgItem(idc_edit) ! self.Refresh() --- 40,44 ---- parent.HookCommand(self.OnButEdit, idc_edit) self.butEdit = parent.GetDlgItem(idc_edit) ! self.Refresh() *************** *** 142,146 **** self.SyncEnabledStates() index = self.GetSelectedRuleIndex() ! rule = copy.copy(self.rules[index]) d = RuleDialog.RuleDialog(rule, self.parent.mgr) --- 142,146 ---- self.SyncEnabledStates() index = self.GetSelectedRuleIndex() ! rule = copy.copy(self.rules[index]) d = RuleDialog.RuleDialog(rule, self.parent.mgr) *************** *** 175,179 **** [BUTTON, "&Edit...", IDC_BUT_EDIT, ( 179,151, 50, 14), cs | win32con.WS_TABSTOP | win32con.WS_DISABLED], ["SysTreeView32", None, IDC_LIST_RULES, ( 14, 52, 216, 95), treestyle | win32con.WS_TABSTOP], ! 
[BUTTON, '&Filter Now...', IDC_BUT_FILTERNOW, ( 7, 177, 50, 14), cs | win32con.BS_PUSHBUTTON | win32con.WS_TABSTOP], [BUTTON, 'Close', win32con.IDOK, (179, 177, 50, 14), cs | win32con.BS_DEFPUSHBUTTON | win32con.WS_TABSTOP], --- 175,179 ---- [BUTTON, "&Edit...", IDC_BUT_EDIT, ( 179,151, 50, 14), cs | win32con.WS_TABSTOP | win32con.WS_DISABLED], ["SysTreeView32", None, IDC_LIST_RULES, ( 14, 52, 216, 95), treestyle | win32con.WS_TABSTOP], ! [BUTTON, '&Filter Now...', IDC_BUT_FILTERNOW, ( 7, 177, 50, 14), cs | win32con.BS_PUSHBUTTON | win32con.WS_TABSTOP], [BUTTON, 'Close', win32con.IDOK, (179, 177, 50, 14), cs | win32con.BS_DEFPUSHBUTTON | win32con.WS_TABSTOP], *************** *** 185,189 **** self.filterer = filterer dialog.Dialog.__init__(self, self.dt) ! def OnInitDialog(self): self.list = RuleList(self, IDC_LIST_RULES, self.mgr.config.rules, self.rule_factory, IDC_BUT_NEW, IDC_BUT_DELETE, IDC_BUT_EDIT) --- 185,189 ---- self.filterer = filterer dialog.Dialog.__init__(self, self.dt) ! def OnInitDialog(self): self.list = RuleList(self, IDC_LIST_RULES, self.mgr.config.rules, self.rule_factory, IDC_BUT_NEW, IDC_BUT_DELETE, IDC_BUT_EDIT) *************** *** 252,256 **** ["msctls_progress32", '', IDC_PROGRESS, ( 10, 170, 227, 11), cs | win32con.WS_BORDER], [STATIC, '', IDC_PROGRESS_TEXT, ( 10, 186, 227, 10), cs ], ! [BUTTON, process_start_text, IDC_START, ( 7, 200, 60, 14), cs | win32con.BS_PUSHBUTTON | win32con.WS_TABSTOP], [BUTTON, 'Close', win32con.IDOK, (187, 200, 50, 14), cs | win32con.BS_DEFPUSHBUTTON | win32con.WS_TABSTOP], --- 252,256 ---- ["msctls_progress32", '', IDC_PROGRESS, ( 10, 170, 227, 11), cs | win32con.WS_BORDER], [STATIC, '', IDC_PROGRESS_TEXT, ( 10, 186, 227, 10), cs ], ! 
[BUTTON, process_start_text, IDC_START, ( 7, 200, 60, 14), cs | win32con.BS_PUSHBUTTON | win32con.WS_TABSTOP], [BUTTON, 'Close', win32con.IDOK, (187, 200, 50, 14), cs | win32con.BS_DEFPUSHBUTTON | win32con.WS_TABSTOP], *************** *** 263,267 **** self.rule_factory = rule_factory AsyncDialogBase.__init__ (self, self.dt) ! def OnInitDialog(self): self.list = RuleList(self, IDC_LIST_RULES, self.mgr.config.rules, self.rule_factory) --- 263,267 ---- self.rule_factory = rule_factory AsyncDialogBase.__init__ (self, self.dt) ! def OnInitDialog(self): self.list = RuleList(self, IDC_LIST_RULES, self.mgr.config.rules, self.rule_factory) Index: FolderSelector.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/dialogs/FolderSelector.py,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** FolderSelector.py 19 Oct 2002 16:24:13 -0000 1.1 --- FolderSelector.py 19 Oct 2002 18:14:01 -0000 1.2 *************** *** 84,88 **** self.next_item_id = 1 self.item_map = {} ! self.select_desc_noun = desc_noun self.select_desc_noun_suffix = desc_noun_suffix --- 84,88 ---- self.next_item_id = 1 self.item_map = {} ! self.select_desc_noun = desc_noun self.select_desc_noun_suffix = desc_noun_suffix *************** *** 157,165 **** yield info, spec return # single-hit yield. ! for info, spec in self._YieldAllChildren(): ! checked = (info[1] >> 12) - 1 ! if checked: ! yield info, spec def OnInitDialog (self): --- 157,165 ---- yield info, spec return # single-hit yield. ! for info, spec in self._YieldAllChildren(): ! checked = (info[1] >> 12) - 1 ! if checked: ! 
yield info, spec def OnInitDialog (self): Index: RuleDialog.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/dialogs/RuleDialog.py,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** RuleDialog.py 19 Oct 2002 16:24:13 -0000 1.1 --- RuleDialog.py 19 Oct 2002 18:14:01 -0000 1.2 *************** *** 51,55 **** [BUTTON, "Write spam score to field", IDC_WRITE_FIELD, (11,151,108, 15), csts | win32con.BS_AUTOCHECKBOX], [EDIT, "", IDC_FIELD_NAME, (120, 152, 59, 14), csts | win32con.ES_AUTOHSCROLL | win32con.WS_BORDER], ! [BUTTON, 'OK', win32con.IDOK, (129, 178, 50, 14), csts | win32con.BS_DEFPUSHBUTTON], [BUTTON, 'Cancel', win32con.IDCANCEL, (192, 178, 50, 14), csts | win32con.BS_PUSHBUTTON], --- 51,55 ---- [BUTTON, "Write spam score to field", IDC_WRITE_FIELD, (11,151,108, 15), csts | win32con.BS_AUTOCHECKBOX], [EDIT, "", IDC_FIELD_NAME, (120, 152, 59, 14), csts | win32con.ES_AUTOHSCROLL | win32con.WS_BORDER], ! [BUTTON, 'OK', win32con.IDOK, (129, 178, 50, 14), csts | win32con.BS_DEFPUSHBUTTON], [BUTTON, 'Cancel', win32con.IDCANCEL, (192, 178, 50, 14), csts | win32con.BS_PUSHBUTTON], *************** *** 61,65 **** self.folder_id = rule.folder_id dialog.Dialog.__init__ (self, self.dt) ! def OnInitDialog(self): rule = self.rule --- 61,65 ---- self.folder_id = rule.folder_id dialog.Dialog.__init__ (self, self.dt) ! def OnInitDialog(self): rule = self.rule *************** *** 72,76 **** edit.SetWindowText(rule.write_field_name) edit.EnableWindow(rule.write_field) ! self._InitSlider(IDC_SLIDER_HIGH, IDC_EDIT_HIGH) self._InitSlider(IDC_SLIDER_LOW, IDC_EDIT_LOW) --- 72,76 ---- edit.SetWindowText(rule.write_field_name) edit.EnableWindow(rule.write_field) ! self._InitSlider(IDC_SLIDER_HIGH, IDC_EDIT_HIGH) self._InitSlider(IDC_SLIDER_LOW, IDC_EDIT_LOW) *************** *** 140,144 **** idc_edit = IDC_EDIT_LOW self.SetDlgItemText(idc_edit, "%.2f" % (slider.GetPos() / 100.0)) ! 
def _InitSlider(self, idc_slider, idc_edit): slider = self.GetDlgItem(idc_slider) --- 140,144 ---- idc_edit = IDC_EDIT_LOW self.SetDlgItemText(idc_edit, "%.2f" % (slider.GetPos() / 100.0)) ! def _InitSlider(self, idc_slider, idc_edit): slider = self.GetDlgItem(idc_slider) *************** *** 168,172 **** setattr(rule, attr, val) return True ! def OnOK(self): rule = self.rule --- 168,172 ---- setattr(rule, attr, val) return True ! def OnOK(self): rule = self.rule *************** *** 213,217 **** mgr = Manager() mgr.mapi = mapi ! rule = Rule() --- 213,217 ---- mgr = Manager() mgr.mapi = mapi ! rule = Rule() Index: TrainingDialog.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/dialogs/TrainingDialog.py,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** TrainingDialog.py 19 Oct 2002 16:24:13 -0000 1.1 --- TrainingDialog.py 19 Oct 2002 18:14:01 -0000 1.2 *************** *** 44,48 **** [BUTTON, 'Close', win32con.IDOK, (184, 97, 50, 14), cs | win32con.BS_DEFPUSHBUTTON | win32con.WS_TABSTOP], ! ] disable_while_running_ids = [IDC_BROWSE_HAM, IDC_BROWSE_SPAM, win32con.IDOK] --- 44,48 ---- [BUTTON, 'Close', win32con.IDOK, (184, 97, 50, 14), cs | win32con.BS_DEFPUSHBUTTON | win32con.WS_TABSTOP], ! ] disable_while_running_ids = [IDC_BROWSE_HAM, IDC_BROWSE_SPAM, win32con.IDOK] *************** *** 102,106 **** finally: self.mgr.WorkerThreadEnding() ! ##if __name__=='__main__': ## d=TrainingDialog(None) --- 102,106 ---- finally: self.mgr.WorkerThreadEnding() ! ##if __name__=='__main__': ## d=TrainingDialog(None) Index: __init__.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/dialogs/__init__.py,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** __init__.py 19 Oct 2002 16:24:13 -0000 1.1 --- __init__.py 19 Oct 2002 18:14:01 -0000 1.2 *************** *** 1,2 **** ! 
# This package defines dialog boxes used by the main # SpamBayes Outlook 2k integration code. --- 1,2 ---- ! # This package defines dialog boxes used by the main # SpamBayes Outlook 2k integration code. From tim_one@users.sourceforge.net Sat Oct 19 23:30:05 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Sat, 19 Oct 2002 15:30:05 -0700 Subject: [Spambayes-checkins] spambayes/Outlook2000 classify.py,1.2,1.3 filter.py,1.3,1.4 manager.py,1.3,1.4 Message-ID: Update of /cvsroot/spambayes/spambayes/Outlook2000 In directory usw-pr-cvs1:/tmp/cvs-serv13987 Modified Files: classify.py filter.py manager.py Log Message: Changed from 1 to True in a few appropriate places, + a little whitespace fiddling. Index: classify.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/classify.py,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** classify.py 19 Oct 2002 18:14:01 -0000 1.2 --- classify.py 19 Oct 2002 22:30:02 -0000 1.3 *************** *** 31,35 **** text = headers + body ! prob, clues = hammie.score(text, evidence=1) added_prop = False try: --- 31,35 ---- text = headers + body ! prob, clues = hammie.score(text, evidence=True) added_prop = False try: Index: filter.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/filter.py,v retrieving revision 1.3 retrieving revision 1.4 diff -C2 -d -r1.3 -r1.4 *** filter.py 19 Oct 2002 18:14:01 -0000 1.3 --- filter.py 19 Oct 2002 22:30:02 -0000 1.4 *************** *** 31,35 **** continue ! prob, clues = hammie.score(text, evidence=1) did_this_message = False for rule in mgr.config.rules: --- 31,35 ---- continue ! 
prob, clues = hammie.score(text, evidence=True) did_this_message = False for rule in mgr.config.rules: Index: manager.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/manager.py,v retrieving revision 1.3 retrieving revision 1.4 diff -C2 -d -r1.3 -r1.4 *** manager.py 19 Oct 2002 18:14:01 -0000 1.3 --- manager.py 19 Oct 2002 22:30:02 -0000 1.4 *************** *** 24,28 **** class BayesManager: ! def __init__(self, config_base = "default", outlook = None, verbose = 1): self.verbose = verbose if not os.path.isabs(config_base): --- 24,28 ---- class BayesManager: ! def __init__(self, config_base="default", outlook=None, verbose=True): self.verbose = verbose if not os.path.isabs(config_base): From tim_one@users.sourceforge.net Sat Oct 19 23:30:05 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Sat, 19 Oct 2002 15:30:05 -0700 Subject: [Spambayes-checkins] spambayes/Outlook2000/dialogs RuleDialog.py,1.2,1.3 Message-ID: Update of /cvsroot/spambayes/spambayes/Outlook2000/dialogs In directory usw-pr-cvs1:/tmp/cvs-serv13987/dialogs Modified Files: RuleDialog.py Log Message: Changed from 1 to True in a few appropriate places, + a little whitespace fiddling. Index: RuleDialog.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/dialogs/RuleDialog.py,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** RuleDialog.py 19 Oct 2002 18:14:01 -0000 1.2 --- RuleDialog.py 19 Oct 2002 22:30:03 -0000 1.3 *************** *** 87,91 **** combo.AddString(s) if s == rule.action: sel_index = index ! index+=1 combo.SetCurSel(sel_index) return dialog.Dialog.OnInitDialog(self) --- 87,91 ---- combo.AddString(s) if s == rule.action: sel_index = index ! 
index += 1 combo.SetCurSel(sel_index) return dialog.Dialog.OnInitDialog(self) From tim_one@users.sourceforge.net Sat Oct 19 23:39:44 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Sat, 19 Oct 2002 15:39:44 -0700 Subject: [Spambayes-checkins] spambayes/Outlook2000 manager.py,1.4,1.5 Message-ID: Update of /cvsroot/spambayes/spambayes/Outlook2000 In directory usw-pr-cvs1:/tmp/cvs-serv17127 Modified Files: manager.py Log Message: Folded the very long lines. Index: manager.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/manager.py,v retrieving revision 1.4 retrieving revision 1.5 diff -C2 -d -r1.4 -r1.5 *** manager.py 19 Oct 2002 22:30:02 -0000 1.4 --- manager.py 19 Oct 2002 22:39:41 -0000 1.5 *************** *** 13,17 **** # Suck in CDO type lib ! win32com.client.gencache.EnsureModule('{3FA7DEA7-6438-101B-ACC1-00AA00423326}', 0, 1, 21, bForDemand = True) try: --- 13,18 ---- # Suck in CDO type lib ! win32com.client.gencache.EnsureModule('{3FA7DEA7-6438-101B-ACC1-00AA00423326}', ! 0, 1, 21, bForDemand=True) try: *************** *** 27,31 **** self.verbose = verbose if not os.path.isabs(config_base): ! config_base = os.path.join( os.path.dirname(this_filename), config_base) config_base = os.path.abspath(config_base) self.ini_filename = config_base + "_bayes_customize.ini" --- 28,33 ---- self.verbose = verbose if not os.path.isabs(config_base): ! config_base = os.path.join(os.path.dirname(this_filename), ! config_base) config_base = os.path.abspath(config_base) self.ini_filename = config_base + "_bayes_customize.ini" *************** *** 52,61 **** def WorkerThreadEnding(self): ! assert self._tls.has_key(thread.get_ident()), "WorkerThreadStarting hasn't been called for this thread" del self._tls[thread.get_ident()] pythoncom.CoUninitialize() def GetOutlookForCurrentThread(self): ! 
assert self._tls.has_key(thread.get_ident()), "WorkerThreadStarting hasn't been called for this thread" existing = self._tls[thread.get_ident()].get("outlook") if not existing: --- 54,65 ---- def WorkerThreadEnding(self): ! assert self._tls.has_key(thread.get_ident()), \ ! "WorkerThreadStarting hasn't been called for this thread" del self._tls[thread.get_ident()] pythoncom.CoUninitialize() def GetOutlookForCurrentThread(self): ! assert self._tls.has_key(thread.get_ident()), \ ! "WorkerThreadStarting hasn't been called for this thread" existing = self._tls[thread.get_ident()].get("outlook") if not existing: *************** *** 66,70 **** def LoadBayes(self): if not os.path.exists(self.ini_filename): ! raise ManagerError("The file '%s' must exist before the database '%s' can be opened or created" % (self.ini_filename, self.bayes_filename)) bayes = None try: --- 70,76 ---- def LoadBayes(self): if not os.path.exists(self.ini_filename): ! raise ManagerError("The file '%s' must exist before the " ! "database '%s' can be opened or created" % ( ! self.ini_filename, self.bayes_filename)) bayes = None try: *************** *** 81,85 **** bayes = self.bayes if self.verbose: ! print "Bayes database initialized with %d spam and %d good messages" % (bayes.nspam, bayes.nham) self.bayes = bayes self.bayes_dirty = False --- 87,92 ---- bayes = self.bayes if self.verbose: ! print ("Bayes database initialized with " ! "%d spam and %d good messages" % (bayes.nspam, bayes.nham)) self.bayes = bayes self.bayes_dirty = False *************** *** 89,98 **** ret = cPickle.load(open(self.config_filename,'rb')) if self.verbose > 1: ! print "Loaded configuration from '%s':" % (self.config_filename,) ret._dump() except (AttributeError, ImportError): ret = _ConfigurationRoot() if self.verbose > 1: ! 
print "FAILED to load configuration from '%s - using default:" % (self.config_filename,) import traceback traceback.print_exc() --- 96,106 ---- ret = cPickle.load(open(self.config_filename,'rb')) if self.verbose > 1: ! print "Loaded configuration from '%s':" % self.config_filename ret._dump() except (AttributeError, ImportError): ret = _ConfigurationRoot() if self.verbose > 1: ! print ("FAILED to load configuration from '%s " ! "- using default:" % self.config_filename) import traceback traceback.print_exc() *************** *** 107,111 **** bayes = self.bayes if self.verbose: ! print "Saving bayes database with %d spam and %d good messages" % (bayes.nspam, bayes.nham) print " ->", self.bayes_filename cPickle.dump(bayes, open(self.bayes_filename,"wb"), 1) --- 115,120 ---- bayes = self.bayes if self.verbose: ! print ("Saving bayes database with %d spam and %d good messages" % ! (bayes.nspam, bayes.nham)) print " ->", self.bayes_filename cPickle.dump(bayes, open(self.bayes_filename,"wb"), 1) From tim_one@users.sourceforge.net Sat Oct 19 23:47:02 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Sat, 19 Oct 2002 15:47:02 -0700 Subject: [Spambayes-checkins] spambayes/Outlook2000 manager.py,1.5,1.6 Message-ID: Update of /cvsroot/spambayes/spambayes/Outlook2000 In directory usw-pr-cvs1:/tmp/cvs-serv18953 Modified Files: manager.py Log Message: BayesManager.LoadConfig(): There's a bootstrapping gotcha here: the first time you run this, a default configuration .pck simply doesn't exist, and the program stops with IOError then. So added IOError to the list of exceptions this recognizes as "oops, better create one then!". 
Index: manager.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/manager.py,v retrieving revision 1.5 retrieving revision 1.6 diff -C2 -d -r1.5 -r1.6 *** manager.py 19 Oct 2002 22:39:41 -0000 1.5 --- manager.py 19 Oct 2002 22:47:00 -0000 1.6 *************** *** 98,102 **** print "Loaded configuration from '%s':" % self.config_filename ret._dump() ! except (AttributeError, ImportError): ret = _ConfigurationRoot() if self.verbose > 1: --- 98,102 ---- print "Loaded configuration from '%s':" % self.config_filename ret._dump() ! except (AttributeError, ImportError, IOError): ret = _ConfigurationRoot() if self.verbose > 1: From tim_one@users.sourceforge.net Sat Oct 19 23:48:02 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Sat, 19 Oct 2002 15:48:02 -0700 Subject: [Spambayes-checkins] spambayes/Outlook2000 manager.py,1.6,1.7 Message-ID: Update of /cvsroot/spambayes/spambayes/Outlook2000 In directory usw-pr-cvs1:/tmp/cvs-serv20252 Modified Files: manager.py Log Message: BayesManager.__init__: reverted my previous change of the verbose= default from True to 1. I see now it's used as an int more than as a bool. Index: manager.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/manager.py,v retrieving revision 1.6 retrieving revision 1.7 diff -C2 -d -r1.6 -r1.7 *** manager.py 19 Oct 2002 22:47:00 -0000 1.6 --- manager.py 19 Oct 2002 22:47:59 -0000 1.7 *************** *** 25,29 **** class BayesManager: ! def __init__(self, config_base="default", outlook=None, verbose=True): self.verbose = verbose if not os.path.isabs(config_base): --- 25,29 ---- class BayesManager: ! 
def __init__(self, config_base="default", outlook=None, verbose=1): self.verbose = verbose if not os.path.isabs(config_base): From tim_one@users.sourceforge.net Sat Oct 19 23:57:08 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Sat, 19 Oct 2002 15:57:08 -0700 Subject: [Spambayes-checkins] spambayes/Outlook2000 manager.py,1.7,1.8 Message-ID: Update of /cvsroot/spambayes/spambayes/Outlook2000 In directory usw-pr-cvs1:/tmp/cvs-serv23289 Modified Files: manager.py Log Message: LoadConfig(): Simplified the logic for creating a first-run config file. Arranged to explicitly close the pickle file if one already existed. Index: manager.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/manager.py,v retrieving revision 1.7 retrieving revision 1.8 diff -C2 -d -r1.7 -r1.8 *** manager.py 19 Oct 2002 22:47:59 -0000 1.7 --- manager.py 19 Oct 2002 22:57:06 -0000 1.8 *************** *** 94,102 **** def LoadConfig(self): try: ! ret = cPickle.load(open(self.config_filename,'rb')) if self.verbose > 1: print "Loaded configuration from '%s':" % self.config_filename ret._dump() ! except (AttributeError, ImportError, IOError): ret = _ConfigurationRoot() if self.verbose > 1: --- 94,110 ---- def LoadConfig(self): try: ! f = open(self.config_filename, 'rb') ! except IOError: ! if self.verbose: ! print ("Created new configuration file '%s'" % ! self.config_filename) ! return _ConfigurationRoot() ! ! try: ! ret = cPickle.load(f) if self.verbose > 1: print "Loaded configuration from '%s':" % self.config_filename ret._dump() ! 
except (AttributeError, ImportError): ret = _ConfigurationRoot() if self.verbose > 1: *************** *** 105,108 **** --- 113,117 ---- import traceback traceback.print_exc() + f.close() return ret From tim_one@users.sourceforge.net Sat Oct 19 23:58:01 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Sat, 19 Oct 2002 15:58:01 -0700 Subject: [Spambayes-checkins] spambayes/Outlook2000 manager.py,1.8,1.9 Message-ID: Update of /cvsroot/spambayes/spambayes/Outlook2000 In directory usw-pr-cvs1:/tmp/cvs-serv23851 Modified Files: manager.py Log Message: LoadConfig(): Moved the pickle file-close up to the logical place. Index: manager.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/manager.py,v retrieving revision 1.8 retrieving revision 1.9 diff -C2 -d -r1.8 -r1.9 *** manager.py 19 Oct 2002 22:57:06 -0000 1.8 --- manager.py 19 Oct 2002 22:57:59 -0000 1.9 *************** *** 103,106 **** --- 103,107 ---- try: ret = cPickle.load(f) + f.close() if self.verbose > 1: print "Loaded configuration from '%s':" % self.config_filename *************** *** 113,117 **** import traceback traceback.print_exc() - f.close() return ret --- 114,117 ---- From tim_one@users.sourceforge.net Sun Oct 20 00:00:21 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Sat, 19 Oct 2002 16:00:21 -0700 Subject: [Spambayes-checkins] spambayes/Outlook2000 .cvsignore,NONE,1.1 Message-ID: Update of /cvsroot/spambayes/spambayes/Outlook2000 In directory usw-pr-cvs1:/tmp/cvs-serv25450 Added Files: .cvsignore Log Message: Try to get CVS to stop whining about .pyc and .pck etc files. 
--- NEW FILE: .cvsignore --- *.pyc *.pyo *.db *.pik *.zip build Data From tim_one@users.sourceforge.net Sun Oct 20 00:01:23 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Sat, 19 Oct 2002 16:01:23 -0700 Subject: [Spambayes-checkins] spambayes/Outlook2000 .cvsignore,1.1,1.2 Message-ID: Update of /cvsroot/spambayes/spambayes/Outlook2000 In directory usw-pr-cvs1:/tmp/cvs-serv26118 Modified Files: .cvsignore Log Message: Added .pck to the list of filetypes to ignore. Index: .cvsignore =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/.cvsignore,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** .cvsignore 19 Oct 2002 23:00:18 -0000 1.1 --- .cvsignore 19 Oct 2002 23:01:21 -0000 1.2 *************** *** 3,6 **** --- 3,7 ---- *.db *.pik + *.pck *.zip build From tim_one@users.sourceforge.net Sun Oct 20 00:02:17 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Sat, 19 Oct 2002 16:02:17 -0700 Subject: [Spambayes-checkins] spambayes/Outlook2000/dialogs .cvsignore,NONE,1.1 Message-ID: Update of /cvsroot/spambayes/spambayes/Outlook2000/dialogs In directory usw-pr-cvs1:/tmp/cvs-serv26476 Added Files: .cvsignore Log Message: Make CVS stop whining about generated filetypes. --- NEW FILE: .cvsignore --- *.pyc *.pyo *.db *.pik *.pck *.zip build Data From tim_one@users.sourceforge.net Sun Oct 20 05:01:10 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Sat, 19 Oct 2002 21:01:10 -0700 Subject: [Spambayes-checkins] spambayes Tester.py,1.6,1.7 Message-ID: Update of /cvsroot/spambayes/spambayes In directory usw-pr-cvs1:/tmp/cvs-serv10765 Modified Files: Tester.py Log Message: false_positive_rate(), false_negative_rate(), unsure_rate(): return 0.0 instead of raising ZeroDivisionError if the base set is empty; e.g., if you *just* predict against ham in some test, there is no spam to count in the divisor for false_negative_rate(). 
Index: Tester.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Tester.py,v retrieving revision 1.6 retrieving revision 1.7 diff -C2 -d -r1.6 -r1.7 *** Tester.py 17 Oct 2002 06:23:13 -0000 1.6 --- Tester.py 20 Oct 2002 04:01:08 -0000 1.7 *************** *** 118,130 **** def false_positive_rate(self): """Percentage of ham mistakenly identified as spam, in 0.0..100.0.""" ! return self.nham_wrong * 1e2 / self.nham_tested def false_negative_rate(self): """Percentage of spam mistakenly identified as ham, in 0.0..100.0.""" ! return self.nspam_wrong * 1e2 / self.nspam_tested def unsure_rate(self): return ((self.nham_unsure + self.nspam_unsure) * 1e2 / ! (self.nham_tested + self.nspam_tested)) def false_positives(self): --- 118,130 ---- def false_positive_rate(self): """Percentage of ham mistakenly identified as spam, in 0.0..100.0.""" ! return self.nham_wrong * 1e2 / (self.nham_tested or 1) def false_negative_rate(self): """Percentage of spam mistakenly identified as ham, in 0.0..100.0.""" ! return self.nspam_wrong * 1e2 / (self.nspam_tested or 1) def unsure_rate(self): return ((self.nham_unsure + self.nspam_unsure) * 1e2 / ! ((self.nham_tested + self.nspam_tested) or 1)) def false_positives(self): From tim_one@users.sourceforge.net Sun Oct 20 06:19:50 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Sat, 19 Oct 2002 22:19:50 -0700 Subject: [Spambayes-checkins] spambayes TestDriver.py,1.26,1.27 Message-ID: Update of /cvsroot/spambayes/spambayes In directory usw-pr-cvs1:/tmp/cvs-serv29514 Modified Files: TestDriver.py Log Message: Added new Driver.set_classifier() method. This is useful if you want to run tests against a pre-existing classifier, as I've been doing in the background on fresh python.org data. 
Index: TestDriver.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/TestDriver.py,v retrieving revision 1.26 retrieving revision 1.27 diff -C2 -d -r1.26 -r1.27 *** TestDriver.py 18 Oct 2002 05:44:05 -0000 1.26 --- TestDriver.py 20 Oct 2002 05:19:48 -0000 1.27 *************** *** 2,6 **** # Optional: # # Set up a new base classifier for testing. ! # new_classifier() # # Run tests against (possibly variants of) this classifier. # Loop: --- 2,6 ---- # Optional: # # Set up a new base classifier for testing. ! # new_classifier(), or set_classifier() # # Run tests against (possibly variants of) this classifier. # Loop: *************** *** 136,141 **** def new_classifier(self): ! c = self.classifier = classifier.Bayes() ! self.tester = Tester.Test(c) self.trained_ham_hist = Hist() self.trained_spam_hist = Hist() --- 136,146 ---- def new_classifier(self): ! """Create and use a new, virgin classifier.""" ! self.set_classifier(classifier.Bayes()) ! ! def set_classifier(self, classifier): ! """Specify a classifier to be used for further testing.""" ! self.classifier = classifier ! self.tester = Tester.Test(classifier) self.trained_ham_hist = Hist() self.trained_spam_hist = Hist() From tim_one@users.sourceforge.net Sun Oct 20 06:52:52 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Sat, 19 Oct 2002 22:52:52 -0700 Subject: [Spambayes-checkins] spambayes/Outlook2000/dialogs ClassifyDialog.py,1.2,1.3 TrainingDialog.py,1.2,1.3 Message-ID: Update of /cvsroot/spambayes/spambayes/Outlook2000/dialogs In directory usw-pr-cvs1:/tmp/cvs-serv4367/dialogs Modified Files: ClassifyDialog.py TrainingDialog.py Log Message: It turns out that calling mapi.GetFolder() has the atrocious side effect of changing the working directory to the system MAPI directory on my box, and that screws up subsequent Python imports to no end. 
I'm bracketing the two GetFolder() calls that kill me all the time by getcwd/chdir pairs in this checkin, but there are more calls to this routine, and I don't like this code-duplication approach to the problem (it will just pop up again the next time someone adds a new GetFolder call). Mark? I don't yet have a good enough feel for the Grand Architecture to know how to do this right. Index: ClassifyDialog.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/dialogs/ClassifyDialog.py,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** ClassifyDialog.py 19 Oct 2002 18:14:01 -0000 1.2 --- ClassifyDialog.py 20 Oct 2002 05:52:50 -0000 1.3 *************** *** 1,2 **** --- 1,4 ---- + import os + from pywin.mfc import dialog import win32con *************** *** 63,66 **** --- 65,69 ---- def UpdateStatus(self): names = [] + cwd = os.getcwd() # mapi.GetFolder() switches to the system MAPI dir for eid in self.config.folder_ids: try: *************** *** 70,73 **** --- 73,77 ---- names.append(name) self.SetDlgItemText(IDC_STATIC_FOLDERS, "; ".join(names)) + os.chdir(cwd) def OnBrowse(self, id, code): Index: TrainingDialog.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/dialogs/TrainingDialog.py,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** TrainingDialog.py 19 Oct 2002 18:14:01 -0000 1.2 --- TrainingDialog.py 20 Oct 2002 05:52:50 -0000 1.3 *************** *** 1,2 **** --- 1,4 ---- + import os + from pywin.mfc import dialog import win32con *************** *** 63,66 **** --- 65,69 ---- def UpdateStatus(self): names = [] + cwd = os.getcwd() # mapi.GetFolder() switches to the system MAPI dir for eid in self.config.ham_folder_ids: try: *************** *** 79,82 **** --- 82,86 ---- names.append(name) self.SetDlgItemText(IDC_STATIC_SPAM, "; ".join(names)) + os.chdir(cwd) def 
OnBrowse(self, id, code): From mhammond@users.sourceforge.net Sun Oct 20 08:47:03 2002 From: mhammond@users.sourceforge.net (Mark Hammond) Date: Sun, 20 Oct 2002 00:47:03 -0700 Subject: [Spambayes-checkins] spambayes/Outlook2000/dialogs ManagerDialog.py,NONE,1.1 FilterDialog.py,1.2,1.3 Message-ID: Update of /cvsroot/spambayes/spambayes/Outlook2000/dialogs In directory usw-pr-cvs1:/tmp/cvs-serv30441/dialogs Modified Files: FilterDialog.py Added Files: ManagerDialog.py Log Message: Brand spanking new version that actually filters mail as they arrive (woo hooo) --- NEW FILE: ManagerDialog.py --- from pywin.mfc import dialog import win32con import commctrl import win32ui import win32api import pythoncom from DialogGlobals import * IDC_BUT_MOREINFO = 1024 IDC_BUT_DB = 1025 IDC_BUT_TRAIN = 1026 IDC_DB_STATUS = 1027 IDC_BUT_ENABLE_FILTER = 1028 IDC_BUT_FILTER = 1029 IDC_FILTER_STATUS = 1030 IDC_BUT_CLASSIFY = 1031 class ManagerDialog(dialog.Dialog): style = win32con.DS_MODALFRAME | win32con.WS_POPUP | win32con.WS_VISIBLE | win32con.WS_CAPTION | win32con.WS_SYSMENU | win32con.DS_SETFONT cs = win32con.WS_CHILD | win32con.WS_VISIBLE csts = cs | win32con.WS_TABSTOP filter_msg = "Filter the following folders as messages arrive" intro_msg = "This application filters out spam by continually learning the characteristics of email you recieve and filtering spam from your regular email. The system must be trained before it will be effective." training_intro = "Training is the process of giving examples of both good and bad email to the system so it can classify future email" filtering_intro = "Filtering is the process of deleting, moving or otherwise modifying messages based on their spam probability" classify_intro = "Classification is the process of adding properties to messages based on their Spam probability. Creating a property with the spam rating allows you to select the field using the Outlook Field Chooser." dt = [ # Dialog itself. 
["Anti-Spam", (0, 0, 242, 277), style, None, (8, "MS Sans Serif")], # Children [STATIC, intro_msg, -1, ( 7, 7, 228, 25), cs], [BUTTON, 'Details...', IDC_BUT_MOREINFO, (168, 33, 62, 14), csts | win32con.BS_PUSHBUTTON], [BUTTON, "Database and Training", -1, ( 7, 49, 228, 62), cs | win32con.BS_GROUPBOX], [STATIC, training_intro, -1, ( 15, 57, 215, 17), cs], [BUTTON, 'Database Options', IDC_BUT_DB, ( 15, 77, 62, 14), csts | win32con.BS_PUSHBUTTON | win32con.WS_DISABLED], [BUTTON, '&Training', IDC_BUT_TRAIN, (168, 77, 62, 14), csts | win32con.BS_PUSHBUTTON], [STATIC, "", IDC_DB_STATUS, ( 15, 95, 215, 12), cs | win32con.SS_LEFTNOWORDWRAP | win32con.SS_CENTERIMAGE | win32con.SS_SUNKEN], [BUTTON, "Filtering", -1, ( 7, 116, 228, 68), cs | win32con.BS_GROUPBOX], [STATIC, filtering_intro, -1, ( 15, 127, 215, 17), cs], [BUTTON, 'Enable &filtering', IDC_BUT_ENABLE_FILTER,(24, 147, 131, 11), csts | win32con.BS_AUTOCHECKBOX], [BUTTON, 'Define filters...', IDC_BUT_FILTER, (168, 144, 62, 14), csts | win32con.BS_PUSHBUTTON], [STATIC, "", IDC_FILTER_STATUS, ( 15, 162, 215, 12), cs | win32con.SS_LEFTNOWORDWRAP | win32con.SS_CENTERIMAGE | win32con.SS_SUNKEN], [BUTTON, "Classification", -1, ( 7, 188, 228, 61), cs | win32con.BS_GROUPBOX], [STATIC, classify_intro, -1, ( 15, 201, 215, 26), cs], [BUTTON, 'Classify...', IDC_BUT_CLASSIFY, (168, 228, 62, 14), csts | win32con.BS_PUSHBUTTON], [BUTTON, 'Close', win32con.IDOK, (168, 256, 62, 14), csts | win32con.BS_DEFPUSHBUTTON], ] def __init__(self, mgr, do_train, do_filter, do_classify): self.mgr = mgr self.do_train = do_train self.do_filter = do_filter self.do_classify = do_classify dialog.Dialog.__init__(self, self.dt) def OnInitDialog(self): self.HookCommand(self.OnButMoreInfo, IDC_BUT_MOREINFO) self.HookCommand(self.OnButDoSomething, IDC_BUT_TRAIN) self.HookCommand(self.OnButDoSomething, IDC_BUT_FILTER) self.HookCommand(self.OnButDoSomething, IDC_BUT_CLASSIFY) self.HookCommand(self.OnButEnableFilter, IDC_BUT_ENABLE_FILTER) 
self.UpdateControlStatus() return dialog.Dialog.OnInitDialog(self) def UpdateControlStatus(self): nspam = self.mgr.bayes.nspam nham = self.mgr.bayes.nham enable_buttons = nspam > 0 and nham > 0 if enable_buttons: db_status = "Database has %d good and %d spam messages" % (nham, nspam) else: db_status = "Database must be trained before use" for id in [IDC_BUT_FILTER, IDC_BUT_CLASSIFY, IDC_BUT_ENABLE_FILTER]: self.GetDlgItem(id).EnableWindow(enable_buttons) self.SetDlgItemText(IDC_DB_STATUS, db_status) if not enable_buttons: self.mgr.config.filter.enabled = False self.GetDlgItem(IDC_BUT_ENABLE_FILTER).SetCheck(0) return # Build a filter-status string self.GetDlgItem(IDC_BUT_ENABLE_FILTER).SetCheck(self.mgr.config.filter.enabled) names = [] for eid in self.mgr.config.filter.folder_ids: names.append(self.mgr.mapi.GetFolder(eid).Name.encode("ascii", "replace")) # count enabled rules num = len([r for r in self.mgr.config.rules if r.enabled ]) if num == 0: num_rules_text = " with no active rules" elif num == 1: num_rules_text = " with 1 active rule" else: num_rules_text = " with %d active rules" % (num,) if not names: status = "No folders are being filtered" elif len(names) == 1: status = "Filtering %s%s." % (names[0], num_rules_text) elif len(names) == 2: status = "Filtering %s;%s%s." % (names[0], names[1], num_rules_text) else: status = "Filtering %d folders%s." % (len(names), num_rules_text) self.SetDlgItemText(IDC_FILTER_STATUS, status) def OnButMoreInfo(self, id, code): if code == win32con.BN_CLICKED: self.MessageBox("Contributions of HTML code to display here would be welcome :)") def OnButDoSomething(self, id, code): if code == win32con.BN_CLICKED: if id == IDC_BUT_TRAIN: doer = self.do_train elif id == IDC_BUT_CLASSIFY: doer = self.do_classify elif id == IDC_BUT_FILTER: doer = self.do_filter else: raise RuntimeError, "Unknown button ID!" 
doer(self) self.UpdateControlStatus() def OnButEnableFilter(self, id, code): if code == win32con.BN_CLICKED: self.mgr.config.filter.enabled = self.GetDlgItem(IDC_BUT_ENABLE_FILTER).GetCheck()==1 def OnOK(self): return dialog.Dialog.OnOK(self) if __name__=='__main__': def doer(dlg): print "doing something" class Generic: pass mgr = Generic() mgr.bayes = Generic() mgr.bayes.nham = 20 mgr.bayes.nspam = 7 d = ManagerDialog(mgr, doer, doer, doer) d.DoModal() Index: FilterDialog.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/dialogs/FilterDialog.py,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** FilterDialog.py 19 Oct 2002 18:14:01 -0000 1.2 --- FilterDialog.py 20 Oct 2002 07:47:01 -0000 1.3 *************** *** 13,17 **** class RuleList: ! def __init__(self, parent, idc, rules, rule_factory, idc_add = None, idc_remove = None, idc_edit = None): self.parent = parent self.list = parent.GetDlgItem(idc) --- 13,19 ---- class RuleList: ! def __init__(self, parent, idc, rules, rule_factory, ! idc_add = None, idc_copy = None, idc_edit = None, idc_remove = None, ! idc_moveup = None, idc_movedown = None): self.parent = parent self.list = parent.GetDlgItem(idc) *************** *** 27,45 **** parent.HookNotify(self.OnTreeItemDoubleClick, commctrl.NM_DBLCLK) ! if idc_add is None: self.butAdd = None ! else: ! parent.HookCommand(self.OnButAdd, idc_add) ! self.butAdd = parent.GetDlgItem(idc_add) ! if idc_remove is None: self.butRemove = None ! else: ! parent.HookCommand(self.OnButRemove, idc_remove) ! self.butRemove = parent.GetDlgItem(idc_remove) ! if idc_edit is None: self.butEdit = None else: ! parent.HookCommand(self.OnButEdit, idc_edit) ! self.butEdit = parent.GetDlgItem(idc_edit) ! ! self.Refresh() def PushEnabledStates(self): --- 29,46 ---- parent.HookNotify(self.OnTreeItemDoubleClick, commctrl.NM_DBLCLK) ! self._HookButton(idc_add, "butAdd", self.OnButAdd) ! 
self._HookButton(idc_copy, "butCopy", self.OnButCopy) ! self._HookButton(idc_edit, "butEdit", self.OnButEdit) ! self._HookButton(idc_remove, "butRemove", self.OnButRemove) ! self._HookButton(idc_moveup, "butMoveUp", self.OnButMoveUp) ! self._HookButton(idc_movedown, "butMoveDown", self.OnButMoveDown) ! self.Refresh() ! def _HookButton(self, idc, attr, func): ! if idc is None: ! setattr(self, attr, None) else: ! self.parent.HookCommand(func, idc) ! setattr(self, attr, self.parent.GetDlgItem(idc)) def PushEnabledStates(self): *************** *** 112,115 **** --- 113,126 ---- if self.butRemove is not None: self.butRemove.EnableWindow(itemNew != 0) if self.butEdit is not None: self.butEdit.EnableWindow(itemNew != 0) + if self.butCopy is not None: self.butCopy.EnableWindow(itemNew != 0) + if itemNew: + index = self.GetSelectedRuleIndex() + if self.butMoveUp is not None: + self.butMoveUp.EnableWindow(index > 0) + if self.butMoveDown is not None: + self.butMoveDown.EnableWindow(index < len(self.rules)-1) + else: + self.butMoveUp.EnableWindow(False) + self.butMoveDown.EnableWindow(False) return 1 *************** *** 149,152 **** --- 160,196 ---- self.Refresh() + def OnButCopy(self, id, code): + if code == win32con.BN_CLICKED: + self.SyncEnabledStates() + index = self.GetSelectedRuleIndex() + + rule = copy.copy(self.rules[index]) + rule.name = "Copy of " + rule.name + d = RuleDialog.RuleDialog(rule, self.parent.mgr) + if d.DoModal()==win32con.IDOK: + self.rules.append(rule) + self.Refresh(len(self.rules)-1) + + def OnButMoveUp(self, id, code): + if code == win32con.BN_CLICKED: + self.SyncEnabledStates() + index = self.GetSelectedRuleIndex() + assert index > 0, "Can't move index zero up!" 
+ old = self.rules[index] + self.rules[index] = self.rules[index-1] + self.rules[index-1] = old + self.Refresh(index-1) + + def OnButMoveDown(self, id, code): + if code == win32con.BN_CLICKED: + self.SyncEnabledStates() + index = self.GetSelectedRuleIndex() + num = len(self.rules) + assert index < num-1, "Can't move last index down!" + old = self.rules[index] + self.rules[index] = self.rules[index+1] + self.rules[index+1] = old + self.Refresh(index+1) + IDC_FOLDER_NAMES=1024 IDC_BROWSE=1025 *************** *** 157,181 **** IDC_BUT_FILTERNOW=1030 IDC_BUT_UNREAD=1031 class FilterArrivalsDialog(dialog.Dialog): style = win32con.DS_MODALFRAME | win32con.WS_POPUP | win32con.WS_VISIBLE | win32con.WS_CAPTION | win32con.WS_SYSMENU | win32con.DS_SETFONT cs = win32con.WS_CHILD | win32con.WS_VISIBLE ! treestyle = cs | win32con.WS_BORDER | commctrl.TVS_CHECKBOXES | commctrl.TVS_DISABLEDRAGDROP | commctrl.TVS_SHOWSELALWAYS filter_msg = "Filter the following folders as messages arrive" dt = [ # Dialog itself. ! ["Filters", (0, 0, 244, 198), style, None, (8, "MS Sans Serif")], # Children [STATIC, filter_msg, -1, ( 8, 9, 168, 11), cs], ! [STATIC, "", IDC_FOLDER_NAMES, ( 7, 20, 172, 12), cs | win32con.SS_LEFTNOWORDWRAP | win32con.SS_CENTERIMAGE | win32con.SS_SUNKEN], ! [BUTTON, '&Browse', IDC_BROWSE, (187, 19, 50, 14), cs | win32con.BS_PUSHBUTTON | win32con.WS_TABSTOP], ! [BUTTON, "Enabled Rules", -1, ( 7, 40, 230, 130), cs | win32con.BS_GROUPBOX], ! [BUTTON, "&New...", IDC_BUT_NEW, ( 60, 151, 50, 14), cs | win32con.WS_TABSTOP], ! [BUTTON, "&Delete", IDC_BUT_DELETE, ( 119,151, 50, 14), cs | win32con.WS_TABSTOP | win32con.WS_DISABLED], ! [BUTTON, "&Edit...", IDC_BUT_EDIT, ( 179,151, 50, 14), cs | win32con.WS_TABSTOP | win32con.WS_DISABLED], ! ["SysTreeView32", None, IDC_LIST_RULES, ( 14, 52, 216, 95), treestyle | win32con.WS_TABSTOP], ! [BUTTON, '&Filter Now...', IDC_BUT_FILTERNOW, ( 7, 177, 50, 14), cs | win32con.BS_PUSHBUTTON | win32con.WS_TABSTOP], ! 
[BUTTON, 'Close', win32con.IDOK, (179, 177, 50, 14), cs | win32con.BS_DEFPUSHBUTTON | win32con.WS_TABSTOP], ] --- 201,235 ---- IDC_BUT_FILTERNOW=1030 IDC_BUT_UNREAD=1031 + IDC_BUT_COPY=1032 + IDC_BUT_MOVEUP=1033 + IDC_BUT_MOVEDOWN=1034 + class FilterArrivalsDialog(dialog.Dialog): style = win32con.DS_MODALFRAME | win32con.WS_POPUP | win32con.WS_VISIBLE | win32con.WS_CAPTION | win32con.WS_SYSMENU | win32con.DS_SETFONT cs = win32con.WS_CHILD | win32con.WS_VISIBLE ! csts = cs | win32con.WS_TABSTOP ! treestyle = csts | win32con.WS_BORDER | commctrl.TVS_CHECKBOXES | commctrl.TVS_DISABLEDRAGDROP | commctrl.TVS_SHOWSELALWAYS filter_msg = "Filter the following folders as messages arrive" dt = [ # Dialog itself. ! ["Filters", (0, 0, 249, 195), style, None, (8, "MS Sans Serif")], # Children [STATIC, filter_msg, -1, ( 8, 9, 168, 11), cs], ! [STATIC, "", IDC_FOLDER_NAMES, ( 7, 20, 175, 12), cs | win32con.SS_LEFTNOWORDWRAP | win32con.SS_CENTERIMAGE | win32con.SS_SUNKEN], ! [BUTTON, '&Browse', IDC_BROWSE, (190, 19, 50, 14), csts | win32con.BS_PUSHBUTTON], ! [BUTTON, "Enabled Rules", -1, ( 7, 40, 237, 130), cs | win32con.BS_GROUPBOX], ! ["SysTreeView32", None, IDC_LIST_RULES, ( 18, 52, 164, 95), treestyle], ! ! [BUTTON, "&New...", IDC_BUT_NEW, (190, 52, 50, 14), csts ], ! [BUTTON, "&Copy..", IDC_BUT_COPY, (190, 72, 50, 14), csts ], ! [BUTTON, "&Modify...", IDC_BUT_EDIT, (190, 92, 50, 14), csts | win32con.WS_DISABLED], ! [BUTTON, "&Delete", IDC_BUT_DELETE, (190, 112, 50, 14), csts | win32con.WS_DISABLED], ! [BUTTON, "Move &Up", IDC_BUT_MOVEUP, ( 15, 150, 73, 14), csts | win32con.WS_DISABLED], ! [BUTTON, "Move &Down", IDC_BUT_MOVEDOWN, (109, 150, 73, 14), csts | win32con.WS_DISABLED], ! ! [BUTTON, '&Filter Now...', IDC_BUT_FILTERNOW, ( 15, 175, 50, 14), csts | win32con.BS_PUSHBUTTON], ! [BUTTON, 'Close', win32con.IDOK, (190, 175, 50, 14), csts | win32con.BS_DEFPUSHBUTTON], ] *************** *** 187,191 **** def OnInitDialog(self): ! 
self.list = RuleList(self, IDC_LIST_RULES, self.mgr.config.rules, self.rule_factory, IDC_BUT_NEW, IDC_BUT_DELETE, IDC_BUT_EDIT) self.HookCommand(self.OnButBrowse, IDC_BROWSE) self.HookCommand(self.OnButFilterNow, IDC_BUT_FILTERNOW) --- 241,245 ---- def OnInitDialog(self): ! self.list = RuleList(self, IDC_LIST_RULES, self.mgr.config.rules, self.rule_factory, IDC_BUT_NEW, IDC_BUT_COPY, IDC_BUT_EDIT, IDC_BUT_DELETE, IDC_BUT_MOVEUP, IDC_BUT_MOVEDOWN) self.HookCommand(self.OnButBrowse, IDC_BROWSE) self.HookCommand(self.OnButFilterNow, IDC_BUT_FILTERNOW) *************** *** 194,197 **** --- 248,252 ---- def OnOK(self): + self.list.SyncEnabledStates() return dialog.Dialog.OnOK(self) *************** *** 299,302 **** --- 354,358 ---- def StartProcess(self): + self.list.SyncEnabledStates() return AsyncDialogBase.StartProcess(self) From mhammond@users.sourceforge.net Sun Oct 20 08:47:03 2002 From: mhammond@users.sourceforge.net (Mark Hammond) Date: Sun, 20 Oct 2002 00:47:03 -0700 Subject: [Spambayes-checkins] spambayes/Outlook2000 addin.py,NONE,1.1 config.py,NONE,1.1 classify.py,1.3,1.4 filter.py,1.4,1.5 manager.py,1.9,1.10rule.py,1.2,1.3 Message-ID: Update of /cvsroot/spambayes/spambayes/Outlook2000 In directory usw-pr-cvs1:/tmp/cvs-serv30441 Modified Files: classify.py filter.py manager.py rule.py Added Files: addin.py config.py Log Message: Brand spanking new version that actually filters mail as they arrive (woo hooo) --- NEW FILE: addin.py --- # Mark's Outlook addin import warnings warnings.filterwarnings("ignore", category=FutureWarning, append=1) # sick off the new hex() warnings! import sys from win32com import universal from win32com.server.exception import COMException from win32com.client import gencache, DispatchWithEvents, Dispatch import winerror import win32api import pythoncom from win32com.client import constants # If we are not running in a console, redirect all print statements to the # win32traceutil collector. 
# You can view output either from Pythonwin's "Tools->Trace Collector Debugging Tool", # or simply run "win32traceutil.py" from a command prompt. try: win32api.GetConsoleTitle() except win32api.error: # No console - redirect import win32traceutil print "Outlook Spam Addin module loading" # A lovely big block that attempts to catch the most common errors - COM objects not installed. try: # Support for COM objects we use. gencache.EnsureModule('{00062FFF-0000-0000-C000-000000000046}', 0, 9, 0, bForDemand=True) # Outlook 9 gencache.EnsureModule('{2DF8D04C-5BFA-101B-BDE5-00AA0044DE52}', 0, 2, 1, bForDemand=True) # Office 9 gencache.EnsureModule('{3FA7DEA7-6438-101B-ACC1-00AA00423326}', 0, 1, 21, bForDemand = True) # CDO # The TLB defiining the interfaces we implement universal.RegisterInterfaces('{AC0714F2-3D04-11D1-AE7D-00A0C90F26F4}', 0, 1, 0, ["_IDTExtensibility2"]) except pythoncom.com_error, (hr, msg, exc, arg): if __name__ != '__main__': # Error when not running as a script - eeek - just let it go. raise try: pythoncom.MakeIID("MAPI.Session") have_cdo = True except pythoncom.com_error: have_cdo = False print "This Addin requires that Outlook 2000 with CDO be installed on this machine." print if have_cdo: print "However, these appear to be installed. Error details:" print "COM Error 0x%x (%s)" % (hr, msg) if exc: print "Exception: %s" % (exc) print print "Sorry, I can't be more help, but I can't continue while I have this error." else: print "CDO is not currently installed. To install CDO, you must locate the" print "media from which you installed Outlook (such as Office 2000 CD or " print "sharepoint), re-run setup, select Outlook, enable CDO." print print "Please install CDO then attempt this registration again." sys.exit(1) # Whew - we seem to have all the COM support we need - let's rock! 
class ButtonEvent: def Init(self, handler, args = ()): self.handler = handler self.args = args def OnClick(self, button, cancel): self.handler(*self.args) class FolderItemsEvent: def Init(self, target, application, manager): self.application = application self.manager = manager self.target = target def OnItemAdd(self, item): if self.manager.config.filter.enabled: mapi_message = self.manager.mapi.GetMessage(item.EntryID) import filter num_rules = filter.filter_message(mapi_message, self.manager) print "%d Spam rules fired for message '%s'" % (num_rules, item.Subject.encode("ascii", "replace")) else: print "Spam filtering is disabled - ignoring new message" class OutlookAddin: _com_interfaces_ = ['_IDTExtensibility2'] _public_methods_ = [] _reg_clsctx_ = pythoncom.CLSCTX_INPROC_SERVER _reg_clsid_ = "{3556EDEE-FC91-4cf2-A0E4-7489747BAB10}" _reg_progid_ = "SpamBayes.OutlookAddin" _reg_policy_spec_ = "win32com.server.policy.EventHandlerPolicy" def __init__(self): self.folder_hooks = {} self.application = None def OnConnection(self, application, connectMode, addin, custom): print "SpamAddin - Connecting to Outlook" self.application = application # Create our bayes manager import manager self.manager = manager.GetManager() # ActiveExplorer may be none when started without a UI (eg, WinCE synchronisation) activeExplorer = application.ActiveExplorer() if activeExplorer is not None: bars = activeExplorer.CommandBars toolbar = bars.Item("Standard") item = toolbar.Controls.Add(Type=constants.msoControlButton, Temporary=True) # Hook events for the item item = self.toolbarButton = DispatchWithEvents(item, ButtonEvent) item.Init(manager.ShowManager, (self.manager,)) item.Caption="Anti-Spam" item.TooltipText = "Define anti-spam filters" item.Enabled = True # Create a notification hook for all folders we filter. 
self.UpdateFolderHooks() def UpdateFolderHooks(self): new_hooks = {} for mapi_folder in self.manager.BuildFolderList(self.manager.config.filter.folder_ids, self.manager.config.filter.include_sub): eid = mapi_folder.ID existing = self.folder_hooks.get(eid) if existing is None: folder = self.application.GetNamespace("MAPI").GetFolderFromID(eid) try: new_hook = DispatchWithEvents(folder.Items, FolderItemsEvent) except ValueError: print "WARNING: Folder '%s' can not hook events" % (folder.Name,) new_hook = None if new_hook is not None: new_hook.Init(folder, self.application, self.manager) new_hooks[eid] = new_hook print "Created new message hook for", folder.Name else: new_hooks[eid] = existing self.folder_hooks = new_hooks def OnDisconnection(self, mode, custom): print "SpamAddin - Disconnecting from Outlook" self.folder_hooks = None self.application = None if self.manager is not None: self.manager.Save() self.manager.Close() self.manager = None def OnAddInsUpdate(self, custom): print "SpamAddin - OnAddInsUpdate", custom def OnStartupComplete(self, custom): print "SpamAddin - OnStartupComplete", custom def OnBeginShutdown(self, custom): print "SpamAddin - OnBeginShutdown", custom def RegisterAddin(klass): import _winreg key = _winreg.CreateKey(_winreg.HKEY_CURRENT_USER, "Software\\Microsoft\\Office\\Outlook\\Addins") subkey = _winreg.CreateKey(key, klass._reg_progid_) _winreg.SetValueEx(subkey, "CommandLineSafe", 0, _winreg.REG_DWORD, 0) _winreg.SetValueEx(subkey, "LoadBehavior", 0, _winreg.REG_DWORD, 3) _winreg.SetValueEx(subkey, "Description", 0, _winreg.REG_SZ, klass._reg_progid_) _winreg.SetValueEx(subkey, "FriendlyName", 0, _winreg.REG_SZ, klass._reg_progid_) def UnregisterAddin(klass): import _winreg try: _winreg.DeleteKey(_winreg.HKEY_CURRENT_USER, "Software\\Microsoft\\Office\\Outlook\\Addins\\" + klass._reg_progid_) except WindowsError: pass if __name__ == '__main__': import win32com.server.register win32com.server.register.UseCommandLine(OutlookAddin) if 
"--unregister" in sys.argv: UnregisterAddin(OutlookAddin) else: RegisterAddin(OutlookAddin) --- NEW FILE: config.py --- # configuration stuff we persist via a pickle # Can't be defined in any module that may be used as "__main__" # or as a module. import pprint class _ConfigurationContainer: def __init__(self, **kw): self.__dict__.update(kw) # Crap state-loading code so when we load an early version of the pickle # any attributes in the new version are considered defaults. # XXX - I really really want a better scheme than pickles etc here :( def _update_from(self, dict): for name, val in dict.items(): updater = getattr(val, "_update_from", None) if updater is not None and self.__dict__.has_key(name): self.__dict__[name]._update_from(val.__dict__) else: self.__dict__[name] = val def __setstate__(self, state): self.__init__() # ensure any new/default values setup self._update_from(state) def _dump(self, thisname="", level=0): import pprint prefix = " " * level print "%s%s:" % (prefix, thisname) for name, ob in self.__dict__.items(): d = getattr(ob, "_dump", None) if d is None: print "%s %s: %s" % (prefix, name, pprint.pformat(ob)) else: d(name, level+1) class ConfigurationRoot(_ConfigurationContainer): def __init__(self): self.training = _ConfigurationContainer( ham_folder_ids = [], ham_include_sub = False, spam_folder_ids = [], spam_include_sub = False, ) self.classify = _ConfigurationContainer( folder_ids = [], include_sub = False, field_name = "SpamProb", ) self.filter = _ConfigurationContainer( folder_ids = [], include_sub = False, enabled = False, ) self.filter_now = _ConfigurationContainer( folder_ids = [], include_sub = False, only_unread = False, ) self.rules = [] if __name__=='__main__': print "Please run 'manager.py'" Index: classify.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/classify.py,v retrieving revision 1.3 retrieving revision 1.4 diff -C2 -d -r1.3 -r1.4 *** classify.py 19 
Oct 2002 22:30:02 -0000 1.3 --- classify.py 20 Oct 2002 07:47:00 -0000 1.4 *************** *** 13,17 **** def classify_folder( f, mgr, config, progress): ! hammie = Hammie(mgr.bayes) messages = f.Messages pythoncom.CoInitialize() # We are called on a different thread. --- 13,17 ---- def classify_folder( f, mgr, config, progress): ! hammie = mgr.hammie messages = f.Messages pythoncom.CoInitialize() # We are called on a different thread. *************** *** 30,35 **** body = message.Text.encode('ascii', 'replace') text = headers + body ! ! prob, clues = hammie.score(text, evidence=True) added_prop = False try: --- 30,34 ---- body = message.Text.encode('ascii', 'replace') text = headers + body ! prob = hammie.score(text, evidence=False) added_prop = False try: Index: filter.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/filter.py,v retrieving revision 1.4 retrieving revision 1.5 diff -C2 -d -r1.4 -r1.5 *** filter.py 19 Oct 2002 22:30:02 -0000 1.4 --- filter.py 20 Oct 2002 07:47:00 -0000 1.5 *************** *** 9,18 **** import rule ! from hammie import Hammie def filter_folder(f, mgr, progress, filter): only_unread = filter.only_unread - hammie = Hammie(mgr.bayes) num_messages = 0 for message in mgr.YieldMessageList(f): if progress.stop_requested(): --- 9,39 ---- import rule ! def filter_message(message, mgr): ! try: ! headers = message.Fields[0x7D001E].Value ! headers = headers.encode('ascii', 'replace') ! body = message.Text.encode('ascii', 'replace') ! text = headers + body ! except pythoncom.com_error, d: ! print "Failed to get a message: %s" % (d,) ! return ! ! prob = mgr.hammie.score(text, evidence=False) ! num_rules = 0 ! for rule in mgr.config.rules: ! if rule.enabled: ! try: ! if rule.Act(mgr, message, prob): ! num_rules += 1 ! except: ! print "Rule failed!" ! import traceback ! traceback.print_exc() ! 
return num_rules def filter_folder(f, mgr, progress, filter): only_unread = filter.only_unread num_messages = 0 + hammie = mgr.hammie for message in mgr.YieldMessageList(f): if progress.stop_requested(): *************** *** 21,47 **** if only_unread and not message.Unread: continue ! ! try: ! headers = message.Fields[0x7D001E].Value ! headers = headers.encode('ascii', 'replace') ! body = message.Text.encode('ascii', 'replace') ! text = headers + body ! except pythoncom.com_error, d: ! progress.warning("Failed to get a message: %s" % (str(d),) ) ! continue ! ! prob, clues = hammie.score(text, evidence=True) ! did_this_message = False ! for rule in mgr.config.rules: ! if rule.enabled: ! try: ! if rule.Act(mgr, message, prob): ! did_this_message = True ! except: ! print "Rule failed!" ! import traceback ! traceback.print_exc() ! if did_this_message: num_messages += 1 return num_messages --- 42,48 ---- if only_unread and not message.Unread: continue ! if filter_message(message, mgr): num_messages += 1 + return num_messages Index: manager.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/manager.py,v retrieving revision 1.9 retrieving revision 1.10 diff -C2 -d -r1.9 -r1.10 *** manager.py 19 Oct 2002 22:57:59 -0000 1.9 --- manager.py 20 Oct 2002 07:47:00 -0000 1.10 *************** *** 6,23 **** import thread - import classifier - from tokenizer import tokenize import win32com.client import win32com.client.gencache import pythoncom ! # Suck in CDO type lib ! win32com.client.gencache.EnsureModule('{3FA7DEA7-6438-101B-ACC1-00AA00423326}', ! 0, 1, 21, bForDemand=True) try: ! this_filename = __file__ except NameError: ! this_filename = sys.argv[0] class ManagerError(Exception): --- 6,36 ---- import thread import win32com.client import win32com.client.gencache import pythoncom ! import config try: ! this_filename = os.path.abspath(__file__) except NameError: ! this_filename = os.path.abspath(sys.argv[0]) ! 
! # This is a little of a hack . We are generally in a child directory of the ! # bayes code. To help installation, we handle the fact that this may not be ! # on sys.path. ! try: ! import classifier ! except ImportError: ! parent = os.path.abspath(os.path.join(os.path.dirname(this_filename), "..")) ! sys.path.insert(0, parent) ! del parent ! import classifier ! ! import hammie ! ! # Suck in CDO type lib ! win32com.client.gencache.EnsureModule('{3FA7DEA7-6438-101B-ACC1-00AA00423326}', ! 0, 1, 21, bForDemand=True) class ManagerError(Exception): *************** *** 75,78 **** --- 88,92 ---- bayes = None try: + os.environ["BAYESCUSTOMIZE"]=self.ini_filename bayes = cPickle.load(open(self.bayes_filename,'rb')) print "Loaded bayes database from '%s'" % (self.bayes_filename,) *************** *** 90,93 **** --- 104,108 ---- "%d spam and %d good messages" % (bayes.nspam, bayes.nham)) self.bayes = bayes + self.hammie = hammie.Hammie(bayes) self.bayes_dirty = False *************** *** 108,117 **** ret._dump() except (AttributeError, ImportError): ! ret = _ConfigurationRoot() if self.verbose > 1: ! print ("FAILED to load configuration from '%s " ! "- using default:" % self.config_filename) ! import traceback ! traceback.print_exc() return ret --- 123,136 ---- ret._dump() except (AttributeError, ImportError): ! ret = config.ConfigurationRoot() ! print "FAILED to load configuration from '%s' - using default:" % (self.config_filename,) ! import traceback ! traceback.print_exc() ! except IOError, details: ! # File-not-found - less serious. ! ret = config.ConfigurationRoot() if self.verbose > 1: ! # filename included in exception! ! print "IOError loading configuration (%s) - using default:" % (details) return ret *************** *** 183,241 **** message = messages.GetNext() - # configuration stuff we persist. 
- class _ConfigurationContainer: - def __init__(self, **kw): - self.__dict__.update(kw) - def __setstate__(self, state): - self.__init__() # ensure any new/default values setup - self.__dict__.update(state) - def _dump(self, thisname="", level=0): - import pprint - prefix = " " * level - print "%s%s:" % (prefix, thisname) - for name, ob in self.__dict__.items(): - d = getattr(ob, "_dump", None) - if d is None: - print "%s %s: %s" % (prefix, name, pprint.pformat(ob)) - else: - d(name, level+1) - - class _ConfigurationRoot(_ConfigurationContainer): - def __init__(self): - self.training = _ConfigurationContainer( - ham_folder_ids = [], - ham_include_sub = False, - spam_folder_ids = [], - spam_include_sub = False, - ) - self.classify = _ConfigurationContainer( - folder_ids = [], - include_sub = False, - field_name = "SpamProb", - ) - self.filter = _ConfigurationContainer( - folder_ids = [], - include_sub = False, - ) - self.filter_now = _ConfigurationContainer( - folder_ids = [], - include_sub = False, - only_unread = False, - ) - self.rules = [] - - _mgr = None ! def GetManager(): global _mgr if _mgr is None: ! _mgr = BayesManager() return _mgr ! if __name__=='__main__': try: ! mgr = BayesManager() except ManagerError, d: print "Error initializing Bayes manager" print d --- 202,264 ---- message = messages.GetNext() _mgr = None ! def GetManager(verbose=1): global _mgr if _mgr is None: ! _mgr = BayesManager(verbose=verbose) ! # If requesting greater verbosity, honour it ! if verbose > _mgr.verbose: ! _mgr.verbose = verbose return _mgr ! def ShowManager(mgr): ! def do_train(dlg): ! import train ! import dialogs.TrainingDialog ! d = dialogs.TrainingDialog.TrainingDialog(dlg.mgr, train.trainer) ! d.DoModal() ! ! def do_classify(dlg): ! import classify ! import dialogs.ClassifyDialog ! d = dialogs.ClassifyDialog.ClassifyDialog(dlg.mgr, classify.classifier) ! d.DoModal() ! ! def do_filter(dlg): ! import filter, rule ! import dialogs.FilterDialog ! 
d = dialogs.FilterDialog.FilterArrivalsDialog(dlg.mgr, rule.Rule, filter.filterer) ! d.DoModal() ! ! import dialogs.ManagerDialog ! d = dialogs.ManagerDialog.ManagerDialog(mgr, do_train, do_filter, do_classify) ! d.DoModal() ! ! def main(verbose_level = 1): try: ! mgr = GetManager(verbose=verbose_level) except ManagerError, d: print "Error initializing Bayes manager" print d + return 1 + ShowManager(mgr) + mgr.Save() + mgr.Close() + + def usage(): + print "Usage: manager [-v ...]" + sys.exit(1) + + if __name__=='__main__': + verbose = 1 + import getopt + opts, args = getopt.getopt(sys.argv[1:], "v") + if args: + usage() + for opt, val in opts: + if opt=="-v": + verbose += 1 + else: + usage() + main(verbose) Index: rule.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/rule.py,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** rule.py 19 Oct 2002 18:14:01 -0000 1.2 --- rule.py 20 Oct 2002 07:47:00 -0000 1.3 *************** *** 3,6 **** --- 3,8 ---- import time + MAPI_E_NOT_FOUND = -2147221233 + class Rule: def __init__(self): *************** *** 44,57 **** return False # Do mods before we move. outlook_ns = mgr.GetOutlookForCurrentThread().GetNamespace("MAPI") ! outlook_message = outlook_ns.GetItemFromID(msg.ID) if self.flag_message: outlook_message.FlagRequest = "Check Spam" outlook_message.FlagStatus = constants.olFlagMarked ! outlook_message.Save() ! if self.write_field: format = 4 # 4=2 decimal, 3=1 decimal - index in "field chooser" combo when type=Number. prop = outlook_message.UserProperties.Add(self.write_field_name, constants.olNumber, True, format) prop.Value = prob outlook_message.Save() --- 46,69 ---- return False # Do mods before we move. + dirty = False outlook_ns = mgr.GetOutlookForCurrentThread().GetNamespace("MAPI") ! try: ! outlook_message = outlook_ns.GetItemFromID(msg.ID) ! except pythoncom.com_error, (hr, desc, exc, arg): ! 
if not exc or exc[5] != MAPI_E_NOT_FOUND: ! raise ! print "Warning: Can't open the message - it has probably been moved" ! return False ! if self.flag_message: outlook_message.FlagRequest = "Check Spam" outlook_message.FlagStatus = constants.olFlagMarked ! dirty = True ! if self.write_field: format = 4 # 4=2 decimal, 3=1 decimal - index in "field chooser" combo when type=Number. prop = outlook_message.UserProperties.Add(self.write_field_name, constants.olNumber, True, format) prop.Value = prob + dirty = True + if dirty: outlook_message.Save() *************** *** 61,65 **** outlook_message.Copy(outlook_ns.GetFolderFromID(self.folder_id)) elif self.action == "Move": - print "moving", self.flag_message outlook_message.Move(outlook_ns.GetFolderFromID(self.folder_id)) else: --- 73,76 ---- From mhammond@users.sourceforge.net Sun Oct 20 08:50:38 2002 From: mhammond@users.sourceforge.net (Mark Hammond) Date: Sun, 20 Oct 2002 00:50:38 -0700 Subject: [Spambayes-checkins] spambayes/Outlook2000 classify.py,1.4,1.5 train.py,1.3,1.4 Message-ID: Update of /cvsroot/spambayes/spambayes/Outlook2000 In directory usw-pr-cvs1:/tmp/cvs-serv31716 Modified Files: classify.py train.py Log Message: Adjust imports slightly so we don't need bayes on sys.path when we start. 
Index: classify.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/classify.py,v retrieving revision 1.4 retrieving revision 1.5 diff -C2 -d -r1.4 -r1.5 *** classify.py 20 Oct 2002 07:47:00 -0000 1.4 --- classify.py 20 Oct 2002 07:50:36 -0000 1.5 *************** *** 8,14 **** import pythoncom import win32con - import classifier - from tokenizer import tokenize - from hammie import createbayes, Hammie def classify_folder( f, mgr, config, progress): --- 8,11 ---- Index: train.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/train.py,v retrieving revision 1.3 retrieving revision 1.4 diff -C2 -d -r1.3 -r1.4 *** train.py 19 Oct 2002 18:14:01 -0000 1.3 --- train.py 20 Oct 2002 07:50:36 -0000 1.4 *************** *** 9,16 **** import win32con - import classifier - from tokenizer import tokenize - def train_folder( f, isspam, mgr, progress): for message in mgr.YieldMessageList(f): if progress.stop_requested(): --- 9,14 ---- import win32con def train_folder( f, isspam, mgr, progress): + from tokenizer import tokenize for message in mgr.YieldMessageList(f): if progress.stop_requested(): From mhammond@users.sourceforge.net Sun Oct 20 09:28:01 2002 From: mhammond@users.sourceforge.net (Mark Hammond) Date: Sun, 20 Oct 2002 01:28:01 -0700 Subject: [Spambayes-checkins] spambayes/Outlook2000 addin.py,1.1,1.2 manager.py,1.10,1.11 Message-ID: Update of /cvsroot/spambayes/spambayes/Outlook2000 In directory usw-pr-cvs1:/tmp/cvs-serv10411 Modified Files: addin.py manager.py Log Message: Ensure addin and respond dynamically to changes made in the GUI (ie, list of folders to watch changes) Index: addin.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/addin.py,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** addin.py 20 Oct 2002 07:47:00 -0000 1.1 
--- addin.py 20 Oct 2002 08:27:58 -0000 1.2 *************** *** 73,76 **** --- 73,79 ---- class FolderItemsEvent: + def __del__(self): + print "Event dieing" + def Init(self, target, application, manager): self.application = application *************** *** 107,110 **** --- 110,115 ---- import manager self.manager = manager.GetManager() + assert self.manager.addin is None, "Should not already have an addin" + self.manager.addin = self # ActiveExplorer may be none when started without a UI (eg, WinCE synchronisation) *************** *** 121,127 **** item.Enabled = True # Create a notification hook for all folders we filter. self.UpdateFolderHooks() ! def UpdateFolderHooks(self): new_hooks = {} --- 126,135 ---- item.Enabled = True + self.FiltersChanged() + + def FiltersChanged(self): # Create a notification hook for all folders we filter. self.UpdateFolderHooks() ! def UpdateFolderHooks(self): new_hooks = {} *************** *** 139,145 **** new_hook.Init(folder, self.application, self.manager) new_hooks[eid] = new_hook ! print "Created new message hook for", folder.Name else: new_hooks[eid] = existing self.folder_hooks = new_hooks --- 147,156 ---- new_hook.Init(folder, self.application, self.manager) new_hooks[eid] = new_hook ! 
print "AntiSpam: Watching for new messages in folder", folder.Name else: new_hooks[eid] = existing + for k in self.folder_hooks.keys(): + if not new_hooks.has_key(k): + self.folder_hooks[k]._obj_.close() self.folder_hooks = new_hooks Index: manager.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/manager.py,v retrieving revision 1.10 retrieving revision 1.11 diff -C2 -d -r1.10 -r1.11 *** manager.py 20 Oct 2002 07:47:00 -0000 1.10 --- manager.py 20 Oct 2002 08:27:58 -0000 1.11 *************** *** 39,42 **** --- 39,43 ---- class BayesManager: def __init__(self, config_base="default", outlook=None, verbose=1): + self.addin = None self.verbose = verbose if not os.path.isabs(config_base): *************** *** 231,234 **** --- 232,237 ---- d = dialogs.FilterDialog.FilterArrivalsDialog(dlg.mgr, rule.Rule, filter.filterer) d.DoModal() + if dlg.mgr.addin is not None: + dlg.mgr.addin.FiltersChanged() import dialogs.ManagerDialog From mhammond@users.sourceforge.net Sun Oct 20 09:42:40 2002 From: mhammond@users.sourceforge.net (Mark Hammond) Date: Sun, 20 Oct 2002 01:42:40 -0700 Subject: [Spambayes-checkins] spambayes/Outlook2000 README.txt,1.2,1.3 Message-ID: Update of /cvsroot/spambayes/spambayes/Outlook2000 In directory usw-pr-cvs1:/tmp/cvs-serv14495 Modified Files: README.txt Log Message: Information on how to use this baby. Index: README.txt =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/README.txt,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** README.txt 19 Oct 2002 16:23:37 -0000 1.2 --- README.txt 20 Oct 2002 08:42:37 -0000 1.3 *************** *** 1,5 **** This directory contains tools for using the classifier with Microsoft Outlook 2000, courtesy of Sean True and Mark Hammond. Note that you need ! 
Python's win32com extensions (http://starship.python.net/crew/mhammond) ** NOTE ** - You also need CDO installed. This comes with Outlook 2k, but is --- 1,6 ---- This directory contains tools for using the classifier with Microsoft Outlook 2000, courtesy of Sean True and Mark Hammond. Note that you need ! Python's win32com extensions (http://starship.python.net/crew/mhammond) and ! to run the Outlook Addin you *must* have win32all-149 or later. ** NOTE ** - You also need CDO installed. This comes with Outlook 2k, but is *************** *** 8,17 **** compaining about "MAPI.Session", this is your problem. train.py Train a classifier from Outlook Mail folders. filter.py ! Moves and modifies msgs among Outlook Mail folders, based on classifier ! score. classify.py --- 9,62 ---- compaining about "MAPI.Session", this is your problem. + Outlook Addin + ========== + If you execute "addin.py", a Microsoft Outlook plugin will be installed. + Next time outlook is started, you should see a "Anti-Spam" button + on the toolbar. Clicking it will allow you to maintain your bayes database + and filters. + + All functionality in this package can be accessed from this plugin. This + directory contains a number of other files (see below) which can be + used to access features of the bayes database and filters from outside + of the Outlook environment. Either way, the functionality is the same. + + To see any output from the addin (eg, Python print statements) you can either + select "Tools->Trace Collector Debugging Tool" from inside Pythonwin, or just + execute win32traceutil.py (from the win32all extensions) from a Command + Prompt. + + NOTE: If the addin fails to load, Outlook will automatically disable it + for the next time Outlook starts. Re-executing 'addin.py' will ensure + the addin is enabled. + + Filtering + -------- + When running from Outlook, you can enable filtering for all mail that arrives + in your Inbox (or any other filter). 
Note that Outlook's builtin rules will + fire before this notification, and if these rules move the message, it will + never appear in the inbox (and thus will not get spam-filtered by a simple + Inbox filter) + + You can define any number of filters to apply, each performing a different + action or testing a different spam probability. You can enable and disable + any rules, and you can "bulk-apply" a filter to an entire folder. + + Note that the rule ordering can be important, as if early rules move + a message, later rules will not fire for that message. + + Command Line Tools + ------------------- + There are a number of scripts that invoke the same GUI as the + Outlook plugin. + + manager.py + Display the main dialog, which provides access to all other features. + train.py Train a classifier from Outlook Mail folders. filter.py ! Define filters, and allow a bulk-filter to be applied. (The outlook ! plugin must be running for filtering of new mail to occur) classify.py *************** *** 20,29 **** or used to change formatting of these messages. The field will appear in "user defined fields" - - Comments from Sean: - This code is extremely rudimentary. ! I am getting bad output saving very large classifiers in training.py. Somewhere over 4MB, they seem to stop working. --- 65,73 ---- or used to change formatting of these messages. The field will appear in "user defined fields" ! Misc Comments ! =========== ! Sean reports bad output saving very large classifiers in training.py. Somewhere over 4MB, they seem to stop working. From mhammond@users.sourceforge.net Sun Oct 20 12:58:23 2002 From: mhammond@users.sourceforge.net (Mark Hammond) Date: Sun, 20 Oct 2002 04:58:23 -0700 Subject: [Spambayes-checkins] spambayes/Outlook2000 addin.py,1.2,1.3 Message-ID: Update of /cvsroot/spambayes/spambayes/Outlook2000 In directory usw-pr-cvs1:/tmp/cvs-serv18275 Modified Files: addin.py Log Message: Just for tim, an option that shows the clues for the selected message. 
Index: addin.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/addin.py,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** addin.py 20 Oct 2002 08:27:58 -0000 1.2 --- addin.py 20 Oct 2002 11:58:20 -0000 1.3 *************** *** 62,65 **** --- 62,87 ---- sys.exit(1) + # Something that should be in win32com in some form or another. + def CastTo(ob, target): + """'Cast' a COM object to another type""" + if hasattr(target, "index"): # string like + # for now, we assume makepy for this to work. + if not ob.__class__.__dict__.get("CLSID"): # Eeek - no makepy support - try and build it. + ob = gencache.EnsureDispatch(ob) + if not ob.__class__.__dict__.get("CLSID"): + raise ValueError, "Must be a makepy-able object for this to work" + clsid = ob.CLSID + mod = gencache.GetModuleForCLSID(clsid) + # Get the 'root' module. + mod = gencache.GetModuleForTypelib(mod.CLSID, mod.LCID, mod.MajorVersion, mod.MinorVersion) + # Find the CLSID of the target + # XXX - should not be looking in VTables..., but no general map currently exists + target_clsid = mod.VTablesNamesToIIDMap.get(target) + mod = gencache.GetModuleForCLSID(target_clsid) + target_class = getattr(mod, target) + # resolve coclass to interface + target_class = getattr(target_class, "default_interface", target_class) + return target_class(ob) # auto QI magic happens + # Whew - we seem to have all the COM support we need - let's rock! 
*************** *** 90,93 **** --- 112,146 ---- print "Spam filtering is disabled - ignoring new message" + def ShowClues(mgr, app): + sel = app.ActiveExplorer().Selection + if sel.Count == 0: + win32ui.MessageBox("No items are selected", "No selection") + return + if sel.Count > 1: + win32ui.MessageBox("Please select a single item", "Large selection") + return + + item = sel.Item(1) + if item.Class != constants.olMail: + win32ui.MessageBox("This function can only be performed on mail items", "Not a mail message") + return + + mapi_message = mgr.mapi.GetMessage(item.EntryID) + headers = mapi_message.Fields[0x7D001E].Value + headers = headers.encode('ascii', 'replace') + body = mapi_message.Text.encode('ascii', 'replace') + text = headers + body + prob, clues = mgr.hammie.score(text, evidence=True) + + new_msg = app.CreateItem(0) + body = "

Calculated Probability: %.2f


" % (prob,) + body += "
" + mgr.hammie.formatclues(clues, "
") + "
" + new_msg.Subject = "Spam Clues: " + item.Subject + # Stupid outlook always switches to RTF :( Work-around + ## new_msg.Body = body + new_msg.HTMLBody = "" + body + "" + # Attach the source message to it + new_msg.Attachments.Add(item, constants.olEmbeddeditem, DisplayName="SPAM") + new_msg.Display() class OutlookAddin: *************** *** 102,105 **** --- 155,159 ---- self.folder_hooks = {} self.application = None + self.buttons = [] def OnConnection(self, application, connectMode, addin, custom): *************** *** 118,128 **** bars = activeExplorer.CommandBars toolbar = bars.Item("Standard") ! item = toolbar.Controls.Add(Type=constants.msoControlButton, Temporary=True) # Hook events for the item ! item = self.toolbarButton = DispatchWithEvents(item, ButtonEvent) item.Init(manager.ShowManager, (self.manager,)) ! item.Caption="Anti-Spam" item.TooltipText = "Define anti-spam filters" item.Enabled = True self.FiltersChanged() --- 172,198 ---- bars = activeExplorer.CommandBars toolbar = bars.Item("Standard") ! # Add a pop-up menu to the toolbar ! popup = toolbar.Controls.Add(Type=constants.msoControlPopup, Temporary=True) ! popup.Caption="Anti-Spam" ! popup.TooltipText = "Anti-Spam filters and functions" ! popup.Enabled = True ! popup = CastTo(popup, "CommandBarPopup") ! ! item = popup.Controls.Add(Type=constants.msoControlButton, Temporary=True) # Hook events for the item ! item = DispatchWithEvents(item, ButtonEvent) ! item.Init(ShowClues, (self.manager, application)) ! item.Caption="Show spam clues for current message" ! item.Enabled = True ! self.buttons.append(item) ! ! item = popup.Controls.Add(Type=constants.msoControlButton, Temporary=True) ! # Hook events for the item ! item = DispatchWithEvents(item, ButtonEvent) item.Init(manager.ShowManager, (self.manager,)) ! item.Caption="Options..." 
item.TooltipText = "Define anti-spam filters" item.Enabled = True + self.buttons.append(item) self.FiltersChanged() *************** *** 163,166 **** --- 233,237 ---- self.manager.Close() self.manager = None + self.buttons = None def OnAddInsUpdate(self, custom): *************** *** 170,173 **** --- 241,245 ---- def OnBeginShutdown(self, custom): print "SpamAddin - OnBeginShutdown", custom + def RegisterAddin(klass): From mhammond@users.sourceforge.net Sun Oct 20 13:20:54 2002 From: mhammond@users.sourceforge.net (Mark Hammond) Date: Sun, 20 Oct 2002 05:20:54 -0700 Subject: [Spambayes-checkins] spambayes/Outlook2000 addin.py,1.3,1.4 classify.py,1.5,1.6 filter.py,1.5,1.6 manager.py,1.11,1.12 Message-ID: Update of /cvsroot/spambayes/spambayes/Outlook2000 In directory usw-pr-cvs1:/tmp/cvs-serv27914 Modified Files: addin.py classify.py filter.py manager.py Log Message: Oops - it appears we must refetch the hammie if bayes changes. Index: addin.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/addin.py,v retrieving revision 1.3 retrieving revision 1.4 diff -C2 -d -r1.3 -r1.4 *** addin.py 20 Oct 2002 11:58:20 -0000 1.3 --- addin.py 20 Oct 2002 12:20:52 -0000 1.4 *************** *** 131,139 **** body = mapi_message.Text.encode('ascii', 'replace') text = headers + body ! prob, clues = mgr.hammie.score(text, evidence=True) new_msg = app.CreateItem(0) body = "

Calculated Probability: %.2f


" % (prob,) ! body += "
" + mgr.hammie.formatclues(clues, "
") + "
" new_msg.Subject = "Spam Clues: " + item.Subject # Stupid outlook always switches to RTF :( Work-around --- 131,140 ---- body = mapi_message.Text.encode('ascii', 'replace') text = headers + body ! hammie = mgr.MakeHammie() ! prob, clues = hammie.score(text, evidence=True) new_msg = app.CreateItem(0) body = "

Calculated Probability: %.2f


" % (prob,) ! body += "
" + hammie.formatclues(clues, "
") + "
" new_msg.Subject = "Spam Clues: " + item.Subject # Stupid outlook always switches to RTF :( Work-around Index: classify.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/classify.py,v retrieving revision 1.5 retrieving revision 1.6 diff -C2 -d -r1.5 -r1.6 *** classify.py 20 Oct 2002 07:50:36 -0000 1.5 --- classify.py 20 Oct 2002 12:20:52 -0000 1.6 *************** *** 10,14 **** def classify_folder( f, mgr, config, progress): ! hammie = mgr.hammie messages = f.Messages pythoncom.CoInitialize() # We are called on a different thread. --- 10,14 ---- def classify_folder( f, mgr, config, progress): ! hammie = mgr.MakeHammie() messages = f.Messages pythoncom.CoInitialize() # We are called on a different thread. Index: filter.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/filter.py,v retrieving revision 1.5 retrieving revision 1.6 diff -C2 -d -r1.5 -r1.6 *** filter.py 20 Oct 2002 07:47:00 -0000 1.5 --- filter.py 20 Oct 2002 12:20:52 -0000 1.6 *************** *** 19,23 **** return ! prob = mgr.hammie.score(text, evidence=False) num_rules = 0 for rule in mgr.config.rules: --- 19,24 ---- return ! hammie = mgr.MakeHammie() ! 
prob = hammie.score(text, evidence=False) num_rules = 0 for rule in mgr.config.rules: *************** *** 35,39 **** only_unread = filter.only_unread num_messages = 0 - hammie = mgr.hammie for message in mgr.YieldMessageList(f): if progress.stop_requested(): --- 36,39 ---- Index: manager.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/manager.py,v retrieving revision 1.11 retrieving revision 1.12 diff -C2 -d -r1.11 -r1.12 *** manager.py 20 Oct 2002 08:27:58 -0000 1.11 --- manager.py 20 Oct 2002 12:20:52 -0000 1.12 *************** *** 105,110 **** "%d spam and %d good messages" % (bayes.nspam, bayes.nham)) self.bayes = bayes - self.hammie = hammie.Hammie(bayes) self.bayes_dirty = False def LoadConfig(self): --- 105,112 ---- "%d spam and %d good messages" % (bayes.nspam, bayes.nham)) self.bayes = bayes self.bayes_dirty = False + + def MakeHammie(self): + return hammie.Hammie(self.bayes) def LoadConfig(self): From tim_one@users.sourceforge.net Sun Oct 20 17:10:24 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Sun, 20 Oct 2002 09:10:24 -0700 Subject: [Spambayes-checkins] spambayes/Outlook2000 addin.py,1.4,1.5 Message-ID: Update of /cvsroot/spambayes/spambayes/Outlook2000 In directory usw-pr-cvs1:/tmp/cvs-serv8051 Modified Files: addin.py Log Message: FutureWarning doesn't exist before 2.3, so don't try to access it from Pythons older than that. Index: addin.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/addin.py,v retrieving revision 1.4 retrieving revision 1.5 diff -C2 -d -r1.4 -r1.5 *** addin.py 20 Oct 2002 12:20:52 -0000 1.4 --- addin.py 20 Oct 2002 16:10:22 -0000 1.5 *************** *** 1,9 **** # Mark's Outlook addin ! import warnings - warnings.filterwarnings("ignore", category=FutureWarning, append=1) # sick off the new hex() warnings! ! 
import sys from win32com import universal --- 1,10 ---- # Mark's Outlook addin ! import sys import warnings ! if sys.version_info >= (2, 3): ! # sick off the new hex() warnings! ! warnings.filterwarnings("ignore", category=FutureWarning, append=1) from win32com import universal *************** *** 52,56 **** if exc: print "Exception: %s" % (exc) ! print print "Sorry, I can't be more help, but I can't continue while I have this error." else: --- 53,57 ---- if exc: print "Exception: %s" % (exc) ! print print "Sorry, I can't be more help, but I can't continue while I have this error." else: *************** *** 133,137 **** hammie = mgr.MakeHammie() prob, clues = hammie.score(text, evidence=True) ! new_msg = app.CreateItem(0) body = "

Calculated Probability: %.2f


" % (prob,) --- 134,138 ---- hammie = mgr.MakeHammie() prob, clues = hammie.score(text, evidence=True) ! new_msg = app.CreateItem(0) body = "

Calculated Probability: %.2f


" % (prob,) *************** *** 167,171 **** assert self.manager.addin is None, "Should not already have an addin" self.manager.addin = self ! # ActiveExplorer may be none when started without a UI (eg, WinCE synchronisation) activeExplorer = application.ActiveExplorer() --- 168,172 ---- assert self.manager.addin is None, "Should not already have an addin" self.manager.addin = self ! # ActiveExplorer may be none when started without a UI (eg, WinCE synchronisation) activeExplorer = application.ActiveExplorer() *************** *** 179,183 **** popup.Enabled = True popup = CastTo(popup, "CommandBarPopup") ! item = popup.Controls.Add(Type=constants.msoControlButton, Temporary=True) # Hook events for the item --- 180,184 ---- popup.Enabled = True popup = CastTo(popup, "CommandBarPopup") ! item = popup.Controls.Add(Type=constants.msoControlButton, Temporary=True) # Hook events for the item *************** *** 202,206 **** # Create a notification hook for all folders we filter. self.UpdateFolderHooks() ! def UpdateFolderHooks(self): new_hooks = {} --- 203,207 ---- # Create a notification hook for all folders we filter. self.UpdateFolderHooks() ! def UpdateFolderHooks(self): new_hooks = {} *************** *** 225,229 **** self.folder_hooks[k]._obj_.close() self.folder_hooks = new_hooks ! def OnDisconnection(self, mode, custom): print "SpamAddin - Disconnecting from Outlook" --- 226,230 ---- self.folder_hooks[k]._obj_.close() self.folder_hooks = new_hooks ! 
def OnDisconnection(self, mode, custom): print "SpamAddin - Disconnecting from Outlook" From tim_one@users.sourceforge.net Sun Oct 20 19:49:55 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Sun, 20 Oct 2002 11:49:55 -0700 Subject: [Spambayes-checkins] spambayes/Outlook2000 addin.py,1.5,1.6 Message-ID: Update of /cvsroot/spambayes/spambayes/Outlook2000 In directory usw-pr-cvs1:/tmp/cvs-serv5122 Modified Files: addin.py Log Message: ShowClues(): For diagnostic purposes, and especially under chi-combining, two digits after the decimal point aren't enough. Switched to %g formats. Also arranged to line up the words and spamprobs in even columns. I tried using an HTML table for this, but there was way too much vertical space, and I didn't feel like fiddling endlessly with that. It's easier and works better to keep it in a PRE section. Index: addin.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/addin.py,v retrieving revision 1.5 retrieving revision 1.6 diff -C2 -d -r1.5 -r1.6 *** addin.py 20 Oct 2002 16:10:22 -0000 1.5 --- addin.py 20 Oct 2002 18:49:53 -0000 1.6 *************** *** 136,141 **** new_msg = app.CreateItem(0) ! body = "

Calculated Probability: %.2f


" % (prob,) ! body += "
" + hammie.formatclues(clues, "
") + "
" new_msg.Subject = "Spam Clues: " + item.Subject # Stupid outlook always switches to RTF :( Work-around --- 136,151 ---- new_msg = app.CreateItem(0) ! body = ["

Calculated Probability: %g


" % prob] ! push = body.append ! push("
\n")
!     words = ['%r' % word for word, prob in clues]
!     probs = ['%g' % prob for word, prob in clues]
!     max_word_len = max(map(len, words))
!     for word, prob in zip(words, probs):
!         push(word + ' ' * (max_word_len - len(word)))
!         push(' ' + prob + '\n')
!     push("
\n") ! body = ''.join(body) ! new_msg.Subject = "Spam Clues: " + item.Subject # Stupid outlook always switches to RTF :( Work-around From tim_one@users.sourceforge.net Sun Oct 20 19:53:09 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Sun, 20 Oct 2002 11:53:09 -0700 Subject: [Spambayes-checkins] spambayes/Outlook2000/dialogs FilterDialog.py,1.3,1.4ManagerDialog.py,1.1,1.2 Message-ID: Update of /cvsroot/spambayes/spambayes/Outlook2000/dialogs In directory usw-pr-cvs1:/tmp/cvs-serv8818/dialogs Modified Files: FilterDialog.py ManagerDialog.py Log Message: Whitespace normalization. Index: FilterDialog.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/dialogs/FilterDialog.py,v retrieving revision 1.3 retrieving revision 1.4 diff -C2 -d -r1.3 -r1.4 *** FilterDialog.py 20 Oct 2002 07:47:01 -0000 1.3 --- FilterDialog.py 20 Oct 2002 18:53:06 -0000 1.4 *************** *** 164,168 **** self.SyncEnabledStates() index = self.GetSelectedRuleIndex() ! rule = copy.copy(self.rules[index]) rule.name = "Copy of " + rule.name --- 164,168 ---- self.SyncEnabledStates() index = self.GetSelectedRuleIndex() ! rule = copy.copy(self.rules[index]) rule.name = "Copy of " + rule.name *************** *** 221,225 **** [BUTTON, "Enabled Rules", -1, ( 7, 40, 237, 130), cs | win32con.BS_GROUPBOX], ["SysTreeView32", None, IDC_LIST_RULES, ( 18, 52, 164, 95), treestyle], ! [BUTTON, "&New...", IDC_BUT_NEW, (190, 52, 50, 14), csts ], [BUTTON, "&Copy..", IDC_BUT_COPY, (190, 72, 50, 14), csts ], --- 221,225 ---- [BUTTON, "Enabled Rules", -1, ( 7, 40, 237, 130), cs | win32con.BS_GROUPBOX], ["SysTreeView32", None, IDC_LIST_RULES, ( 18, 52, 164, 95), treestyle], ! 
[BUTTON, "&New...", IDC_BUT_NEW, (190, 52, 50, 14), csts ], [BUTTON, "&Copy..", IDC_BUT_COPY, (190, 72, 50, 14), csts ], *************** *** 229,233 **** [BUTTON, "Move &Up", IDC_BUT_MOVEUP, ( 15, 150, 73, 14), csts | win32con.WS_DISABLED], [BUTTON, "Move &Down", IDC_BUT_MOVEDOWN, (109, 150, 73, 14), csts | win32con.WS_DISABLED], ! [BUTTON, '&Filter Now...', IDC_BUT_FILTERNOW, ( 15, 175, 50, 14), csts | win32con.BS_PUSHBUTTON], [BUTTON, 'Close', win32con.IDOK, (190, 175, 50, 14), csts | win32con.BS_DEFPUSHBUTTON], --- 229,233 ---- [BUTTON, "Move &Up", IDC_BUT_MOVEUP, ( 15, 150, 73, 14), csts | win32con.WS_DISABLED], [BUTTON, "Move &Down", IDC_BUT_MOVEDOWN, (109, 150, 73, 14), csts | win32con.WS_DISABLED], ! [BUTTON, '&Filter Now...', IDC_BUT_FILTERNOW, ( 15, 175, 50, 14), csts | win32con.BS_PUSHBUTTON], [BUTTON, 'Close', win32con.IDOK, (190, 175, 50, 14), csts | win32con.BS_DEFPUSHBUTTON], Index: ManagerDialog.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/dialogs/ManagerDialog.py,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** ManagerDialog.py 20 Oct 2002 07:47:01 -0000 1.1 --- ManagerDialog.py 20 Oct 2002 18:53:06 -0000 1.2 *************** *** 26,30 **** filtering_intro = "Filtering is the process of deleting, moving or otherwise modifying messages based on their spam probability" classify_intro = "Classification is the process of adding properties to messages based on their Spam probability. Creating a property with the spam rating allows you to select the field using the Outlook Field Chooser." ! dt = [ # Dialog itself. --- 26,30 ---- filtering_intro = "Filtering is the process of deleting, moving or otherwise modifying messages based on their spam probability" classify_intro = "Classification is the process of adding properties to messages based on their Spam probability. 
Creating a property with the spam rating allows you to select the field using the Outlook Field Chooser." ! dt = [ # Dialog itself. *************** *** 45,49 **** [BUTTON, 'Define filters...', IDC_BUT_FILTER, (168, 144, 62, 14), csts | win32con.BS_PUSHBUTTON], [STATIC, "", IDC_FILTER_STATUS, ( 15, 162, 215, 12), cs | win32con.SS_LEFTNOWORDWRAP | win32con.SS_CENTERIMAGE | win32con.SS_SUNKEN], ! [BUTTON, "Classification", -1, ( 7, 188, 228, 61), cs | win32con.BS_GROUPBOX], [STATIC, classify_intro, -1, ( 15, 201, 215, 26), cs], --- 45,49 ---- [BUTTON, 'Define filters...', IDC_BUT_FILTER, (168, 144, 62, 14), csts | win32con.BS_PUSHBUTTON], [STATIC, "", IDC_FILTER_STATUS, ( 15, 162, 215, 12), cs | win32con.SS_LEFTNOWORDWRAP | win32con.SS_CENTERIMAGE | win32con.SS_SUNKEN], ! [BUTTON, "Classification", -1, ( 7, 188, 228, 61), cs | win32con.BS_GROUPBOX], [STATIC, classify_intro, -1, ( 15, 201, 215, 26), cs], *************** *** 129,133 **** if code == win32con.BN_CLICKED: self.mgr.config.filter.enabled = self.GetDlgItem(IDC_BUT_ENABLE_FILTER).GetCheck()==1 ! def OnOK(self): return dialog.Dialog.OnOK(self) --- 129,133 ---- if code == win32con.BN_CLICKED: self.mgr.config.filter.enabled = self.GetDlgItem(IDC_BUT_ENABLE_FILTER).GetCheck()==1 ! def OnOK(self): return dialog.Dialog.OnOK(self) From tim_one@users.sourceforge.net Sun Oct 20 19:53:08 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Sun, 20 Oct 2002 11:53:08 -0700 Subject: [Spambayes-checkins] spambayes/Outlook2000 addin.py,1.6,1.7 config.py,1.1,1.2 filter.py,1.6,1.7 manager.py,1.12,1.13 rule.py,1.3,1.4 Message-ID: Update of /cvsroot/spambayes/spambayes/Outlook2000 In directory usw-pr-cvs1:/tmp/cvs-serv8818 Modified Files: addin.py config.py filter.py manager.py rule.py Log Message: Whitespace normalization. 
Index: addin.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/addin.py,v retrieving revision 1.6 retrieving revision 1.7 diff -C2 -d -r1.6 -r1.7 *** addin.py 20 Oct 2002 18:49:53 -0000 1.6 --- addin.py 20 Oct 2002 18:53:06 -0000 1.7 *************** *** 65,87 **** # Something that should be in win32com in some form or another. def CastTo(ob, target): ! """'Cast' a COM object to another type""" ! if hasattr(target, "index"): # string like # for now, we assume makepy for this to work. ! if not ob.__class__.__dict__.get("CLSID"): # Eeek - no makepy support - try and build it. ! ob = gencache.EnsureDispatch(ob) ! if not ob.__class__.__dict__.get("CLSID"): ! raise ValueError, "Must be a makepy-able object for this to work" ! clsid = ob.CLSID ! mod = gencache.GetModuleForCLSID(clsid) ! # Get the 'root' module. ! mod = gencache.GetModuleForTypelib(mod.CLSID, mod.LCID, mod.MajorVersion, mod.MinorVersion) ! # Find the CLSID of the target ! # XXX - should not be looking in VTables..., but no general map currently exists ! target_clsid = mod.VTablesNamesToIIDMap.get(target) ! mod = gencache.GetModuleForCLSID(target_clsid) ! target_class = getattr(mod, target) ! # resolve coclass to interface ! target_class = getattr(target_class, "default_interface", target_class) ! return target_class(ob) # auto QI magic happens # Whew - we seem to have all the COM support we need - let's rock! --- 65,87 ---- # Something that should be in win32com in some form or another. def CastTo(ob, target): ! """'Cast' a COM object to another type""" ! if hasattr(target, "index"): # string like # for now, we assume makepy for this to work. ! if not ob.__class__.__dict__.get("CLSID"): # Eeek - no makepy support - try and build it. ! ob = gencache.EnsureDispatch(ob) ! if not ob.__class__.__dict__.get("CLSID"): ! raise ValueError, "Must be a makepy-able object for this to work" ! clsid = ob.CLSID ! 
mod = gencache.GetModuleForCLSID(clsid) ! # Get the 'root' module. ! mod = gencache.GetModuleForTypelib(mod.CLSID, mod.LCID, mod.MajorVersion, mod.MinorVersion) ! # Find the CLSID of the target ! # XXX - should not be looking in VTables..., but no general map currently exists ! target_clsid = mod.VTablesNamesToIIDMap.get(target) ! mod = gencache.GetModuleForCLSID(target_clsid) ! target_class = getattr(mod, target) ! # resolve coclass to interface ! target_class = getattr(target_class, "default_interface", target_class) ! return target_class(ob) # auto QI magic happens # Whew - we seem to have all the COM support we need - let's rock! Index: config.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/config.py,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** config.py 20 Oct 2002 07:47:00 -0000 1.1 --- config.py 20 Oct 2002 18:53:06 -0000 1.2 *************** *** 33,37 **** else: d(name, level+1) ! class ConfigurationRoot(_ConfigurationContainer): def __init__(self): --- 33,37 ---- else: d(name, level+1) ! class ConfigurationRoot(_ConfigurationContainer): def __init__(self): Index: filter.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/filter.py,v retrieving revision 1.6 retrieving revision 1.7 diff -C2 -d -r1.6 -r1.7 *** filter.py 20 Oct 2002 12:20:52 -0000 1.6 --- filter.py 20 Oct 2002 18:53:06 -0000 1.7 *************** *** 44,48 **** if filter_message(message, mgr): num_messages += 1 ! return num_messages --- 44,48 ---- if filter_message(message, mgr): num_messages += 1 ! 
return num_messages Index: manager.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/manager.py,v retrieving revision 1.12 retrieving revision 1.13 diff -C2 -d -r1.12 -r1.13 *** manager.py 20 Oct 2002 12:20:52 -0000 1.12 --- manager.py 20 Oct 2002 18:53:06 -0000 1.13 *************** *** 222,226 **** d = dialogs.TrainingDialog.TrainingDialog(dlg.mgr, train.trainer) d.DoModal() ! def do_classify(dlg): import classify --- 222,226 ---- d = dialogs.TrainingDialog.TrainingDialog(dlg.mgr, train.trainer) d.DoModal() ! def do_classify(dlg): import classify *************** *** 236,240 **** if dlg.mgr.addin is not None: dlg.mgr.addin.FiltersChanged() ! import dialogs.ManagerDialog d = dialogs.ManagerDialog.ManagerDialog(mgr, do_train, do_filter, do_classify) --- 236,240 ---- if dlg.mgr.addin is not None: dlg.mgr.addin.FiltersChanged() ! import dialogs.ManagerDialog d = dialogs.ManagerDialog.ManagerDialog(mgr, do_train, do_filter, do_classify) *************** *** 255,259 **** print "Usage: manager [-v ...]" sys.exit(1) ! if __name__=='__main__': verbose = 1 --- 255,259 ---- print "Usage: manager [-v ...]" sys.exit(1) ! if __name__=='__main__': verbose = 1 Index: rule.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/rule.py,v retrieving revision 1.3 retrieving revision 1.4 diff -C2 -d -r1.3 -r1.4 *** rule.py 20 Oct 2002 07:47:00 -0000 1.3 --- rule.py 20 Oct 2002 18:53:06 -0000 1.4 *************** *** 60,69 **** outlook_message.FlagStatus = constants.olFlagMarked dirty = True ! if self.write_field: format = 4 # 4=2 decimal, 3=1 decimal - index in "field chooser" combo when type=Number. prop = outlook_message.UserProperties.Add(self.write_field_name, constants.olNumber, True, format) prop.Value = prob dirty = True ! if dirty: outlook_message.Save() --- 60,69 ---- outlook_message.FlagStatus = constants.olFlagMarked dirty = True ! 
if self.write_field: format = 4 # 4=2 decimal, 3=1 decimal - index in "field chooser" combo when type=Number. prop = outlook_message.UserProperties.Add(self.write_field_name, constants.olNumber, True, format) prop.Value = prob dirty = True ! if dirty: outlook_message.Save() From tim_one@users.sourceforge.net Sun Oct 20 21:20:36 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Sun, 20 Oct 2002 13:20:36 -0700 Subject: [Spambayes-checkins] spambayes/Outlook2000 addin.py,1.7,1.8 Message-ID: Update of /cvsroot/spambayes/spambayes/Outlook2000 In directory usw-pr-cvs1:/tmp/cvs-serv19850 Modified Files: addin.py Log Message: ShowClues(): apply cgi.escape() to words (angle brackets were confusing Outlook's HTML display). Index: addin.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/addin.py,v retrieving revision 1.7 retrieving revision 1.8 diff -C2 -d -r1.7 -r1.8 *** addin.py 20 Oct 2002 18:53:06 -0000 1.7 --- addin.py 20 Oct 2002 20:20:34 -0000 1.8 *************** *** 114,117 **** --- 114,119 ---- def ShowClues(mgr, app): + from cgi import escape + sel = app.ActiveExplorer().Selection if sel.Count == 0: *************** *** 143,147 **** max_word_len = max(map(len, words)) for word, prob in zip(words, probs): ! push(word + ' ' * (max_word_len - len(word))) push(' ' + prob + '\n') push("\n") --- 145,149 ---- max_word_len = max(map(len, words)) for word, prob in zip(words, probs): ! 
push(escape(word) + ' ' * (max_word_len - len(word))) push(' ' + prob + '\n') push("\n") From mhammond@users.sourceforge.net Mon Oct 21 00:51:06 2002 From: mhammond@users.sourceforge.net (Mark Hammond) Date: Sun, 20 Oct 2002 16:51:06 -0700 Subject: [Spambayes-checkins] spambayes/Outlook2000 addin.py,1.8,1.9 classify.py,1.6,1.7 filter.py,1.7,1.8 manager.py,1.13,1.14 train.py,1.4,1.5 Message-ID: Update of /cvsroot/spambayes/spambayes/Outlook2000 In directory usw-pr-cvs1:/tmp/cvs-serv8686 Modified Files: addin.py classify.py filter.py manager.py train.py Log Message: Standardize where messages are pulled apart into a text stream so everyone is consistent. Append *both* the HTML body and the plain text body to the stream (some spam has the payload in *both*) Also change the way we attach the original message in the "Spam Clues" so it can effectively be mailed (the old code kept a reference to the FQN of Tim's .pst file ) Index: addin.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/addin.py,v retrieving revision 1.8 retrieving revision 1.9 diff -C2 -d -r1.8 -r1.9 *** addin.py 20 Oct 2002 20:20:34 -0000 1.8 --- addin.py 20 Oct 2002 23:51:04 -0000 1.9 *************** *** 130,140 **** mapi_message = mgr.mapi.GetMessage(item.EntryID) ! headers = mapi_message.Fields[0x7D001E].Value ! headers = headers.encode('ascii', 'replace') ! body = mapi_message.Text.encode('ascii', 'replace') ! text = headers + body hammie = mgr.MakeHammie() ! prob, clues = hammie.score(text, evidence=True) ! new_msg = app.CreateItem(0) body = ["

Calculated Probability: %g


" % prob] --- 130,136 ---- mapi_message = mgr.mapi.GetMessage(item.EntryID) ! stream = mgr.GetBayesStreamForMessage(mapi_message) hammie = mgr.MakeHammie() ! prob, clues = hammie.score(stream, evidence=True) new_msg = app.CreateItem(0) body = ["

Calculated Probability: %g


" % prob] *************** *** 155,159 **** new_msg.HTMLBody = "" + body + "" # Attach the source message to it ! new_msg.Attachments.Add(item, constants.olEmbeddeditem, DisplayName="SPAM") new_msg.Display() --- 151,155 ---- new_msg.HTMLBody = "" + body + "" # Attach the source message to it ! new_msg.Attachments.Add(item, constants.olByValue, DisplayName="Original Message") new_msg.Display() Index: classify.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/classify.py,v retrieving revision 1.6 retrieving revision 1.7 diff -C2 -d -r1.6 -r1.7 *** classify.py 20 Oct 2002 12:20:52 -0000 1.6 --- classify.py 20 Oct 2002 23:51:04 -0000 1.7 *************** *** 23,31 **** try: progress.tick() ! headers = message.Fields[0x7D001E].Value ! headers = headers.encode('ascii', 'replace') ! body = message.Text.encode('ascii', 'replace') ! text = headers + body ! prob = hammie.score(text, evidence=False) added_prop = False try: --- 23,28 ---- try: progress.tick() ! stream = mgr.GetBayesStreamForMessage(message) ! prob = hammie.score(stream, evidence=False) added_prop = False try: Index: filter.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/filter.py,v retrieving revision 1.7 retrieving revision 1.8 diff -C2 -d -r1.7 -r1.8 *** filter.py 20 Oct 2002 18:53:06 -0000 1.7 --- filter.py 20 Oct 2002 23:51:04 -0000 1.8 *************** *** 11,18 **** def filter_message(message, mgr): try: ! headers = message.Fields[0x7D001E].Value ! headers = headers.encode('ascii', 'replace') ! body = message.Text.encode('ascii', 'replace') ! text = headers + body except pythoncom.com_error, d: print "Failed to get a message: %s" % (d,) --- 11,15 ---- def filter_message(message, mgr): try: ! 
stream = mgr.GetBayesStreamForMessage(message) except pythoncom.com_error, d: print "Failed to get a message: %s" % (d,) *************** *** 20,24 **** hammie = mgr.MakeHammie() ! prob = hammie.score(text, evidence=False) num_rules = 0 for rule in mgr.config.rules: --- 17,21 ---- hammie = mgr.MakeHammie() ! prob = hammie.score(stream, evidence=False) num_rules = 0 for rule in mgr.config.rules: Index: manager.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/manager.py,v retrieving revision 1.13 retrieving revision 1.14 diff -C2 -d -r1.13 -r1.14 *** manager.py 20 Oct 2002 18:53:06 -0000 1.13 --- manager.py 20 Oct 2002 23:51:04 -0000 1.14 *************** *** 82,85 **** --- 82,98 ---- return existing + def GetBayesStreamForMessage(self, message): + # Note - caller must catch COM error + headers = message.Fields[0x7D001E].Value + headers = headers.encode('ascii', 'replace') + try: + body = message.Fields[0x1013001E].Value # HTMLBody field + body = body.encode("ascii", "replace") + "\n" + except pythoncom.error: + body = "" + body += message.Text.encode("ascii", "replace") + return headers + body + + def LoadBayes(self): if not os.path.exists(self.ini_filename): *************** *** 117,121 **** print ("Created new configuration file '%s'" % self.config_filename) ! return _ConfigurationRoot() try: --- 130,134 ---- print ("Created new configuration file '%s'" % self.config_filename) ! return config.ConfigurationRoot() try: Index: train.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/train.py,v retrieving revision 1.4 retrieving revision 1.5 diff -C2 -d -r1.4 -r1.5 *** train.py 20 Oct 2002 07:50:36 -0000 1.4 --- train.py 20 Oct 2002 23:51:04 -0000 1.5 *************** *** 1,4 **** # Train a classifier from Outlook Mail folders ! # Author: Sean D. 
True, WebReply.Com # October, 2002 # Copyright PSF, license under the PSF license --- 1,4 ---- # Train a classifier from Outlook Mail folders ! # Author: Sean D. True, WebReply.Com, Mark Hammond # October, 2002 # Copyright PSF, license under the PSF license *************** *** 11,14 **** --- 11,15 ---- def train_folder( f, isspam, mgr, progress): from tokenizer import tokenize + num = 0 for message in mgr.YieldMessageList(f): if progress.stop_requested(): *************** *** 18,29 **** # work with MAPI until we work out how to get headers from outlook message = mgr.mapi.GetMessage(message.ID) ! headers = message.Fields[0x7D001E].Value ! headers = headers.encode('ascii', 'replace') ! body = message.Text.encode('ascii', 'replace') ! except pythoncom.com_error: progress.warning("failed to get a message") continue ! text = headers + body ! mgr.bayes.learn(tokenize(text), isspam, False) # Called back from the dialog to do the actual training. --- 19,30 ---- # work with MAPI until we work out how to get headers from outlook message = mgr.mapi.GetMessage(message.ID) ! stream = mgr.GetBayesStreamForMessage(message) ! except pythoncom.com_error, d: progress.warning("failed to get a message") + print "Failed to get a message", d continue ! mgr.bayes.learn(tokenize(stream), isspam, False) ! num += 1 ! print "Trained over", num, "in folder", f.Name # Called back from the dialog to do the actual training. From mhammond@users.sourceforge.net Mon Oct 21 02:38:13 2002 From: mhammond@users.sourceforge.net (Mark Hammond) Date: Sun, 20 Oct 2002 18:38:13 -0700 Subject: [Spambayes-checkins] spambayes/Outlook2000 README.txt,1.3,1.4 Message-ID: Update of /cvsroot/spambayes/spambayes/Outlook2000 In directory usw-pr-cvs1:/tmp/cvs-serv25166 Modified Files: README.txt Log Message: Few more notes. 
Index: README.txt =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/README.txt,v retrieving revision 1.3 retrieving revision 1.4 diff -C2 -d -r1.3 -r1.4 *** README.txt 20 Oct 2002 08:42:37 -0000 1.3 --- README.txt 21 Oct 2002 01:38:10 -0000 1.4 *************** *** 5,16 **** ** NOTE ** - You also need CDO installed. This comes with Outlook 2k, but is ! not installed by default. You may need to find your Office 2000 CD, select ! Add/Remove components, and find CDO under Outlook. If you see a COM error ! compaining about "MAPI.Session", this is your problem. Outlook Addin ========== ! If you execute "addin.py", a Microsoft Outlook plugin will be installed. ! Next time outlook is started, you should see a "Anti-Spam" button on the toolbar. Clicking it will allow you to maintain your bayes database and filters. --- 5,17 ---- ** NOTE ** - You also need CDO installed. This comes with Outlook 2k, but is ! not installed by default. Attempting to install the add-in will detect this ! situation, and print instructions how to install CDO. Note however that ! running the stand-alone scripts (see below) will generally just print the raw ! Python exception - generally a semi-incomprehensible COM exception. Outlook Addin ========== ! If you execute "addin.py", the Microsoft Outlook plugin will be installed. ! Next time outlook is started, you should see a "Anti-Spam" drop-down on the toolbar. Clicking it will allow you to maintain your bayes database and filters. *************** *** 19,23 **** directory contains a number of other files (see below) which can be used to access features of the bayes database and filters from outside ! of the Outlook environment. Either way, the functionality is the same. 
To see any output from the addin (eg, Python print statements) you can either --- 20,25 ---- directory contains a number of other files (see below) which can be used to access features of the bayes database and filters from outside ! of the Outlook environment. Either way, the functionality is the same (except ! filtering of new mail obviously only works in the Outlook environment) To see any output from the addin (eg, Python print statements) you can either *************** *** 28,47 **** NOTE: If the addin fails to load, Outlook will automatically disable it for the next time Outlook starts. Re-executing 'addin.py' will ensure ! the addin is enabled. Filtering -------- When running from Outlook, you can enable filtering for all mail that arrives ! in your Inbox (or any other filter). Note that Outlook's builtin rules will ! fire before this notification, and if these rules move the message, it will never appear in the inbox (and thus will not get spam-filtered by a simple ! Inbox filter) You can define any number of filters to apply, each performing a different action or testing a different spam probability. You can enable and disable ! any rules, and you can "bulk-apply" a filter to an entire folder. Note that the rule ordering can be important, as if early rules move ! a message, later rules will not fire for that message. Command Line Tools --- 30,53 ---- NOTE: If the addin fails to load, Outlook will automatically disable it for the next time Outlook starts. Re-executing 'addin.py' will ensure ! the addin is enabled (you can also locate and enable the addin via the ! labyrinth of Outlook preference dialogs.) If this happens and you have ! the Python exception that caused the failure (via the tracing mentioned ! above) please send it to Mark. Filtering -------- When running from Outlook, you can enable filtering for all mail that arrives ! in your Inbox (or any other folder). Note that Outlook's builtin rules will ! 
fire before this notification, and if these rules move the message it will never appear in the inbox (and thus will not get spam-filtered by a simple ! Inbox filter). You can watch as many folders for Spam as you like. You can define any number of filters to apply, each performing a different action or testing a different spam probability. You can enable and disable ! any rule, and you can "Filter Now" an entire folder in one step. Note that the rule ordering can be important, as if early rules move ! a message, later rules will not fire for that message (cos MAPI ! appears to make access to the message once moved impossible) Command Line Tools *************** *** 70,74 **** =========== Sean reports bad output saving very large classifiers in training.py. ! Somewhere over 4MB, they seem to stop working. Outlook will occasionally complain that folders are corrupted after running --- 76,81 ---- =========== Sean reports bad output saving very large classifiers in training.py. ! Somewhere over 4MB, they seem to stop working. Mark's hasn't got ! that big yet - just over 2MB and going strong. Outlook will occasionally complain that folders are corrupted after running From tim.one@comcast.net Mon Oct 21 03:42:39 2002 From: tim.one@comcast.net (Tim Peters) Date: Sun, 20 Oct 2002 22:42:39 -0400 Subject: [Spambayes-checkins] spambayes/Outlook2000 addin.py,1.8,1.9 classify.py,1.6,1.7 filter.py,1.7,1.8 manager.py,1.13,1.14 train.py,1.4,1.5 In-Reply-To: Message-ID: [Mark Hammond] > Modified Files: > addin.py classify.py filter.py manager.py train.py > Log Message: > Standardize where messages are pulled apart into a text stream so > everyone is consistent. Append *both* the HTML body and the plain > text body to the stream (some spam has the payload in *both*) This was a good idea. The problem is that there's no MIME structure in the generated string, so the tokenizer doesn't "see" either the HTML body or the plain text body. 
This is a problem for every msg with a MIME multipart or alternative Content-Type in the original headers, HTML or not: the Content-Type header specifies a boundary tag to look for in the body, but the boundary tag doesn't exist in the body in this reconstituted string. I started to suspect something fishy when I saw that "naked" in a ham msg had a neutral spamprob. It's because the tokenizer has rarely *seen* the "naked"s in the 100s of porn spams I trained on -- it doesn't see anything from the body in most of them. The suspicion intensified when some screamingly obvious spam showed up in my Unsure folder. The good news is that almost all of my spam is getting caught anyway, despite that most spam is getting judged solely by the tiny subset of header lines we don't ignore by default! In effect, it's rediscovered my one previous by-hand spam rule: "If it came from my MSN account, it's spam". But it's a hell of a lot better than that was, even running nearly blind. Unclear to me what to do about this; Outlook doesn't make life easy here. From mhammond@users.sourceforge.net Mon Oct 21 03:54:25 2002 From: mhammond@users.sourceforge.net (Mark Hammond) Date: Sun, 20 Oct 2002 19:54:25 -0700 Subject: [Spambayes-checkins] spambayes/Outlook2000 addin.py,1.9,1.10 Message-ID: Update of /cvsroot/spambayes/spambayes/Outlook2000 In directory usw-pr-cvs1:/tmp/cvs-serv27546 Modified Files: addin.py Log Message: Cleanup and comment the code. Tried to make a few obsene lines fit in 80 characters too. 
Index: addin.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/addin.py,v retrieving revision 1.9 retrieving revision 1.10 diff -C2 -d -r1.9 -r1.10 *** addin.py 20 Oct 2002 23:51:04 -0000 1.9 --- addin.py 21 Oct 2002 02:54:23 -0000 1.10 *************** *** 66,82 **** def CastTo(ob, target): """'Cast' a COM object to another type""" if hasattr(target, "index"): # string like # for now, we assume makepy for this to work. ! if not ob.__class__.__dict__.get("CLSID"): # Eeek - no makepy support - try and build it. ob = gencache.EnsureDispatch(ob) ! if not ob.__class__.__dict__.get("CLSID"): raise ValueError, "Must be a makepy-able object for this to work" clsid = ob.CLSID mod = gencache.GetModuleForCLSID(clsid) # Get the 'root' module. ! mod = gencache.GetModuleForTypelib(mod.CLSID, mod.LCID, mod.MajorVersion, mod.MinorVersion) # Find the CLSID of the target # XXX - should not be looking in VTables..., but no general map currently exists target_clsid = mod.VTablesNamesToIIDMap.get(target) mod = gencache.GetModuleForCLSID(target_clsid) target_class = getattr(mod, target) --- 66,95 ---- def CastTo(ob, target): """'Cast' a COM object to another type""" + # todo - should support target being an IID if hasattr(target, "index"): # string like # for now, we assume makepy for this to work. ! if not ob.__class__.__dict__.has_key("CLSID"): ! # Eeek - no makepy support - try and build it. ob = gencache.EnsureDispatch(ob) ! if not ob.__class__.__dict__.has_key("CLSID"): raise ValueError, "Must be a makepy-able object for this to work" clsid = ob.CLSID + # Lots of hoops to support "demand-build" - ie, generating + # code for an interface first time it is used. We assume the + # interface name exists in the same library as the object. + # This is generally the case - only referenced typelibs may be + # a problem, and we can handle that later. 
Maybe + # So get the generated module for the library itself, then + # find the interface CLSID there. mod = gencache.GetModuleForCLSID(clsid) # Get the 'root' module. ! mod = gencache.GetModuleForTypelib(mod.CLSID, mod.LCID, ! mod.MajorVersion, mod.MinorVersion) # Find the CLSID of the target # XXX - should not be looking in VTables..., but no general map currently exists target_clsid = mod.VTablesNamesToIIDMap.get(target) + if target_clsid is None: + raise ValueError, "The interface name '%s' does not appear in the " \ + "same library as object '%r'" % (target, ob) mod = gencache.GetModuleForCLSID(target_clsid) target_class = getattr(mod, target) *************** *** 96,102 **** class FolderItemsEvent: - def __del__(self): - print "Event dieing" - def Init(self, target, application, manager): self.application = application --- 109,112 ---- *************** *** 109,113 **** import filter num_rules = filter.filter_message(mapi_message, self.manager) ! print "%d Spam rules fired for message '%s'" % (num_rules, item.Subject.encode("ascii", "replace")) else: print "Spam filtering is disabled - ignoring new message" --- 119,124 ---- import filter num_rules = filter.filter_message(mapi_message, self.manager) ! print "%d Spam rules fired for message '%s'" \ ! % (num_rules, item.Subject.encode("ascii", "replace")) else: print "Spam filtering is disabled - ignoring new message" *************** *** 126,130 **** item = sel.Item(1) if item.Class != constants.olMail: ! win32ui.MessageBox("This function can only be performed on mail items", "Not a mail message") return --- 137,142 ---- item = sel.Item(1) if item.Class != constants.olMail: ! win32ui.MessageBox("This function can only be performed on mail items", ! "Not a mail message") return *************** *** 187,211 **** popup.TooltipText = "Anti-Spam filters and functions" popup.Enabled = True popup = CastTo(popup, "CommandBarPopup") ! ! item = popup.Controls.Add(Type=constants.msoControlButton, Temporary=True) ! 
# Hook events for the item ! item = DispatchWithEvents(item, ButtonEvent) ! item.Init(ShowClues, (self.manager, application)) ! item.Caption="Show spam clues for current message" ! item.Enabled = True ! self.buttons.append(item) ! ! item = popup.Controls.Add(Type=constants.msoControlButton, Temporary=True) ! # Hook events for the item ! item = DispatchWithEvents(item, ButtonEvent) ! item.Init(manager.ShowManager, (self.manager,)) ! item.Caption="Options..." ! item.TooltipText = "Define anti-spam filters" ! item.Enabled = True ! self.buttons.append(item) self.FiltersChanged() def FiltersChanged(self): # Create a notification hook for all folders we filter. --- 199,227 ---- popup.TooltipText = "Anti-Spam filters and functions" popup.Enabled = True + # Convert from "CommandBarItem" to derived "CommandBarPopup" + # Not sure if we should be able to work this out ourselves, but no + # introspection I tried seemed to indicate we can. VB does it via + # strongly-typed declarations. popup = CastTo(popup, "CommandBarPopup") ! # And add our children. ! self._AddPopup(popup, ShowClues, (self.manager, application), ! Caption="Show spam clues for current message", ! Enabled=True) ! self._AddPopup(popup, manager.ShowManager, (self.manager,), ! Caption="Anti-Spam Manager...", ! TooltipText = "Show the Anti-Spam manager dialog.", ! Enabled = True) self.FiltersChanged() + def _AddPopup(self, parent, target, target_args, **item_attrs): + item = parent.Controls.Add(Type=constants.msoControlButton, Temporary=True) + # Hook events for the item + item = DispatchWithEvents(item, ButtonEvent) + item.Init(target, target_args) + for attr, val in item_attrs.items(): + setattr(item, attr, val) + self.buttons.append(item) + def FiltersChanged(self): # Create a notification hook for all folders we filter. *************** *** 214,218 **** def UpdateFolderHooks(self): new_hooks = {} ! 
for mapi_folder in self.manager.BuildFolderList(self.manager.config.filter.folder_ids, self.manager.config.filter.include_sub): eid = mapi_folder.ID existing = self.folder_hooks.get(eid) --- 230,236 ---- def UpdateFolderHooks(self): new_hooks = {} ! for mapi_folder in self.manager.BuildFolderList( ! self.manager.config.filter.folder_ids, ! self.manager.config.filter.include_sub): eid = mapi_folder.ID existing = self.folder_hooks.get(eid) *************** *** 245,248 **** --- 263,276 ---- self.buttons = None + print "Addin terminating: %d COM client and %d COM servers exist." \ + % (pythoncom._GetInterfaceCount(), pythoncom._GetGatewayCount()) + try: + # will be available if "python_d addin.py" is used to + # register the addin. + total_refs = sys.gettotalrefcount() # debug Python builds only + print "%d Python references exist" % (total_refs,) + except AttributeError: + pass + def OnAddInsUpdate(self, custom): print "SpamAddin - OnAddInsUpdate", custom *************** *** 252,259 **** print "SpamAddin - OnBeginShutdown", custom - def RegisterAddin(klass): import _winreg ! key = _winreg.CreateKey(_winreg.HKEY_CURRENT_USER, "Software\\Microsoft\\Office\\Outlook\\Addins") subkey = _winreg.CreateKey(key, klass._reg_progid_) _winreg.SetValueEx(subkey, "CommandLineSafe", 0, _winreg.REG_DWORD, 0) --- 280,287 ---- print "SpamAddin - OnBeginShutdown", custom def RegisterAddin(klass): import _winreg ! key = _winreg.CreateKey(_winreg.HKEY_CURRENT_USER, ! "Software\\Microsoft\\Office\\Outlook\\Addins") subkey = _winreg.CreateKey(key, klass._reg_progid_) _winreg.SetValueEx(subkey, "CommandLineSafe", 0, _winreg.REG_DWORD, 0) *************** *** 265,269 **** import _winreg try: ! _winreg.DeleteKey(_winreg.HKEY_CURRENT_USER, "Software\\Microsoft\\Office\\Outlook\\Addins\\" + klass._reg_progid_) except WindowsError: pass --- 293,299 ---- import _winreg try: ! _winreg.DeleteKey(_winreg.HKEY_CURRENT_USER, ! "Software\\Microsoft\\Office\\Outlook\\Addins\\" \ ! 
+ klass._reg_progid_) except WindowsError: pass From popiel@users.sourceforge.net Mon Oct 21 06:00:08 2002 From: popiel@users.sourceforge.net (T. Alexander Popiel) Date: Sun, 20 Oct 2002 22:00:08 -0700 Subject: [Spambayes-checkins] spambayes table.py,NONE,1.1 Message-ID: Update of /cvsroot/spambayes/spambayes In directory usw-pr-cvs1:/tmp/cvs-serv2626 Added Files: table.py Log Message: Added table.py, a tabular comparator. The output is not as detailed as cmp.py, but it's concise and usable with more than two files. --- NEW FILE: table.py --- #!/usr/bin/env python """ table.py base1 base2 ... baseN Combines output from base1.txt, base2.txt, etc., which are created by the TestDriver (such as timcv.py) output, and displays tabulated comparison statistics to stdout. Each input file is represented by one column in the table. """ import sys import re # Return # ( # ham tested, # spam tested, # total f-p, # total f-n, # total unsure, # average f-p rate, # average f-n rate, # average unsure rate, # real cost, # best cost, # ham score deviation for all runs, # spam score deviations for all runs, # ) # from summary file f. 
def suck(f): hamdevall = spamdevall = (0.0, 0.0) cost = 0.0 bestcost = 0.0 fp = 0 fn = 0 un = 0 fpp = 0.0 fnp = 0.0 unp = 0.0 htest = 0 stest = 0 get = f.readline while 1: line = get() if line.startswith('-> tested'): print line, htest = int(line.split()[3]) stest = int(line.split()[6]) if line.find(' items; mean ') != -1: # -> Ham distribution for this pair: 1000 items; mean 0.05; sample sdev 0.68 # and later "sample " went away vals = line.split(';') mean = float(vals[1].split()[-1]) sdev = float(vals[2].split()[-1]) val = (mean, sdev) typ = vals[0].split()[2] if line.find('for all runs') != -1: if typ == 'Ham': hamdevall = val else: spamdevall = val continue if line.startswith('-> best cost for all runs: $'): bestcost = float(line.split('$')[-1]) if line.startswith('-> all runs false positives: '): fp = int(line.split()[-1]) if line.startswith('-> all runs false negatives: '): fn = int(line.split()[-1]) if line.startswith('-> all runs unsure: '): un = int(line.split()[-1]) if line.startswith('-> all runs false positive %: '): fpp = float(line.split()[-1]) if line.startswith('-> all runs false negative %: '): fnp = float(line.split()[-1]) if line.startswith('-> all runs unsure %: '): unp = float(line.split()[-1]) if line.startswith('-> all runs cost: '): cost = float(line.split('$')[-1]) break if line.startswith('-> '): continue return (htest, stest, fp, fn, un, fpp, fnp, unp, cost, bestcost, hamdevall, spamdevall) def windowsfy(fn): import os if os.path.exists(fn + '.txt'): return fn + '.txt' else: return fn ratio = "ham:spam: " fptot = "fp total: " fpper = "fp %: " fntot = "fn total: " fnper = "fn %: " untot = "unsure t: " unper = "unsure %: " rcost = "real cost:" bcost = "best cost:" hmean = "h mean: " hsdev = "h sdev: " smean = "s mean: " ssdev = "s sdev: " meand = "mean diff:" kval = "k: " for filename in sys.argv[1:]: filename = windowsfy(filename) (htest, stest, fp, fn, un, fpp, fnp, unp, cost, bestcost, hamdevall, spamdevall) = suck(file(filename)) ratio 
+= "%8s" % ("%d:%d" % (htest, stest)) fptot += "%8d" % fp fpper += "%8.2f" % fpp fntot += "%8d" % fn fnper += "%8.2f" % fnp untot += "%8d" % un unper += "%8.2f" % unp rcost += "%8s" % ("$%.2f" % cost) bcost += "%8s" % ("$%.2f" % bestcost) hmean += "%8.2f" % hamdevall[0] hsdev += "%8.2f" % hamdevall[1] smean += "%8.2f" % spamdevall[0] ssdev += "%8.2f" % spamdevall[1] meand += "%8.2f" % (spamdevall[0] - hamdevall[0]) k = (spamdevall[0] - hamdevall[0]) / (spamdevall[1] + hamdevall[1]) kval += "%8.2f" % k print ratio print fptot print fpper print fntot print fnper print untot print unper print rcost print bcost print hmean print hsdev print smean print ssdev print meand print kval From tim_one@users.sourceforge.net Mon Oct 21 18:28:47 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Mon, 21 Oct 2002 10:28:47 -0700 Subject: [Spambayes-checkins] spambayes/Outlook2000 addin.py,1.10,1.11 Message-ID: Update of /cvsroot/spambayes/spambayes/Outlook2000 In directory usw-pr-cvs1:/tmp/cvs-serv620 Modified Files: addin.py Log Message: ShowClues(): It's not a probability, so don't call it one. Also reduced the amount of horizontal whitespace in the clues listing; some msgs have extremely long generated "words" that pushed the spamprobs off the right edge of visible part of the generated msg. Index: addin.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/addin.py,v retrieving revision 1.10 retrieving revision 1.11 diff -C2 -d -r1.10 -r1.11 *** addin.py 21 Oct 2002 02:54:23 -0000 1.10 --- addin.py 21 Oct 2002 17:28:44 -0000 1.11 *************** *** 146,158 **** prob, clues = hammie.score(stream, evidence=True) new_msg = app.CreateItem(0) ! body = ["

<h2>Calculated Probability: %g</h2><br>" % prob] push = body.append push("<PRE>\n")
!     words = ['%r' % word for word, prob in clues]
!     probs = ['%g' % prob for word, prob in clues]
!     max_word_len = max(map(len, words))
!     for word, prob in zip(words, probs):
!         push(escape(word) + ' ' * (max_word_len - len(word)))
!         push(' ' + prob + '\n')
      push("</PRE>\n") body = ''.join(body) --- 146,157 ---- prob, clues = hammie.score(stream, evidence=True) new_msg = app.CreateItem(0) ! body = ["<h2>Spam Score: %g</h2><br>" % prob] push = body.append + # Format the clues. push("<PRE>\n")
!     for word, prob in clues:
!         word = repr(word)
!         push(escape(word) + ' ' * (30 - len(word)))
!         push(' %g\n' % prob)
      push("</PRE>
\n") body = ''.join(body) From tim_one@users.sourceforge.net Mon Oct 21 19:55:32 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Mon, 21 Oct 2002 11:55:32 -0700 Subject: [Spambayes-checkins] spambayes/Outlook2000 manager.py,1.14,1.15 Message-ID: Update of /cvsroot/spambayes/spambayes/Outlook2000 In directory usw-pr-cvs1:/tmp/cvs-serv4305 Modified Files: manager.py Log Message: GetBayesStreamForMessage(): For every msg with MIME structure, Outlook left the boundary info in the headers, but there are no boundaries in the body. As a result, all of the body was invisible to the Python email pkg. Reconstituting the full original email from Outlook appears to be a real bitch -- maybe Mozilla has code for this we can use (but I suspect its import-from-Outlook gimmick actually crawls over the .pst file; I haven't used it, just read about it). In the meantime, quick hack: squash the text part (if any) and the HTML part (if any) together as one big text blob, and if the headers make any claims about MIME type and/or transfer encoding, simply delete those header lines. Index: manager.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/manager.py,v retrieving revision 1.14 retrieving revision 1.15 diff -C2 -d -r1.14 -r1.15 *** manager.py 20 Oct 2002 23:51:04 -0000 1.14 --- manager.py 21 Oct 2002 18:55:30 -0000 1.15 *************** *** 84,87 **** --- 84,89 ---- def GetBayesStreamForMessage(self, message): # Note - caller must catch COM error + import email + headers = message.Fields[0x7D001E].Value headers = headers.encode('ascii', 'replace') *************** *** 92,97 **** body = "" body += message.Text.encode("ascii", "replace") ! return headers + body ! def LoadBayes(self): --- 94,109 ---- body = "" body += message.Text.encode("ascii", "replace") ! ! # XXX If this was originally a MIME msg, we're hosed at this point -- ! # the boundary tag in the headers doesn't exist in the body, and ! 
# the msg is simply ill-formed. The miserable hack here simply ! # squashes the text part (if any) and the HTML part (if any) together, ! # and strips MIME info from the original headers. ! msg = email.message_from_string(headers + '\n' + body) ! if msg.has_key('content-type'): ! del msg['content-type'] ! if msg.has_key('content-transfer-encoding'): ! del msg['content-transfer-encoding'] ! return msg def LoadBayes(self): From tim_one@users.sourceforge.net Mon Oct 21 22:18:57 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Mon, 21 Oct 2002 14:18:57 -0700 Subject: [Spambayes-checkins] spambayes table.py,1.1,1.2 Message-ID: Update of /cvsroot/spambayes/spambayes In directory usw-pr-cvs1:/tmp/cvs-serv11669 Modified Files: table.py Log Message: Minor fiddling, + changed to get the counts of total ham & spam tested out of the "all runs" histogram header line. Before it was picking up a wrong value from an interior test, provided you ran a test setup creating such a beast. Index: table.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/table.py,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** table.py 21 Oct 2002 05:00:05 -0000 1.1 --- table.py 21 Oct 2002 21:18:55 -0000 1.2 *************** *** 41,85 **** htest = 0 stest = 0 ! get = f.readline while 1: line = get() if line.startswith('-> tested'): print line, ! htest = int(line.split()[3]) ! stest = int(line.split()[6]) ! if line.find(' items; mean ') != -1: ! # -> Ham distribution for this pair: 1000 items; mean 0.05; sample sdev 0.68 ! # and later "sample " went away vals = line.split(';') mean = float(vals[1].split()[-1]) sdev = float(vals[2].split()[-1]) val = (mean, sdev) typ = vals[0].split()[2] if line.find('for all runs') != -1: if typ == 'Ham': hamdevall = val else: spamdevall = val ! continue ! if line.startswith('-> best cost for all runs: $'): bestcost = float(line.split('$')[-1]) ! 
if line.startswith('-> all runs false positives: '): fp = int(line.split()[-1]) ! if line.startswith('-> all runs false negatives: '): fn = int(line.split()[-1]) ! if line.startswith('-> all runs unsure: '): un = int(line.split()[-1]) ! if line.startswith('-> all runs false positive %: '): fpp = float(line.split()[-1]) ! if line.startswith('-> all runs false negative %: '): fnp = float(line.split()[-1]) ! if line.startswith('-> all runs unsure %: '): unp = float(line.split()[-1]) ! if line.startswith('-> all runs cost: '): cost = float(line.split('$')[-1]) break - if line.startswith('-> '): - continue return (htest, stest, fp, fn, un, fpp, fnp, unp, cost, bestcost, --- 41,95 ---- htest = 0 stest = 0 ! get = f.readline while 1: line = get() if line.startswith('-> tested'): + # -> tested 1910 hams & 948 spams against 2741 hams & 948 spams + # 0 1 2 3 4 5 6 print line, ! ! elif line.find(' items; mean ') > 0 and line.find('for all runs') > 0: ! # -> Ham scores for all runs: 2741 items; mean 0.86; sdev 6.28 ! # 0 1 2 vals = line.split(';') mean = float(vals[1].split()[-1]) sdev = float(vals[2].split()[-1]) val = (mean, sdev) + ntested = int(vals[0].split()[-2]) typ = vals[0].split()[2] if line.find('for all runs') != -1: if typ == 'Ham': hamdevall = val + htest = ntested else: spamdevall = val ! stest = ntested ! ! elif line.startswith('-> best cost for all runs: $'): ! # -> best cost for all runs: $28.20 bestcost = float(line.split('$')[-1]) ! ! elif line.startswith('-> all runs false positives: '): fp = int(line.split()[-1]) ! ! elif line.startswith('-> all runs false negatives: '): fn = int(line.split()[-1]) ! ! elif line.startswith('-> all runs unsure: '): un = int(line.split()[-1]) ! ! elif line.startswith('-> all runs false positive %: '): fpp = float(line.split()[-1]) ! ! elif line.startswith('-> all runs false negative %: '): fnp = float(line.split()[-1]) ! ! elif line.startswith('-> all runs unsure %: '): unp = float(line.split()[-1]) ! ! 
elif line.startswith('-> all runs cost: '): cost = float(line.split('$')[-1]) break return (htest, stest, fp, fn, un, fpp, fnp, unp, cost, bestcost, *************** *** 114,126 **** (htest, stest, fp, fn, un, fpp, fnp, unp, cost, bestcost, hamdevall, spamdevall) = suck(file(filename)) ! ratio += "%8s" % ("%d:%d" % (htest, stest)) ! fptot += "%8d" % fp fpper += "%8.2f" % fpp ! fntot += "%8d" % fn fnper += "%8.2f" % fnp ! untot += "%8d" % un unper += "%8.2f" % unp ! rcost += "%8s" % ("$%.2f" % cost) ! bcost += "%8s" % ("$%.2f" % bestcost) hmean += "%8.2f" % hamdevall[0] hsdev += "%8.2f" % hamdevall[1] --- 124,136 ---- (htest, stest, fp, fn, un, fpp, fnp, unp, cost, bestcost, hamdevall, spamdevall) = suck(file(filename)) ! ratio += "%8s" % ("%d:%d" % (htest, stest)) ! fptot += "%8d" % fp fpper += "%8.2f" % fpp ! fntot += "%8d" % fn fnper += "%8.2f" % fnp ! untot += "%8d" % un unper += "%8.2f" % unp ! rcost += "%8s" % ("$%.2f" % cost) ! bcost += "%8s" % ("$%.2f" % bestcost) hmean += "%8.2f" % hamdevall[0] hsdev += "%8.2f" % hamdevall[1] From tim_one@users.sourceforge.net Mon Oct 21 23:51:02 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Mon, 21 Oct 2002 15:51:02 -0700 Subject: [Spambayes-checkins] spambayes/Outlook2000 addin.py,1.11,1.12 Message-ID: Update of /cvsroot/spambayes/spambayes/Outlook2000 In directory usw-pr-cvs1:/tmp/cvs-serv10238 Modified Files: addin.py Log Message: Added missing import of win32ui. 
Index: addin.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/addin.py,v retrieving revision 1.11 retrieving revision 1.12 diff -C2 -d -r1.11 -r1.12 *** addin.py 21 Oct 2002 17:28:44 -0000 1.11 --- addin.py 21 Oct 2002 22:50:59 -0000 1.12 *************** *** 15,18 **** --- 15,19 ---- import pythoncom from win32com.client import constants + import win32ui # If we are not running in a console, redirect all print statements to the From tim_one@users.sourceforge.net Tue Oct 22 02:37:55 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Mon, 21 Oct 2002 18:37:55 -0700 Subject: [Spambayes-checkins] spambayes tokenizer.py,1.46,1.47 Message-ID: Update of /cvsroot/spambayes/spambayes In directory usw-pr-cvs1:/tmp/cvs-serv11217 Modified Files: tokenizer.py Log Message: Replace   with a blank. & doesn't appear to show up often enough to bother with. Index: tokenizer.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/tokenizer.py,v retrieving revision 1.46 retrieving revision 1.47 diff -C2 -d -r1.46 -r1.47 *** tokenizer.py 30 Sep 2002 21:56:27 -0000 1.46 --- tokenizer.py 22 Oct 2002 01:37:53 -0000 1.47 *************** *** 1083,1089 **** yield t ! # Remove HTML/XML tags. if (part.get_content_type() == "text/plain" or not options.retain_pure_html_tags): text = html_re.sub(' ', text) --- 1083,1090 ---- yield t ! # Remove HTML/XML tags. Also  . 
if (part.get_content_type() == "text/plain" or not options.retain_pure_html_tags): + text = text.replace(' ', ' ') text = html_re.sub(' ', text) From tim_one@users.sourceforge.net Thu Oct 24 05:22:21 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Wed, 23 Oct 2002 21:22:21 -0700 Subject: [Spambayes-checkins] spambayes/Outlook2000 addin.py,1.12,1.13 classify.py,1.7,1.8 filter.py,1.8,1.9 manager.py,1.15,1.16 Message-ID: Update of /cvsroot/spambayes/spambayes/Outlook2000 In directory usw-pr-cvs1:/tmp/cvs-serv22299 Modified Files: addin.py classify.py filter.py manager.py Log Message: Indirecting thru Hammie (somtimes, not others) wasn't buying us anything. Gave BayesManager its own score() method. Index: addin.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/addin.py,v retrieving revision 1.12 retrieving revision 1.13 diff -C2 -d -r1.12 -r1.13 *** addin.py 21 Oct 2002 22:50:59 -0000 1.12 --- addin.py 24 Oct 2002 04:22:19 -0000 1.13 *************** *** 144,149 **** mapi_message = mgr.mapi.GetMessage(item.EntryID) stream = mgr.GetBayesStreamForMessage(mapi_message) ! hammie = mgr.MakeHammie() ! prob, clues = hammie.score(stream, evidence=True) new_msg = app.CreateItem(0) body = ["

<h2>Spam Score: %g</h2><br>" % prob] --- 144,148 ---- mapi_message = mgr.mapi.GetMessage(item.EntryID) stream = mgr.GetBayesStreamForMessage(mapi_message) ! prob, clues = mgr.score(stream, evidence=True) new_msg = app.CreateItem(0) body = ["<h2>Spam Score: %g</h2><br>
" % prob] Index: classify.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/classify.py,v retrieving revision 1.7 retrieving revision 1.8 diff -C2 -d -r1.7 -r1.8 *** classify.py 20 Oct 2002 23:51:04 -0000 1.7 --- classify.py 24 Oct 2002 04:22:19 -0000 1.8 *************** *** 10,14 **** def classify_folder( f, mgr, config, progress): - hammie = mgr.MakeHammie() messages = f.Messages pythoncom.CoInitialize() # We are called on a different thread. --- 10,13 ---- *************** *** 24,28 **** progress.tick() stream = mgr.GetBayesStreamForMessage(message) ! prob = hammie.score(stream, evidence=False) added_prop = False try: --- 23,27 ---- progress.tick() stream = mgr.GetBayesStreamForMessage(message) ! prob = mgr.score(stream) added_prop = False try: Index: filter.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/filter.py,v retrieving revision 1.8 retrieving revision 1.9 diff -C2 -d -r1.8 -r1.9 *** filter.py 20 Oct 2002 23:51:04 -0000 1.8 --- filter.py 24 Oct 2002 04:22:19 -0000 1.9 *************** *** 16,21 **** return ! hammie = mgr.MakeHammie() ! prob = hammie.score(stream, evidence=False) num_rules = 0 for rule in mgr.config.rules: --- 16,20 ---- return ! prob = mgr.score(stream) num_rules = 0 for rule in mgr.config.rules: Index: manager.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/manager.py,v retrieving revision 1.15 retrieving revision 1.16 diff -C2 -d -r1.15 -r1.16 *** manager.py 21 Oct 2002 18:55:30 -0000 1.15 --- manager.py 24 Oct 2002 04:22:19 -0000 1.16 *************** *** 28,32 **** import classifier ! import hammie # Suck in CDO type lib --- 28,32 ---- import classifier ! 
from tokenizer import tokenize # Suck in CDO type lib *************** *** 132,138 **** self.bayes_dirty = False - def MakeHammie(self): - return hammie.Hammie(self.bayes) - def LoadConfig(self): try: --- 132,135 ---- *************** *** 229,232 **** --- 226,232 ---- yield message message = messages.GetNext() + + def score(self, msg, evidence=False): + return self.bayes.spamprob(tokenize(msg), evidence) _mgr = None From tim_one@users.sourceforge.net Thu Oct 24 05:58:54 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Wed, 23 Oct 2002 21:58:54 -0700 Subject: [Spambayes-checkins] spambayes/Outlook2000 default_bayes_customize.ini,1.1,1.2 manager.py,1.16,1.17 Message-ID: Update of /cvsroot/spambayes/spambayes/Outlook2000 In directory usw-pr-cvs1:/tmp/cvs-serv20955 Modified Files: default_bayes_customize.ini manager.py Log Message: Our (Outlook's) spambayes .ini file was ineffective, because the envar was set too late to do any good. Tried to repair that. Index: default_bayes_customize.ini =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/default_bayes_customize.ini,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** default_bayes_customize.ini 19 Oct 2002 16:24:41 -0000 1.1 --- default_bayes_customize.ini 24 Oct 2002 04:58:52 -0000 1.2 *************** *** 2,3 **** --- 2,6 ---- # This file must exist, or the addin considers itself confused. # As we decide default options, we can add them! + + [Classifier] + #use_chi_squared_combining: True Index: manager.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/manager.py,v retrieving revision 1.16 retrieving revision 1.17 diff -C2 -d -r1.16 -r1.17 *** manager.py 24 Oct 2002 04:22:19 -0000 1.16 --- manager.py 24 Oct 2002 04:58:52 -0000 1.17 *************** *** 17,32 **** this_filename = os.path.abspath(sys.argv[0]) ! # This is a little of a hack . 
We are generally in a child directory of the ! # bayes code. To help installation, we handle the fact that this may not be ! # on sys.path. ! try: ! import classifier ! except ImportError: ! parent = os.path.abspath(os.path.join(os.path.dirname(this_filename), "..")) ! sys.path.insert(0, parent) ! del parent ! import classifier ! from tokenizer import tokenize # Suck in CDO type lib --- 17,40 ---- this_filename = os.path.abspath(sys.argv[0]) ! # This is a little bit of a hack . We are generally in a child directory ! # of the bayes code. To help installation, we handle the fact that this may ! # not be on sys.path. Note that doing these imports is delayed, so that we ! # can set the BAYESCUSTOMIZE envar first (if we import anything from the core ! # spambayes code before setting that envar, our .ini file may have no effect). ! def import_core_spambayes_stuff(ini_filename): ! global bayes_classifier, bayes_tokenize ! os.environ["BAYESCUSTOMIZE"] = ini_filename ! try: ! import classifier ! except ImportError: ! parent = os.path.abspath(os.path.join(os.path.dirname(this_filename), ! "..")) ! sys.path.insert(0, parent) ! ! import classifier ! from tokenizer import tokenize ! bayes_classifier = classifier ! bayes_tokenize = tokenize # Suck in CDO type lib *************** *** 60,63 **** --- 68,72 ---- os.chdir(cwd) + import_core_spambayes_stuff(self.ini_filename) self.LoadBayes() *************** *** 114,119 **** bayes = None try: ! os.environ["BAYESCUSTOMIZE"]=self.ini_filename ! bayes = cPickle.load(open(self.bayes_filename,'rb')) print "Loaded bayes database from '%s'" % (self.bayes_filename,) except IOError: --- 123,127 ---- bayes = None try: ! bayes = cPickle.load(open(self.bayes_filename, 'rb')) print "Loaded bayes database from '%s'" % (self.bayes_filename,) except IOError: *************** *** 161,166 **** def InitNewBayes(self): ! os.environ["BAYESCUSTOMIZE"]=self.ini_filename ! 
self.bayes = classifier.Bayes() self.bayes_dirty = True --- 169,173 ---- def InitNewBayes(self): ! self.bayes = bayes_classifier.Bayes() self.bayes_dirty = True *************** *** 228,232 **** def score(self, msg, evidence=False): ! return self.bayes.spamprob(tokenize(msg), evidence) _mgr = None --- 235,239 ---- def score(self, msg, evidence=False): ! return self.bayes.spamprob(bayes_tokenize(msg), evidence) _mgr = None From mhammond@users.sourceforge.net Thu Oct 24 14:06:41 2002 From: mhammond@users.sourceforge.net (Mark Hammond) Date: Thu, 24 Oct 2002 06:06:41 -0700 Subject: [Spambayes-checkins] spambayes/Outlook2000/dialogs ClassifyDialog.py,1.3,1.4 Message-ID: Update of /cvsroot/spambayes/spambayes/Outlook2000/dialogs In directory usw-pr-cvs1:/tmp/cvs-serv19862/dialogs Modified Files: ClassifyDialog.py Log Message: Had a nice log message written - 3rd go at checkin in. Use extended MAPI. Much faster. Nearly got rid of MAPI.Session. Must sleep Index: ClassifyDialog.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/dialogs/ClassifyDialog.py,v retrieving revision 1.3 retrieving revision 1.4 diff -C2 -d -r1.3 -r1.4 *** ClassifyDialog.py 20 Oct 2002 05:52:50 -0000 1.3 --- ClassifyDialog.py 24 Oct 2002 13:06:39 -0000 1.4 *************** *** 3,6 **** --- 3,7 ---- from pywin.mfc import dialog import win32con + import pythoncom import commctrl import win32ui From mhammond@users.sourceforge.net Thu Oct 24 14:06:41 2002 From: mhammond@users.sourceforge.net (Mark Hammond) Date: Thu, 24 Oct 2002 06:06:41 -0700 Subject: [Spambayes-checkins] spambayes/Outlook2000 msgstore.py,NONE,1.1 addin.py,1.13,1.14 classify.py,1.8,1.9 filter.py,1.9,1.10 manager.py,1.17,1.18 rule.py,1.4,1.5 train.py,1.5,1.6 Message-ID: Update of /cvsroot/spambayes/spambayes/Outlook2000 In directory usw-pr-cvs1:/tmp/cvs-serv19862 Modified Files: addin.py classify.py filter.py manager.py rule.py train.py Added Files: msgstore.py Log 
Message: Had a nice log message written - 3rd go at checkin in. Use extended MAPI. Much faster. Nearly got rid of MAPI.Session. Must sleep --- NEW FILE: msgstore.py --- from __future__ import generators import sys, os # Abstract definition - can be moved out when we have more than one sub-class # External interface to this module is almost exclusively via a "folder ID" class MsgStoreException(Exception): pass class NotFoundException(MsgStoreException): pass class MsgStore: # Stash exceptions in the class for ease of use by consumers. MsgStoreException = MsgStoreException NotFoundException = NotFoundException def __init__(self): pass def Close(self): # Close this object and free everything raise NotImplementedError def GetFolderGenerator(self, folder_ids, include_sub): # Return a generator of MsgStoreFolder objects. raise NotImplementedError def GetFolder(self, folder_id): # Return a single folder given the ID. raise NotImplementedError def GetMessage(self, message_id): # Return a single message given the ID. raise NotImplementedError class MsgStoreFolder: def __init__(self): self.name = "" self.count = 0 def GetMessageGenerator(self, folder): # Return a generator of MsgStoreMsg objects for the folder raise NotImplementedError class MsgStoreMsg: def __init__(self): self.unread = False def GetEmailPackageObject(self): # Return a "read-only" Python email package object # "read-only" in that changes will never be reflected to the real store. raise NotImplementedError def SetField(self, name, value): # Abstractly set a user field name/id to a field value. # User field is for the user to see - status/internal fields # should get their own methods raise NotImplementedError def Save(self): # Save changes after field changes. raise NotImplementedError def MoveTo(self, folder_id): # Move the message to a folder. raise NotImplementedError def CopyTo(self, folder_id): # Copy the message to a folder. raise NotImplementedError # And some status ones we may hopefully use. 
def BeenFiltered(self): # Ever been filtered by us before raise NotImplementedError def GetTrainedCorpaName(self): # Return None, "ham" or "spam" raise NotImplementedError # Our MAPI implementation import warnings warnings.filterwarnings("ignore", category=FutureWarning, append=1) from win32com.client import Dispatch, constants from win32com.mapi import mapi from win32com.mapi.mapitags import * MESSAGE_MOVE = 0x1 # from MAPIdefs.h MYPR_BODY_HTML_A = 0x1013001e # magic MYPR_BODY_HTML_W = 0x1013001f # ditto USE_DEFERRED_ERRORS = mapi.MAPI_DEFERRED_ERRORS # or set to zero to see what changes class MAPIMsgStore(MsgStore): def __init__(self, outlook = None): self.outlook = outlook cwd = os.getcwd() mapi.MAPIInitialize(None) logonFlags = mapi.MAPI_NO_MAIL | mapi.MAPI_EXTENDED | mapi.MAPI_USE_DEFAULT self.session = mapi.MAPILogonEx(0, None, None, logonFlags) self._FindDefaultMessageStore() os.chdir(cwd) def Close(self): self.mapi_msgstore = None self.session.Logoff(0,0,0) self.session = None mapi.MAPIUninitialize() def _FindDefaultMessageStore(self): tab = self.session.GetMsgStoresTable(0) # restriction for the table. restriction = mapi.RES_PROPERTY, (mapi.RELOP_EQ, PR_DEFAULT_STORE, (PR_DEFAULT_STORE, True)) rows = mapi.HrQueryAllRows(tab, (PR_ENTRYID,), restriction, None, 0) # get first entry row = rows[0] eid_tag, eid = row[0] # Open the store. 
self.mapi_msgstore = self.session.OpenMsgStore(0, eid, None, mapi.MDB_WRITE | mapi.MDB_NO_MAIL | USE_DEFERRED_ERRORS ) def _GetSubFolderIter(self, folder): table = folder.GetHierarchyTable(0) rows = mapi.HrQueryAllRows(table, (PR_ENTRYID,PR_DISPLAY_NAME_A), None, None, 0) for (eid_tag, eid),(name_tag, name) in rows: sub = self.mapi_msgstore.OpenEntry(eid, None, mapi.MAPI_MODIFY | USE_DEFERRED_ERRORS) table = sub.GetContentsTable(0) yield MAPIMsgStoreFolder(self, eid, name, table.GetRowCount(0)) folder = self.mapi_msgstore.OpenEntry(eid, None, mapi.MAPI_MODIFY | USE_DEFERRED_ERRORS) for store_folder in self._GetSubFolderIter(folder): yield store_folder def GetFolderGenerator(self, folder_ids, include_sub): for folder_id in folder_ids: folder_id = mapi.BinFromHex(folder_id) folder = self.mapi_msgstore.OpenEntry(folder_id, None, mapi.MAPI_MODIFY | USE_DEFERRED_ERRORS) table = folder.GetContentsTable(0) rc, props = folder.GetProps( (PR_DISPLAY_NAME_A,), 0) yield MAPIMsgStoreFolder(self, folder_id, props[0][1], table.GetRowCount(0)) if include_sub: for f in self._GetSubFolderIter(folder): yield f def GetFolder(self, folder_id): # Return a single folder given the ID. folder_id = mapi.BinFromHex(folder_id) folder = self.mapi_msgstore.OpenEntry(folder_id, None, mapi.MAPI_MODIFY | USE_DEFERRED_ERRORS) table = folder.GetContentsTable(0) rc, props = folder.GetProps( (PR_DISPLAY_NAME_A,), 0) return MAPIMsgStoreFolder(self, folder_id, props[0][1], table.GetRowCount(0)) def GetMessage(self, message_id): # Return a single message given the ID. 
message_id = mapi.BinFromHex(message_id) prop_ids = PR_PARENT_ENTRYID, PR_CONTENT_UNREAD mapi_object = self.mapi_msgstore.OpenEntry(message_id, None, mapi.MAPI_MODIFY | USE_DEFERRED_ERRORS) hr, data = mapi_object.GetProps(prop_ids,0) folder_eid = data[0][1] unread = data[1][1] folder = MAPIMsgStoreFolder(self, folder_eid, "Unknown - temp message", -1) return MAPIMsgStoreMsg(self, folder, message_id, unread) def GetOutlookObjectFromID(self, eid): if self.outlook is None: from win32com.client import Dispatch self.outlook = Dispatch("Outlook.Application") return self.outlook.Session.GetItemFromID(mapi.HexFromBin(eid)) _MapiTypeMap = { type(0.0): PT_DOUBLE, type(0): PT_I4, type(''): PT_STRING8, type(u''): PT_UNICODE, type(1==1): PT_BOOLEAN, } class MAPIMsgStoreFolder(MsgStoreMsg): def __init__(self, msgstore, id, name, count): self.msgstore = msgstore self.id = id self.name = name self.count = count def __repr__(self): return "<%s '%s' (%d items), id=%s>" % (self.__class__.__name__, self.name, self.count, mapi.HexFromBin(self.id)) def GetOutlookEntryID(self): return mapi.HexFromBin(self.id) def GetMessageGenerator(self): folder = self.msgstore.mapi_msgstore.OpenEntry(self.id, None, mapi.MAPI_MODIFY | USE_DEFERRED_ERRORS) table = folder.GetContentsTable(0) prop_ids = PR_ENTRYID, PR_CONTENT_UNREAD table.SetColumns(prop_ids, 0) while 1: # Getting 70 at a time was the random number that gave best perf for me ;) rows = table.QueryRows(70, 0) if len(rows)==0: break for row in rows: yield MAPIMsgStoreMsg(self.msgstore, self, row[0][1], row[1][1]) class MAPIMsgStoreMsg(MsgStoreMsg): def __init__(self, msgstore, folder, entryid, unread): self.folder = folder self.msgstore = msgstore self.mapi_object = None self.id = entryid self.unread = unread self.dirty = False def __repr__(self): urs = ["read", "unread"][self.unread] return "<%s, (%s) id=%s>" % (self.__class__.__name__, urs, mapi.HexFromBin(self.id)) def GetOutlookEntryID(self): return mapi.HexFromBin(self.id) def 
_GetMessageText(self): self._EnsureObject() prop_ids = PR_TRANSPORT_MESSAGE_HEADERS_A, PR_BODY, MYPR_BODY_HTML_A hr, data = self.mapi_object.GetProps(prop_ids,0) headers = data[0][1] if type(headers) != type(''): headers = '' # If no field will be an int error (the tag([0]) would tell us, but this is easier) body = data[1][1] if type(body) != type(''): body= '' # If no field will be an int error (the tag([0]) would tell us, but this is easier) # Messages with "text/html" and "multipart/*" give grief. # In some cases, the HTML body appears *only* accessible via Outlook :( Outlook is slow, so try and avoid # Tried using the "_W" props, and indeed tried dumping every prop - these HTML messages are hidden from Mapi! if PROP_TYPE(data[2][0])==PT_ERROR: # No HTML body - see if one of our problem children. html = "" lo_headers = headers.lower() if lo_headers.find("content-type: text/html")>=0 or lo_headers.find("content-type: multipart/")>=0: outlook_msg = self.msgstore.GetOutlookObjectFromID(self.id) html = outlook_msg.HTMLBody.encode("ascii", "replace") else: html = data[2][1] return headers + "\n" + html + "\n" + body def _EnsureObject(self): if self.mapi_object is None: self.mapi_object = self.msgstore.mapi_msgstore.OpenEntry(self.id, None, mapi.MAPI_MODIFY | USE_DEFERRED_ERRORS) def GetEmailPackageObject(self): import email # XXX If this was originally a MIME msg, we're hosed at this point -- # the boundary tag in the headers doesn't exist in the body, and # the msg is simply ill-formed. The miserable hack here simply # squashes the text part (if any) and the HTML part (if any) together, # and strips MIME info from the original headers. 
text = self._GetMessageText() try: msg = email.message_from_string(text) except: print "FAILED to create email.message from: ", `text` raise if msg.has_key('content-type'): del msg['content-type'] if msg.has_key('content-transfer-encoding'): del msg['content-transfer-encoding'] return msg def SetField(self, prop, val): self._EnsureObject() if type(prop)!=type(0): props = ( (mapi.PS_PUBLIC_STRINGS, prop), ) propIds = self.mapi_object.GetIDsFromNames(props, mapi.MAPI_CREATE) type_tag = _MapiTypeMap.get(type(val)) if type_tag is None: raise ValueError, "Dont know what to do with '%r' ('%s')" % (val, type(val)) prop = PROP_TAG( type_tag, PROP_ID(propIds[0])) if val is None: # Delete the property self.mapi_object.DeleteProps((prop,)) else: self.mapi_object.SetProps(((prop,val),)) self.dirty = True def Save(self): assert self.dirty, "asking me to save a clean message!" self.mapi_object.SaveChanges(mapi.KEEP_OPEN_READWRITE) self.dirty = False def _DoCopyMode(self, folder, isMove): ## self.mapi_object = None # release the COM pointer assert not self.dirty, "asking me to move a dirty message - later saves will fail!" 
dest_folder = self.msgstore.mapi_msgstore.OpenEntry(folder.id, None, mapi.MAPI_MODIFY | USE_DEFERRED_ERRORS) source_folder = self.msgstore.mapi_msgstore.OpenEntry(self.folder.id, None, mapi.MAPI_MODIFY | USE_DEFERRED_ERRORS) flags = 0 if isMove: flags |= MESSAGE_MOVE source_folder.CopyMessages( (self.id,), None, dest_folder, 0, None, flags) self.folder = self.msgstore.GetFolder(mapi.HexFromBin(folder.id)) def MoveTo(self, folder): self._DoCopyMode(folder, True) def CopyTo(self, folder): self._DoCopyMode(folder, True) def test(): from win32com.client import Dispatch outlook = Dispatch("Outlook.Application") eid = outlook.Session.GetDefaultFolder(constants.olFolderInbox).EntryID store = MAPIMsgStore() for folder in store.GetFolderGenerator([eid,], True): print folder for msg in folder.GetMessageGenerator(): print msg store.Close() if __name__=='__main__': test() Index: addin.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/addin.py,v retrieving revision 1.13 retrieving revision 1.14 diff -C2 -d -r1.13 -r1.14 *** addin.py 24 Oct 2002 04:22:19 -0000 1.13 --- addin.py 24 Oct 2002 13:06:39 -0000 1.14 *************** *** 1,3 **** ! # Mark's Outlook addin import sys --- 1,3 ---- ! # SpamBayes Outlook Addin import sys *************** *** 117,123 **** def OnItemAdd(self, item): if self.manager.config.filter.enabled: ! mapi_message = self.manager.mapi.GetMessage(item.EntryID) import filter ! num_rules = filter.filter_message(mapi_message, self.manager) print "%d Spam rules fired for message '%s'" \ % (num_rules, item.Subject.encode("ascii", "replace")) --- 117,123 ---- def OnItemAdd(self, item): if self.manager.config.filter.enabled: ! msgstore_message = self.manager.message_store.GetMessage(item.EntryID) import filter ! 
num_rules = filter.filter_message(msgstore_message, self.manager) print "%d Spam rules fired for message '%s'" \ % (num_rules, item.Subject.encode("ascii", "replace")) *************** *** 142,148 **** return ! mapi_message = mgr.mapi.GetMessage(item.EntryID) ! stream = mgr.GetBayesStreamForMessage(mapi_message) ! prob, clues = mgr.score(stream, evidence=True) new_msg = app.CreateItem(0) body = ["

Spam Score: %g


" % prob] --- 142,147 ---- return ! msgstore_message = mgr.message_store.GetMessage(item.EntryID) ! prob, clues = mgr.score(msgstore_message, evidence=True) new_msg = app.CreateItem(0) body = ["

Spam Score: %g


" % prob] *************** *** 155,158 **** --- 154,164 ---- push(' %g\n' % prob) push("\n") + # Now the raw text of the message, as best we can + push("

Message Stream:


") + push("
\n")
+     txt = msgstore_message.GetEmailPackageObject().as_string(unixfrom=1)
+     import cgi
+     push(cgi.escape(txt, True))
+     push("
\n") body = ''.join(body) *************** *** 184,188 **** # Create our bayes manager import manager ! self.manager = manager.GetManager() assert self.manager.addin is None, "Should not already have an addin" self.manager.addin = self --- 190,194 ---- # Create our bayes manager import manager ! self.manager = manager.GetManager(application) assert self.manager.addin is None, "Should not already have an addin" self.manager.addin = self *************** *** 229,239 **** def UpdateFolderHooks(self): new_hooks = {} ! for mapi_folder in self.manager.BuildFolderList( self.manager.config.filter.folder_ids, self.manager.config.filter.include_sub): ! eid = mapi_folder.ID existing = self.folder_hooks.get(eid) if existing is None: ! folder = self.application.GetNamespace("MAPI").GetFolderFromID(eid) try: new_hook = DispatchWithEvents(folder.Items, FolderItemsEvent) --- 235,245 ---- def UpdateFolderHooks(self): new_hooks = {} ! for msgstore_folder in self.manager.message_store.GetFolderGenerator( self.manager.config.filter.folder_ids, self.manager.config.filter.include_sub): ! eid = msgstore_folder.GetOutlookEntryID() existing = self.folder_hooks.get(eid) if existing is None: ! folder = self.application.Session.GetFolderFromID(eid) try: new_hook = DispatchWithEvents(folder.Items, FolderItemsEvent) Index: classify.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/classify.py,v retrieving revision 1.8 retrieving revision 1.9 diff -C2 -d -r1.8 -r1.9 *** classify.py 24 Oct 2002 04:22:19 -0000 1.8 --- classify.py 24 Oct 2002 13:06:39 -0000 1.9 *************** *** 1,4 **** ! # Train a classifier from Outlook Mail folders ! # Author: Sean D. True, WebReply.Com # October, 2002 # Copyright PSF, license under the PSF license --- 1,4 ---- ! # Classify a folder with a field ! # Authors: Sean D. True, WebReply.Com, Mark Hammond. 
# October, 2002 # Copyright PSF, license under the PSF license *************** *** 10,52 **** def classify_folder( f, mgr, config, progress): ! messages = f.Messages ! pythoncom.CoInitialize() # We are called on a different thread. ! # We must get outlook in this thread - can't use the main thread :( ! outlook_ns = mgr.GetOutlookForCurrentThread().GetNamespace("MAPI") ! ! if not messages: ! progress.warning("Can't find messages in folder '%s'" % (f.Name,)) ! return ! message = messages.GetFirst() ! while not progress.stop_requested() and message: ! try: ! progress.tick() ! stream = mgr.GetBayesStreamForMessage(message) ! prob = mgr.score(stream) ! added_prop = False ! try: ! if outlook_ns is not None: ! outlookItem = outlook_ns.GetItemFromID(message.ID) ! format = 4 # 4=2 decimal, 3=1 decimal - index in "field chooser" combo when type=Number. ! prop = outlookItem.UserProperties.Add(config.field_name, constants.olNumber, True, format) ! prop.Value = prob ! outlookItem.Save() ! added_prop = True ! except "foo": # pythoncom.com_error, d: ! # Hrm - don't seem able to use outlook - use MAPI - but this ! # means the field doesn't automatically appear in the outlook "Field Chooser" ! # Tried explicity adding the field to the folder but still no go. ! added_prop = False ! if not added_prop: ! message.Fields.Add(config.field_name, 5, prob) ! ! message.Update() ! except pythoncom.com_error, d: ! progress.warning("Failed to get a message: %s" % (str(d),) ) ! message = messages.GetNext() # Called back from the dialog to do the actual training. def classifier(mgr, progress): - session = mgr.mapi config = mgr.config.classify if not config.folder_ids: --- 10,23 ---- def classify_folder( f, mgr, config, progress): ! for message in f.GetMessageGenerator(): ! if progress.stop_requested(): ! break ! progress.tick() ! prob = mgr.score(message) ! message.SetField(config.field_name, prob) ! message.Save() # Called back from the dialog to do the actual training. 
def classifier(mgr, progress): config = mgr.config.classify if not config.folder_ids: *************** *** 54,68 **** return progress.set_status("Counting messages") - folders = mgr.BuildFolderList(config.folder_ids, config.include_sub) num_msgs = 0 ! for f in folders: ! num_msgs += f.Messages.Count + 1 progress.set_max_ticks(num_msgs+3) ! for f in folders: ! progress.set_status("Processing folder '%s'" % (f.Name.encode("ascii", "replace"),)) classify_folder(f, mgr, config, progress) if progress.stop_requested(): return --- 25,39 ---- return progress.set_status("Counting messages") num_msgs = 0 ! for f in mgr.message_store.GetFolderGenerator(config.folder_ids, config.include_sub): ! num_msgs += f.count progress.set_max_ticks(num_msgs+3) ! for f in mgr.message_store.GetFolderGenerator(config.folder_ids, config.include_sub): ! progress.set_status("Processing folder '%s'" % (f.name,)) classify_folder(f, mgr, config, progress) if progress.stop_requested(): return + progress.set_status("Classified %d messages." % (num_msgs,)) Index: filter.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/filter.py,v retrieving revision 1.9 retrieving revision 1.10 diff -C2 -d -r1.9 -r1.10 *** filter.py 24 Oct 2002 04:22:19 -0000 1.9 --- filter.py 24 Oct 2002 13:06:39 -0000 1.10 *************** *** 5,20 **** import sys, os - from win32com.client import Dispatch, constants - import pythoncom import rule def filter_message(message, mgr): ! try: ! stream = mgr.GetBayesStreamForMessage(message) ! except pythoncom.com_error, d: ! print "Failed to get a message: %s" % (d,) ! return ! ! prob = mgr.score(stream) num_rules = 0 for rule in mgr.config.rules: --- 5,12 ---- import sys, os import rule def filter_message(message, mgr): ! prob = mgr.score(message) num_rules = 0 for rule in mgr.config.rules: *************** *** 32,40 **** only_unread = filter.only_unread num_messages = 0 ! 
for message in mgr.YieldMessageList(f): if progress.stop_requested(): break progress.tick() ! if only_unread and not message.Unread: continue if filter_message(message, mgr): --- 24,32 ---- only_unread = filter.only_unread num_messages = 0 ! for message in f.GetMessageGenerator(): if progress.stop_requested(): break progress.tick() ! if only_unread and not message.unread: continue if filter_message(message, mgr): *************** *** 50,61 **** progress.set_status("Counting messages") - folders = mgr.BuildFolderList(filter.folder_ids, filter.include_sub) num_msgs = 0 ! for f in folders: ! num_msgs += f.Messages.Count + 1 progress.set_max_ticks(num_msgs+3) num = 0 ! for f in folders: ! progress.set_status("Filtering folder '%s'" % (f.Name.encode("ascii", "replace"),)) num += filter_folder(f, mgr, progress, filter) if progress.stop_requested(): --- 42,52 ---- progress.set_status("Counting messages") num_msgs = 0 ! for f in mgr.message_store.GetFolderGenerator(filter.folder_ids, filter.include_sub): ! num_msgs += f.count progress.set_max_ticks(num_msgs+3) num = 0 ! for f in mgr.message_store.GetFolderGenerator(filter.folder_ids, filter.include_sub): ! 
progress.set_status("Filtering folder '%s'" % (f.name)) num += filter_folder(f, mgr, progress, filter) if progress.stop_requested(): Index: manager.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/manager.py,v retrieving revision 1.17 retrieving revision 1.18 diff -C2 -d -r1.17 -r1.18 *** manager.py 24 Oct 2002 04:58:52 -0000 1.17 --- manager.py 24 Oct 2002 13:06:39 -0000 1.18 *************** *** 11,14 **** --- 11,15 ---- import config + import msgstore try: *************** *** 64,68 **** self.mapi = win32com.client.Dispatch("MAPI.Session") self.mapi.Logon(None, None, False, False) - self._tls = {thread.get_ident(): {"outlook": outlook} } self.outlook = outlook os.chdir(cwd) --- 65,68 ---- *************** *** 70,119 **** import_core_spambayes_stuff(self.ini_filename) self.LoadBayes() # Outlook gives us thread grief :( def WorkerThreadStarting(self): pythoncom.CoInitialize() - self._tls[thread.get_ident()] = {} def WorkerThreadEnding(self): - assert self._tls.has_key(thread.get_ident()), \ - "WorkerThreadStarting hasn't been called for this thread" - del self._tls[thread.get_ident()] pythoncom.CoUninitialize() - def GetOutlookForCurrentThread(self): - assert self._tls.has_key(thread.get_ident()), \ - "WorkerThreadStarting hasn't been called for this thread" - existing = self._tls[thread.get_ident()].get("outlook") - if not existing: - existing = win32com.client.Dispatch("Outlook.Application") - self._tls[thread.get_ident()]["outlook"] = existing - return existing - - def GetBayesStreamForMessage(self, message): - # Note - caller must catch COM error - import email - - headers = message.Fields[0x7D001E].Value - headers = headers.encode('ascii', 'replace') - try: - body = message.Fields[0x1013001E].Value # HTMLBody field - body = body.encode("ascii", "replace") + "\n" - except pythoncom.error: - body = "" - body += message.Text.encode("ascii", "replace") - - # XXX If this was originally a MIME 
msg, we're hosed at this point -- - # the boundary tag in the headers doesn't exist in the body, and - # the msg is simply ill-formed. The miserable hack here simply - # squashes the text part (if any) and the HTML part (if any) together, - # and strips MIME info from the original headers. - msg = email.message_from_string(headers + '\n' + body) - if msg.has_key('content-type'): - del msg['content-type'] - if msg.has_key('content-transfer-encoding'): - del msg['content-transfer-encoding'] - return msg - def LoadBayes(self): if not os.path.exists(self.ini_filename): --- 70,82 ---- import_core_spambayes_stuff(self.ini_filename) self.LoadBayes() + self.message_store = msgstore.MAPIMsgStore(outlook) # Outlook gives us thread grief :( def WorkerThreadStarting(self): pythoncom.CoInitialize() def WorkerThreadEnding(self): pythoncom.CoUninitialize() def LoadBayes(self): if not os.path.exists(self.ini_filename): *************** *** 123,126 **** --- 86,92 ---- bayes = None try: + # Ooops - Tim did it another way - checking this in before I get more conficts! + ## from Options import options + ## options.mergefiles([self.ini_filename]) bayes = cPickle.load(open(self.bayes_filename, 'rb')) print "Loaded bayes database from '%s'" % (self.bayes_filename,) *************** *** 169,172 **** --- 135,141 ---- def InitNewBayes(self): + # Ooops - Tim did it another way - checking this in before I get more conficts! + ## from Options import options + ## options.mergefiles([self.ini_filename]) self.bayes = bayes_classifier.Bayes() self.bayes_dirty = True *************** *** 203,246 **** self.bayes = None self.config = None ! self._tls = None ! ! def BuildFolderList(self, folder_ids, include_sub): ! ret = {} ! for id in folder_ids: ! subs = [] ! try: ! f = self.mapi.GetFolder(id) ! if include_sub: ! sub_ids = [] ! subs = f.Folders ! for i in range(1, subs.Count): ! sub_ids.append(subs.Item(i).ID) ! subs = self.BuildFolderList(sub_ids, True) ! except pythoncom.error: ! continue ! 
ret[id] = f ! for sub in subs: ! ret[sub.ID] = sub ! return ret.values() ! ! def YieldMessageList(self, folder): ! messages = folder.Messages ! if not messages: ! print "Can't find messages in folder '%s'" % (folder.Name,) ! return ! message = messages.GetFirst() ! while message is not None: ! yield message ! message = messages.GetNext() def score(self, msg, evidence=False): return self.bayes.spamprob(bayes_tokenize(msg), evidence) _mgr = None ! def GetManager(verbose=1): global _mgr if _mgr is None: ! _mgr = BayesManager(verbose=verbose) # If requesting greater verbosity, honour it if verbose > _mgr.verbose: --- 172,192 ---- self.bayes = None self.config = None ! if self.message_store is not None: ! self.message_store.Close() ! self.message_store = None def score(self, msg, evidence=False): + email = msg.GetEmailPackageObject() + # As Tim suggested in email, score should move to range(100) + # This is probably a good place to do it - anyone who wants the real + # float value can look at the "clues" return self.bayes.spamprob(bayes_tokenize(msg), evidence) _mgr = None ! def GetManager(outlook = None, verbose=1): global _mgr if _mgr is None: ! _mgr = BayesManager(outlook=outlook, verbose=verbose) # If requesting greater verbosity, honour it if verbose > _mgr.verbose: Index: rule.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/rule.py,v retrieving revision 1.4 retrieving revision 1.5 diff -C2 -d -r1.4 -r1.5 *** rule.py 20 Oct 2002 18:53:06 -0000 1.4 --- rule.py 24 Oct 2002 13:06:39 -0000 1.5 *************** *** 3,8 **** import time - MAPI_E_NOT_FOUND = -2147221233 - class Rule: def __init__(self): --- 3,6 ---- *************** *** 14,18 **** self.flag_message = True self.write_field = True ! self.write_field_name = "SpamProb" self.folder_id = "" --- 12,16 ---- self.flag_message = True self.write_field = True ! 
self.write_field_name = "SpamScore" self.folder_id = "" *************** *** 34,79 **** return "You must specify the field name to create" - def _GetFolder(self, mgr): - try: - return mgr.mapi.GetFolder(self.folder_id) - except pythoncom.com_error: - return None - def Act(self, mgr, msg, prob): if mgr.verbose > 1: ! print "Rule '%s': %.2f->%.2f (%.2f) (%s)" % (self.name, self.min, self.max, prob, msg.Subject[:20].encode("ascii", "replace")) if prob < self.min or prob > self.max: return False - # Do mods before we move. - dirty = False - outlook_ns = mgr.GetOutlookForCurrentThread().GetNamespace("MAPI") - try: - outlook_message = outlook_ns.GetItemFromID(msg.ID) - except pythoncom.com_error, (hr, desc, exc, arg): - if not exc or exc[5] != MAPI_E_NOT_FOUND: - raise - print "Warning: Can't open the message - it has probably been moved" - return False ! if self.flag_message: ! outlook_message.FlagRequest = "Check Spam" ! outlook_message.FlagStatus = constants.olFlagMarked ! dirty = True if self.write_field: ! format = 4 # 4=2 decimal, 3=1 decimal - index in "field chooser" combo when type=Number. ! prop = outlook_message.UserProperties.Add(self.write_field_name, constants.olNumber, True, format) ! prop.Value = prob ! dirty = True ! if dirty: ! outlook_message.Save() if self.action == "None": pass elif self.action == "Copy": ! outlook_message.Copy(outlook_ns.GetFolderFromID(self.folder_id)) elif self.action == "Move": ! outlook_message.Move(outlook_ns.GetFolderFromID(self.folder_id)) else: ! print "Eeek - bad action", self.action return True --- 32,60 ---- return "You must specify the field name to create" def Act(self, mgr, msg, prob): if mgr.verbose > 1: ! print "Rule '%s': %.2f->%.2f (%.2f) (%s)" % (self.name, self.min, self.max, prob, repr(msg)) if prob < self.min or prob > self.max: return False ! ## if self.flag_message: ! ## outlook_message.FlagRequest = "Check Spam" ! ## outlook_message.FlagStatus = constants.olFlagMarked ! ## dirty = True ! 
if self.write_field: ! msg.SetField(self.write_field_name, prob) ! msg.Save() if self.action == "None": pass elif self.action == "Copy": ! dest_folder = mgr.message_store.GetFolder(self.folder_id) ! msg.CopyTo(dest_folder) elif self.action == "Move": ! dest_folder = mgr.message_store.GetFolder(self.folder_id) ! msg.MoveTo(dest_folder) else: ! assert 0, "Eeek - bad action '%r'" % (self.action,) return True Index: train.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/train.py,v retrieving revision 1.5 retrieving revision 1.6 diff -C2 -d -r1.5 -r1.6 *** train.py 20 Oct 2002 23:51:04 -0000 1.5 --- train.py 24 Oct 2002 13:06:39 -0000 1.6 *************** *** 1,38 **** # Train a classifier from Outlook Mail folders ! # Author: Sean D. True, WebReply.Com, Mark Hammond # October, 2002 # Copyright PSF, license under the PSF license ! import sys, os, os.path, getopt, cPickle, string ! import win32com.client ! import pythoncom ! import win32con def train_folder( f, isspam, mgr, progress): from tokenizer import tokenize num = 0 ! for message in mgr.YieldMessageList(f): if progress.stop_requested(): break progress.tick() ! try: ! # work with MAPI until we work out how to get headers from outlook ! message = mgr.mapi.GetMessage(message.ID) ! stream = mgr.GetBayesStreamForMessage(message) ! except pythoncom.com_error, d: ! progress.warning("failed to get a message") ! print "Failed to get a message", d ! continue mgr.bayes.learn(tokenize(stream), isspam, False) num += 1 ! print "Trained over", num, "in folder", f.Name # Called back from the dialog to do the actual training. def trainer(mgr, progress): - pythoncom.CoInitialize() config = mgr.config mgr.InitNewBayes() bayes = mgr.bayes - session = mgr.mapi if not config.training.ham_folder_ids or not config.training.spam_folder_ids: --- 1,26 ---- # Train a classifier from Outlook Mail folders ! # Authors: Sean D. 
True, WebReply.Com, Mark Hammond # October, 2002 # Copyright PSF, license under the PSF license ! import sys, os def train_folder( f, isspam, mgr, progress): from tokenizer import tokenize num = 0 ! for message in f.GetMessageGenerator(): if progress.stop_requested(): break progress.tick() ! stream = message.GetEmailPackageObject() mgr.bayes.learn(tokenize(stream), isspam, False) num += 1 ! print "Trained over", num, "in folder", f.name # Called back from the dialog to do the actual training. def trainer(mgr, progress): config = mgr.config mgr.InitNewBayes() bayes = mgr.bayes if not config.training.ham_folder_ids or not config.training.spam_folder_ids: *************** *** 40,58 **** return progress.set_status("Counting messages") ! ham_folders = mgr.BuildFolderList(config.training.ham_folder_ids, config.training.ham_include_sub) ! spam_folders = mgr.BuildFolderList(config.training.spam_folder_ids, config.training.ham_include_sub) num_msgs = 0 ! for f in ham_folders + spam_folders: ! num_msgs += f.Messages.Count + 1 progress.set_max_ticks(num_msgs+3) ! for f in ham_folders: ! progress.set_status("Processing good folder '%s'" % (f.Name.encode("ascii", "replace"),)) train_folder(f, 0, mgr, progress) if progress.stop_requested(): return ! for f in spam_folders: ! progress.set_status("Processing spam folder '%s'" % (f.Name.encode("ascii", "replace"),)) train_folder(f, 1, mgr, progress) if progress.stop_requested(): --- 28,48 ---- return progress.set_status("Counting messages") ! num_msgs = 0 ! for f in mgr.message_store.GetFolderGenerator(config.training.ham_folder_ids, config.training.ham_include_sub): ! num_msgs += f.count ! for f in mgr.message_store.GetFolderGenerator(config.training.spam_folder_ids, config.training.spam_include_sub): ! num_msgs += f.count ! progress.set_max_ticks(num_msgs+3) ! for f in mgr.message_store.GetFolderGenerator(config.training.ham_folder_ids, config.training.ham_include_sub): ! 
progress.set_status("Processing good folder '%s'" % (f.name,)) train_folder(f, 0, mgr, progress) if progress.stop_requested(): return ! for f in mgr.message_store.GetFolderGenerator(config.training.spam_folder_ids, config.training.spam_include_sub): ! progress.set_status("Processing spam folder '%s'" % (f.name,)) train_folder(f, 1, mgr, progress) if progress.stop_requested(): From mhammond@users.sourceforge.net Fri Oct 25 01:53:00 2002 From: mhammond@users.sourceforge.net (Mark Hammond) Date: Thu, 24 Oct 2002 17:53:00 -0700 Subject: [Spambayes-checkins] spambayes/Outlook2000 msgstore.py,1.2,1.3 Message-ID: Update of /cvsroot/spambayes/spambayes/Outlook2000 In directory usw-pr-cvs1:/tmp/cvs-serv9869 Modified Files: msgstore.py Log Message: Reliable header and body extraction. Now only fails for me with forwarded messages - we still skip the text of attached forwards. Not even clear if we should include that text, but we could if desired. Index: msgstore.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/msgstore.py,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** msgstore.py 24 Oct 2002 21:52:57 -0000 1.2 --- msgstore.py 25 Oct 2002 00:52:57 -0000 1.3 *************** *** 79,82 **** --- 79,83 ---- from win32com.mapi import mapi from win32com.mapi.mapitags import * + import pythoncom MESSAGE_MOVE = 0x1 # from MAPIdefs.h *************** *** 154,162 **** return MAPIMsgStoreMsg(self, folder, message_id, unread) ! def GetOutlookObjectFromID(self, eid): ! if self.outlook is None: ! from win32com.client import Dispatch ! self.outlook = Dispatch("Outlook.Application") ! return self.outlook.Session.GetItemFromID(mapi.HexFromBin(eid)) --- 155,164 ---- return MAPIMsgStoreMsg(self, folder, message_id, unread) ! ## # Currently no need for this ! ## def GetOutlookObjectFromID(self, eid): ! ## if self.outlook is None: ! ## from win32com.client import Dispatch ! 
## self.outlook = Dispatch("Outlook.Application") ! ## return self.outlook.Session.GetItemFromID(mapi.HexFromBin(eid)) *************** *** 212,235 **** return mapi.HexFromBin(self.id) def _GetMessageText(self): self._EnsureObject() ! prop_ids = PR_TRANSPORT_MESSAGE_HEADERS_A, PR_BODY, MYPR_BODY_HTML_A hr, data = self.mapi_object.GetProps(prop_ids,0) headers = data[0][1] ! if type(headers) != type(''): headers = '' # If no field will be an int error (the tag([0]) would tell us, but this is easier) ! body = data[1][1] ! if type(body) != type(''): body= '' # If no field will be an int error (the tag([0]) would tell us, but this is easier) ! # Messages with "text/html" and "multipart/*" give grief. ! # In some cases, the HTML body appears *only* accessible via Outlook :( Outlook is slow, so try and avoid ! # Tried using the "_W" props, and indeed tried dumping every prop - these HTML messages are hidden from Mapi! ! if PROP_TYPE(data[2][0])==PT_ERROR: ! # No HTML body - see if one of our problem children. ! html = "" ! lo_headers = headers.lower() ! if lo_headers.find("content-type: text/html")>=0 or lo_headers.find("content-type: multipart/")>=0: ! outlook_msg = self.msgstore.GetOutlookObjectFromID(self.id) ! html = outlook_msg.HTMLBody.encode("ascii", "replace") ! else: ! html = data[2][1] return headers + "\n" + html + "\n" + body --- 214,258 ---- return mapi.HexFromBin(self.id) + def _GetPropFromStream(self, prop_id): + try: + stream = self.mapi_object.OpenProperty(prop_id, pythoncom.IID_IStream, 0, 0) + chunks = [] + while 1: + chunk = stream.Read(1024) + if not chunk: break + chunks.append(chunk) + return "".join(chunks) + except pythoncom.com_error, d: + print "Error getting property from stream", d + return "" + + def _GetPotentiallyLargeStringProp(self, prop_id, row): + got_tag, got_val = row + if PROP_TYPE(got_tag)==PT_ERROR: + ret = "" + if got_val==mapi.MAPI_E_NOT_FOUND: + pass # No body for this message. 
+ elif got_val==mapi.MAPI_E_NOT_ENOUGH_MEMORY: + # Too big for simple properties - get via a stream + ret = self._GetPropFromStream(prop_id) + else: + tag_name = mapiutil.GetPropTagName(prop_id) + err_string = mapiutil.GetScodeString(got_val) + print "Warning - failed to get property %s: %s" % (tag_name, err_string) + else: + ret = got_val + return ret + def _GetMessageText(self): + # This is finally reliable. The only messages this now fails for + # are for "forwarded" messages, where the forwards are actually + # in an attachment. Later. self._EnsureObject() ! prop_ids = PR_TRANSPORT_MESSAGE_HEADERS_A, PR_BODY_A, MYPR_BODY_HTML_A hr, data = self.mapi_object.GetProps(prop_ids,0) headers = data[0][1] ! headers = self._GetPotentiallyLargeStringProp(prop_ids[0], data[0]) ! body = self._GetPotentiallyLargeStringProp(prop_ids[1], data[1]) ! html = self._GetPotentiallyLargeStringProp(prop_ids[2], data[2]) return headers + "\n" + html + "\n" + body From mhammond@users.sourceforge.net Fri Oct 25 02:31:44 2002 From: mhammond@users.sourceforge.net (Mark Hammond) Date: Thu, 24 Oct 2002 18:31:44 -0700 Subject: [Spambayes-checkins] spambayes/Outlook2000 manager.py,1.18,1.19 Message-ID: Update of /cvsroot/spambayes/spambayes/Outlook2000 In directory usw-pr-cvs1:/tmp/cvs-serv17692 Modified Files: manager.py Log Message: I'm a fool - pass the right object to Bayes and it works a treat Index: manager.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/manager.py,v retrieving revision 1.18 retrieving revision 1.19 diff -C2 -d -r1.18 -r1.19 *** manager.py 24 Oct 2002 13:06:39 -0000 1.18 --- manager.py 25 Oct 2002 01:31:42 -0000 1.19 *************** *** 181,185 **** # This is probably a good place to do it - anyone who wants the real # float value can look at the "clues" ! 
return self.bayes.spamprob(bayes_tokenize(msg), evidence) _mgr = None --- 181,185 ---- # This is probably a good place to do it - anyone who wants the real # float value can look at the "clues" ! return self.bayes.spamprob(bayes_tokenize(email), evidence) _mgr = None From tim_one@users.sourceforge.net Fri Oct 25 02:25:58 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Thu, 24 Oct 2002 18:25:58 -0700 Subject: [Spambayes-checkins] spambayes/Outlook2000 msgstore.py,1.4,1.5 train.py,1.6,1.7 Message-ID: Update of /cvsroot/spambayes/spambayes/Outlook2000 In directory usw-pr-cvs1:/tmp/cvs-serv13969 Modified Files: msgstore.py train.py Log Message: Whitespace normalization. Index: msgstore.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/msgstore.py,v retrieving revision 1.4 retrieving revision 1.5 diff -C2 -d -r1.4 -r1.5 *** msgstore.py 25 Oct 2002 01:16:21 -0000 1.4 --- msgstore.py 25 Oct 2002 01:25:56 -0000 1.5 *************** *** 69,73 **** raise NotImplementedError ! # Our MAPI implementation import warnings --- 69,73 ---- raise NotImplementedError ! # Our MAPI implementation import warnings *************** *** 92,96 **** cwd = os.getcwd() mapi.MAPIInitialize(None) ! logonFlags = mapi.MAPI_NO_MAIL | mapi.MAPI_EXTENDED | mapi.MAPI_USE_DEFAULT self.session = mapi.MAPILogonEx(0, None, None, logonFlags) self._FindDefaultMessageStore() --- 92,96 ---- cwd = os.getcwd() mapi.MAPIInitialize(None) ! logonFlags = mapi.MAPI_NO_MAIL | mapi.MAPI_EXTENDED | mapi.MAPI_USE_DEFAULT self.session = mapi.MAPILogonEx(0, None, None, logonFlags) self._FindDefaultMessageStore() *************** *** 124,128 **** for store_folder in self._GetSubFolderIter(folder): yield store_folder ! def GetFolderGenerator(self, folder_ids, include_sub): for folder_id in folder_ids: --- 124,128 ---- for store_folder in self._GetSubFolderIter(folder): yield store_folder ! 
def GetFolderGenerator(self, folder_ids, include_sub): for folder_id in folder_ids: *************** *** 153,157 **** unread = data[1][1] folder = MAPIMsgStoreFolder(self, folder_eid, "Unknown - temp message", -1) ! return MAPIMsgStoreMsg(self, folder, message_id, unread) ## # Currently no need for this --- 153,157 ---- unread = data[1][1] folder = MAPIMsgStoreFolder(self, folder_eid, "Unknown - temp message", -1) ! return MAPIMsgStoreMsg(self, folder, message_id, unread) ## # Currently no need for this *************** *** 313,317 **** source_folder.CopyMessages( (self.id,), None, dest_folder, 0, None, flags) self.folder = self.msgstore.GetFolder(mapi.HexFromBin(folder.id)) ! def MoveTo(self, folder): self._DoCopyMode(folder, True) --- 313,317 ---- source_folder.CopyMessages( (self.id,), None, dest_folder, 0, None, flags) self.folder = self.msgstore.GetFolder(mapi.HexFromBin(folder.id)) ! def MoveTo(self, folder): self._DoCopyMode(folder, True) *************** *** 331,337 **** print msg store.Close() ! if __name__=='__main__': test() - --- 331,336 ---- print msg store.Close() ! if __name__=='__main__': test() Index: train.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/train.py,v retrieving revision 1.6 retrieving revision 1.7 diff -C2 -d -r1.6 -r1.7 *** train.py 24 Oct 2002 13:06:39 -0000 1.6 --- train.py 25 Oct 2002 01:25:56 -0000 1.7 *************** *** 34,38 **** for f in mgr.message_store.GetFolderGenerator(config.training.spam_folder_ids, config.training.spam_include_sub): num_msgs += f.count ! progress.set_max_ticks(num_msgs+3) --- 34,38 ---- for f in mgr.message_store.GetFolderGenerator(config.training.spam_folder_ids, config.training.spam_include_sub): num_msgs += f.count ! 
progress.set_max_ticks(num_msgs+3) From tim_one@users.sourceforge.net Fri Oct 25 02:23:19 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Thu, 24 Oct 2002 18:23:19 -0700 Subject: [Spambayes-checkins] spambayes/Outlook2000 addin.py,1.14,1.15 Message-ID: Update of /cvsroot/spambayes/spambayes/Outlook2000 In directory usw-pr-cvs1:/tmp/cvs-serv12282 Modified Files: addin.py Log Message: ShowClues(): cgi.escape was already imported earlier in the function. Index: addin.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/addin.py,v retrieving revision 1.14 retrieving revision 1.15 diff -C2 -d -r1.14 -r1.15 *** addin.py 24 Oct 2002 13:06:39 -0000 1.14 --- addin.py 25 Oct 2002 01:23:17 -0000 1.15 *************** *** 158,163 **** push("
\n")
      txt = msgstore_message.GetEmailPackageObject().as_string(unixfrom=1)
!     import cgi
!     push(cgi.escape(txt, True))
      push("
\n") body = ''.join(body) --- 158,162 ---- push("
\n")
      txt = msgstore_message.GetEmailPackageObject().as_string(unixfrom=1)
!     push(escape(txt, True))
      push("
\n") body = ''.join(body) From mhammond@users.sourceforge.net Fri Oct 25 02:16:23 2002 From: mhammond@users.sourceforge.net (Mark Hammond) Date: Thu, 24 Oct 2002 18:16:23 -0700 Subject: [Spambayes-checkins] spambayes/Outlook2000 msgstore.py,1.3,1.4 Message-ID: Update of /cvsroot/spambayes/spambayes/Outlook2000 In directory usw-pr-cvs1:/tmp/cvs-serv7277 Modified Files: msgstore.py Log Message: Was trying to be too clever :( Python's MAPI didn't do the Py_True/Py_False thing, hence MAPI bools came back as 0/-1. Fixed in win32all sources, but this makes it work everywhere. Index: msgstore.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/msgstore.py,v retrieving revision 1.3 retrieving revision 1.4 diff -C2 -d -r1.3 -r1.4 *** msgstore.py 25 Oct 2002 00:52:57 -0000 1.3 --- msgstore.py 25 Oct 2002 01:16:21 -0000 1.4 *************** *** 208,212 **** def __repr__(self): ! urs = ["read", "unread"][self.unread] return "<%s, (%s) id=%s>" % (self.__class__.__name__, urs, mapi.HexFromBin(self.id)) --- 208,215 ---- def __repr__(self): ! if self.unread: ! urs = "read" ! else: ! urs = "unread" return "<%s, (%s) id=%s>" % (self.__class__.__name__, urs, mapi.HexFromBin(self.id)) From tim_one@users.sourceforge.net Thu Oct 24 22:52:59 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Thu, 24 Oct 2002 14:52:59 -0700 Subject: [Spambayes-checkins] spambayes/Outlook2000 msgstore.py,1.1,1.2 Message-ID: Update of /cvsroot/spambayes/spambayes/Outlook2000 In directory usw-pr-cvs1:/tmp/cvs-serv9626 Modified Files: msgstore.py Log Message: Can't suppress FutureWarning unless using a version of Python in which it exists. 
Index: msgstore.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/msgstore.py,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** msgstore.py 24 Oct 2002 13:06:39 -0000 1.1 --- msgstore.py 24 Oct 2002 21:52:57 -0000 1.2 *************** *** 72,76 **** # Our MAPI implementation import warnings ! warnings.filterwarnings("ignore", category=FutureWarning, append=1) from win32com.client import Dispatch, constants --- 72,78 ---- # Our MAPI implementation import warnings ! if sys.version_info >= (2, 3): ! # sick off the new hex() warnings! ! warnings.filterwarnings("ignore", category=FutureWarning, append=1) from win32com.client import Dispatch, constants *************** *** 307,309 **** if __name__=='__main__': test() ! \ No newline at end of file --- 309,311 ---- if __name__=='__main__': test() ! From mhammond@users.sourceforge.net Fri Oct 25 16:20:29 2002 From: mhammond@users.sourceforge.net (Mark Hammond) Date: Fri, 25 Oct 2002 08:20:29 -0700 Subject: [Spambayes-checkins] spambayes/Outlook2000/dialogs TrainingDialog.py,1.4,1.5 Message-ID: Update of /cvsroot/spambayes/spambayes/Outlook2000/dialogs In directory usw-pr-cvs1:/tmp/cvs-serv10800/dialogs Modified Files: TrainingDialog.py Log Message: First step towards incremental training - keep a "message database" (simply a pickled dictionary of [messageid]=is_spam). Train dialog now has a "rebuild" checkbox, defaulting to off - if off, only messages not already in the database are added (and correctly untrained if previously in the incorrect category) This change will force you to do a full retrain - sorry. 
Index: TrainingDialog.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/dialogs/TrainingDialog.py,v retrieving revision 1.4 retrieving revision 1.5 diff -C2 -d -r1.4 -r1.5 *** TrainingDialog.py 25 Oct 2002 06:58:20 -0000 1.4 --- TrainingDialog.py 25 Oct 2002 15:20:26 -0000 1.5 *************** *** 20,23 **** --- 20,24 ---- IDC_STATIC_SPAM = 1003 IDC_BROWSE_SPAM = 1004 + IDC_BUT_REBUILD = 1005 from AsyncDialog import IDC_START, IDC_PROGRESS, IDC_PROGRESS_TEXT, AsyncDialogBase *************** *** 32,36 **** dt = [ # Dialog itself. ! ["Training", (0, 0, 241, 118), style, None, (8, "MS Sans Serif")], # Children [STATIC, ham_title, -1, ( 7, 6, 131, 11), cs ], --- 33,37 ---- dt = [ # Dialog itself. ! ["Training", (0, 0, 241, 130), style, None, (8, "MS Sans Serif")], # Children [STATIC, ham_title, -1, ( 7, 6, 131, 11), cs ], *************** *** 41,53 **** [STATIC, "", IDC_STATIC_SPAM, ( 7, 47, 167, 12), cs | win32con.SS_SUNKEN | win32con.SS_LEFTNOWORDWRAP | win32con.SS_CENTERIMAGE], [BUTTON, 'Brow&se', IDC_BROWSE_SPAM, (184, 47, 50, 14), cs | win32con.BS_PUSHBUTTON | win32con.WS_TABSTOP], ! [BUTTON, process_start_text, IDC_START, ( 7, 97, 50, 14), cs | win32con.BS_PUSHBUTTON | win32con.WS_TABSTOP], ! ["msctls_progress32", '', IDC_PROGRESS, ( 7, 68, 166, 11), cs | win32con.WS_BORDER], ! [STATIC, '', IDC_PROGRESS_TEXT, ( 7, 84, 227, 10), cs ], ! [BUTTON, 'Close', win32con.IDOK, (184, 97, 50, 14), cs | win32con.BS_DEFPUSHBUTTON | win32con.WS_TABSTOP], ] ! 
disable_while_running_ids = [IDC_BROWSE_HAM, IDC_BROWSE_SPAM, win32con.IDOK] def __init__ (self, mgr, trainer): --- 42,55 ---- [STATIC, "", IDC_STATIC_SPAM, ( 7, 47, 167, 12), cs | win32con.SS_SUNKEN | win32con.SS_LEFTNOWORDWRAP | win32con.SS_CENTERIMAGE], [BUTTON, 'Brow&se', IDC_BROWSE_SPAM, (184, 47, 50, 14), cs | win32con.BS_PUSHBUTTON | win32con.WS_TABSTOP], + [BUTTON, 'Rebuild entire database',IDC_BUT_REBUILD, ( 7, 67, 174, 10), cs | win32con.BS_AUTOCHECKBOX | win32con.WS_TABSTOP], ! [BUTTON, process_start_text, IDC_START, ( 7, 109, 50, 14), cs | win32con.BS_DEFPUSHBUTTON | win32con.WS_TABSTOP], ! ["msctls_progress32", '', IDC_PROGRESS, ( 7, 82, 166, 11), cs | win32con.WS_BORDER], ! [STATIC, '', IDC_PROGRESS_TEXT, ( 7, 98, 227, 10), cs ], ! [BUTTON, 'Close', win32con.IDOK, (184, 109, 50, 14), cs | win32con.BS_PUSHBUTTON | win32con.WS_TABSTOP], ] ! disable_while_running_ids = [IDC_BROWSE_HAM, IDC_BROWSE_SPAM, IDC_BUT_REBUILD, win32con.IDOK] def __init__ (self, mgr, trainer): *************** *** 98,105 **** self.UpdateStatus() def _DoProcess(self): self.mgr.WorkerThreadStarting() try: ! self.trainer(self.mgr, self.progress) finally: self.mgr.WorkerThreadEnding() --- 100,111 ---- self.UpdateStatus() + def StartProcess(self): + self.rebuild = self.GetDlgItem(IDC_BUT_REBUILD).GetCheck() != 0 + return AsyncDialogBase.StartProcess(self) + def _DoProcess(self): self.mgr.WorkerThreadStarting() try: ! 
self.trainer(self.mgr, self.progress, self.rebuild) finally: self.mgr.WorkerThreadEnding() From mhammond@users.sourceforge.net Fri Oct 25 16:20:30 2002 From: mhammond@users.sourceforge.net (Mark Hammond) Date: Fri, 25 Oct 2002 08:20:30 -0700 Subject: [Spambayes-checkins] spambayes/Outlook2000 manager.py,1.22,1.23 train.py,1.8,1.9 Message-ID: Update of /cvsroot/spambayes/spambayes/Outlook2000 In directory usw-pr-cvs1:/tmp/cvs-serv10800 Modified Files: manager.py train.py Log Message: First step towards incremental training - keep a "message database" (simply a pickled dictionary of [messageid]=is_spam). Train dialog now has a "rebuild" checkbox, defaulting to off - if off, only messages not already in the database are added (and correctly untrained if previously in the incorrect category) This change will force you to do a full retrain - sorry. Index: manager.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/manager.py,v retrieving revision 1.22 retrieving revision 1.23 diff -C2 -d -r1.22 -r1.23 *** manager.py 25 Oct 2002 13:12:19 -0000 1.22 --- manager.py 25 Oct 2002 15:20:25 -0000 1.23 *************** *** 52,55 **** --- 52,56 ---- self.ini_filename = config_base + "_bayes_customize.ini" self.bayes_filename = config_base + "_bayes_database.pck" + self.message_db_filename = config_base + "_message_database.pck" self.config_filename = config_base + "_configuration.pck" *************** *** 76,80 **** "database '%s' can be opened or created" % ( self.ini_filename, self.bayes_filename)) ! bayes = None try: bayes = cPickle.load(open(self.bayes_filename, 'rb')) --- 77,81 ---- "database '%s' can be opened or created" % ( self.ini_filename, self.bayes_filename)) ! bayes = message_db = None try: bayes = cPickle.load(open(self.bayes_filename, 'rb')) *************** *** 86,96 **** import traceback traceback.print_exc() ! 
if bayes is None: self.InitNewBayes() bayes = self.bayes if self.verbose: print ("Bayes database initialized with " "%d spam and %d good messages" % (bayes.nspam, bayes.nham)) self.bayes = bayes self.bayes_dirty = False --- 87,112 ---- import traceback traceback.print_exc() ! try: ! message_db = cPickle.load(open(self.message_db_filename, 'rb')) ! print "Loaded message database from '%s'" % (self.message_db_filename,) ! except IOError: ! pass ! except: ! print "Failed to load bayes message database" ! import traceback ! traceback.print_exc() ! if bayes is None or message_db is None: ! print "Either bayes database or message database is missing - creating new" self.InitNewBayes() bayes = self.bayes + message_db = self.message_db if self.verbose: print ("Bayes database initialized with " "%d spam and %d good messages" % (bayes.nspam, bayes.nham)) + if len(message_db) != bayes.nham + bayes.nspam: + print "*** - message database only has %d messages - bayes has %d - something is screwey" % \ + (len(message_db), bayes.nham + bayes.nspam) self.bayes = bayes + self.message_db = message_db self.bayes_dirty = False *************** *** 125,128 **** --- 141,145 ---- def InitNewBayes(self): self.bayes = bayes_classifier.Bayes() + self.message_db = {} # OK, so its not quite a DB yet self.bayes_dirty = True *************** *** 134,137 **** --- 151,158 ---- print " ->", self.bayes_filename cPickle.dump(bayes, open(self.bayes_filename,"wb"), 1) + if self.verbose: + print " ->", self.message_db_filename + cPickle.dump(self.message_db, open(self.message_db_filename,"wb"), 1) + self.bayes_dirty = False def SaveConfig(self): *************** *** 146,150 **** if self.bayes_dirty: self.SaveBayes() - self.bayes_dirty = False else: print "Bayes database is not dirty - not writing" --- 167,170 ---- Index: train.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/train.py,v retrieving revision 1.8 retrieving revision 1.9 
diff -C2 -d -r1.8 -r1.9 *** train.py 25 Oct 2002 13:10:15 -0000 1.8 --- train.py 25 Oct 2002 15:20:25 -0000 1.9 *************** *** 6,12 **** import sys, os, traceback ! def train_folder( f, isspam, mgr, progress): from tokenizer import tokenize ! num = 0 for message in f.GetMessageGenerator(): if progress.stop_requested(): --- 6,36 ---- import sys, os, traceback ! def train_message(msg, is_spam, mgr): ! # Train an individual message. ! # Returns True if newly added (message will be correctly ! # untrained if it was in the wrong category), False if already ! # in the correct category. Catch your own damn exceptions. from tokenizer import tokenize ! stream = msg.GetEmailPackageObject() ! tokens = tokenize(stream) ! # Handle we may have already been trained. ! was_spam = mgr.message_db.get(msg.id) ! if was_spam is None: ! # never previously trained. ! pass ! elif was_spam == is_spam: ! # Already in DB - do nothing (full retrain will wipe msg db) ! # leave now. ! return False ! else: ! mgr.bayes.unlearn(tokens, was_spam, False) ! # OK - setup the new data. ! mgr.bayes.learn(tokens, is_spam, False) ! mgr.message_db[msg.id] = is_spam ! mgr.bayes_dirty = True ! return True ! ! def train_folder( f, isspam, mgr, progress): ! num = num_added = 0 for message in f.GetMessageGenerator(): if progress.stop_requested(): *************** *** 14,29 **** progress.tick() try: ! stream = message.GetEmailPackageObject() ! mgr.bayes.learn(tokenize(stream), isspam, False) except: print "Error training message '%s'" % (message,) traceback.print_exc() num += 1 ! print "Trained over", num, "in folder", f.name # Called back from the dialog to do the actual training. ! def trainer(mgr, progress): config = mgr.config ! mgr.InitNewBayes() bayes = mgr.bayes --- 38,54 ---- progress.tick() try: ! if train_message(message, isspam, mgr): ! num_added += 1 except: print "Error training message '%s'" % (message,) traceback.print_exc() num += 1 ! 
 print "Checked", num, "in folder", f.name, "-", num_added, "new entries found." # Called back from the dialog to do the actual training. ! def trainer(mgr, progress, rebuild): config = mgr.config ! if rebuild: ! mgr.InitNewBayes() bayes = mgr.bayes *************** *** 59,63 **** if progress.stop_requested(): return - mgr.bayes_dirty = True progress.set_status("Completed training with %d spam and %d good messages" % (bayes.nspam, bayes.nham)) --- 84,87 ---- From tim.one@comcast.net Fri Oct 25 17:31:20 2002 From: tim.one@comcast.net (Tim Peters) Date: Fri, 25 Oct 2002 12:31:20 -0400 Subject: [Spambayes-checkins] spambayes/Outlook2000 manager.py,1.22,1.23 train.py,1.8,1.9 In-Reply-To: Message-ID: [Mark Hammond] > First step towards incremental training - keep a "message database" > (simply a pickled dictionary of [messageid]=is_spam). Train > dialog now has a "rebuild" checkbox, defaulting to off - if off, > only messages not already in the database are added (and correctly > untrained if previously in the incorrect category) > > This change will force you to do a full retrain - sorry. Don't apologize -- this is pre-alpha code, and it's a great idea! Besides, you sped this up so much yesterday that anyone whining will be shot . From tim_one@users.sourceforge.net Fri Oct 25 17:34:30 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Fri, 25 Oct 2002 09:34:30 -0700 Subject: [Spambayes-checkins] spambayes/Outlook2000 default_bayes_customize.ini,1.2,1.3 Message-ID: Update of /cvsroot/spambayes/spambayes/Outlook2000 In directory usw-pr-cvs1:/tmp/cvs-serv12343/Outlook2000 Modified Files: default_bayes_customize.ini Log Message: Added new tokenizer option replace_nonascii_chars, false by default in the core project, BUT TRUE BY DEFAULT IN THE OUTLOOK 2000 CLIENT! Yanks and Aussies who don't normally correspond in Korean should find this more effective with less training and less database burden at nailing Asian spam. 
Index: default_bayes_customize.ini =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/default_bayes_customize.ini,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** default_bayes_customize.ini 24 Oct 2002 04:58:52 -0000 1.2 --- default_bayes_customize.ini 25 Oct 2002 16:34:23 -0000 1.3 *************** *** 3,6 **** --- 3,17 ---- # As we decide default options, we can add them! + [Tokenizer] + # So long as Mark, Sean, and I are the primary users of this (i.e., + # Americans and Australians), this non-default option is very effective + # at nailing Asian spam with little training and small database burden. + # It should probably be exposed via the GUI, as it's not appropriate + # for people who get "high-bit ham". Asian spam is nailed with this + # False too, but it requires more training and a larger database, since + # a sufficient variety of "8bit%" and "skip" metatokens take longer to + # learn about than strings of question marks. + replace_nonascii_chars: True + [Classifier] #use_chi_squared_combining: True From tim_one@users.sourceforge.net Fri Oct 25 17:35:00 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Fri, 25 Oct 2002 09:35:00 -0700 Subject: [Spambayes-checkins] spambayes Options.py,1.53,1.54 tokenizer.py,1.47,1.48 Message-ID: Update of /cvsroot/spambayes/spambayes In directory usw-pr-cvs1:/tmp/cvs-serv12343 Modified Files: Options.py tokenizer.py Log Message: Added new tokenizer option replace_nonascii_chars, false by default in the core project, BUT TRUE BY DEFAULT IN THE OUTLOOK 2000 CLIENT! Yanks and Aussies who don't normally correspond in Korean should find this more effective with less training and less database burden at nailing Asian spam. 
Index: Options.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Options.py,v retrieving revision 1.53 retrieving revision 1.54 diff -C2 -d -r1.53 -r1.54 *** Options.py 18 Oct 2002 21:38:16 -0000 1.53 --- Options.py 25 Oct 2002 16:34:16 -0000 1.54 *************** *** 100,103 **** --- 100,111 ---- generate_long_skips: True + # If true, replace high-bit characters (ord(c) >= 128) and control characters + # with question marks. This allows non-ASCII character strings to be + # identified with little training and small database burden. It's appropriate + # only if your ham is plain 7-bit ASCII, or nearly so, so that the mere + # presence of non-ASCII character strings is known in advance to be a strong + # spam indicator. + replace_nonascii_chars: False + [TestDriver] # These control various displays in class TestDriver.Driver, and Tester.Test. *************** *** 279,282 **** --- 287,291 ---- 'basic_header_tokenize_only': boolean_cracker, 'basic_header_skip': ('get', lambda s: Set(s.split())), + 'replace_nonascii_chars': boolean_cracker, }, 'TestDriver': {'nbuckets': int_cracker, Index: tokenizer.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/tokenizer.py,v retrieving revision 1.47 retrieving revision 1.48 diff -C2 -d -r1.47 -r1.48 *** tokenizer.py 22 Oct 2002 01:37:53 -0000 1.47 --- tokenizer.py 25 Oct 2002 16:34:19 -0000 1.48 *************** *** 739,742 **** --- 739,757 ---- # total unique fn went from 168 to 169 + # For support of the replace_nonascii_chars option, build a string.translate + # table that maps all high-bit chars and control chars to a '?' character. 
+ + non_ascii_translate_tab = ['?'] * 256 + # leave blank up to (but not including) DEL alone + for i in range(32, 127): + non_ascii_translate_tab[i] = chr(i) + # leave "normal" whitespace alone + for ch in ' \t\r\n': + non_ascii_translate_tab[ord(ch)] = ch + del i, ch + + non_ascii_translate_tab = ''.join(non_ascii_translate_tab) + + def crack_content_xyz(msg): yield 'content-type:' + msg.get_content_type() *************** *** 1002,1006 **** yield 'received:' + tok ! # Message-Id: This seems to be a small win and should no # adversely affect a mixed source corpus so it's always enabled. msgid = msg.get("message-id", "") --- 1017,1021 ---- yield 'received:' + tok ! # Message-Id: This seems to be a small win and should not # adversely affect a mixed source corpus so it's always enabled. msgid = msg.get("message-id", "") *************** *** 1077,1080 **** --- 1092,1099 ---- for t in tokens: yield t + + if options.replace_nonascii_chars: + # Replace high-bit chars and control chars with '?'. + text = text.translate(non_ascii_translate_tab) # Special tagging of embedded URLs. From mhammond@users.sourceforge.net Fri Oct 25 14:12:22 2002 From: mhammond@users.sourceforge.net (Mark Hammond) Date: Fri, 25 Oct 2002 06:12:22 -0700 Subject: [Spambayes-checkins] spambayes/Outlook2000 manager.py,1.21,1.22 Message-ID: Update of /cvsroot/spambayes/spambayes/Outlook2000 In directory usw-pr-cvs1:/tmp/cvs-serv5055 Modified Files: manager.py Log Message: Remove CDO remnant. 
Index: manager.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/manager.py,v retrieving revision 1.21 retrieving revision 1.22 diff -C2 -d -r1.21 -r1.22 *** manager.py 25 Oct 2002 13:10:14 -0000 1.21 --- manager.py 25 Oct 2002 13:12:19 -0000 1.22 *************** *** 39,46 **** bayes_tokenize = tokenize - # Suck in CDO type lib - win32com.client.gencache.EnsureModule('{3FA7DEA7-6438-101B-ACC1-00AA00423326}', - 0, 1, 21, bForDemand=True) - class ManagerError(Exception): pass --- 39,42 ---- From mhammond@users.sourceforge.net Fri Oct 25 07:58:22 2002 From: mhammond@users.sourceforge.net (Mark Hammond) Date: Thu, 24 Oct 2002 23:58:22 -0700 Subject: [Spambayes-checkins] spambayes/Outlook2000 addin.py,1.16,1.17 manager.py,1.19,1.20 Message-ID: Update of /cvsroot/spambayes/spambayes/Outlook2000 In directory usw-pr-cvs1:/tmp/cvs-serv11097 Modified Files: addin.py manager.py Log Message: Remove the rest of CDO/MAPI - we no longer need this for *anything* Index: addin.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/addin.py,v retrieving revision 1.16 retrieving revision 1.17 diff -C2 -d -r1.16 -r1.17 *** addin.py 25 Oct 2002 03:35:09 -0000 1.16 --- addin.py 25 Oct 2002 06:58:20 -0000 1.17 *************** *** 34,38 **** gencache.EnsureModule('{00062FFF-0000-0000-C000-000000000046}', 0, 9, 0, bForDemand=True) # Outlook 9 gencache.EnsureModule('{2DF8D04C-5BFA-101B-BDE5-00AA0044DE52}', 0, 2, 1, bForDemand=True) # Office 9 - gencache.EnsureModule('{3FA7DEA7-6438-101B-ACC1-00AA00423326}', 0, 1, 21, bForDemand = True) # CDO # The TLB defiining the interfaces we implement --- 34,37 ---- *************** *** 42,65 **** # Error when not running as a script - eeek - just let it go. raise ! try: ! pythoncom.MakeIID("MAPI.Session") ! have_cdo = True ! except pythoncom.com_error: ! have_cdo = False ! 
print "This Addin requires that Outlook 2000 with CDO be installed on this machine." print ! if have_cdo: ! print "However, these appear to be installed. Error details:" ! print "COM Error 0x%x (%s)" % (hr, msg) ! if exc: ! print "Exception: %s" % (exc) ! print ! print "Sorry, I can't be more help, but I can't continue while I have this error." ! else: ! print "CDO is not currently installed. To install CDO, you must locate the" ! print "media from which you installed Outlook (such as Office 2000 CD or " ! print "sharepoint), re-run setup, select Outlook, enable CDO." ! print ! print "Please install CDO then attempt this registration again." sys.exit(1) --- 41,51 ---- # Error when not running as a script - eeek - just let it go. raise ! print "This Addin requires that Outlook 2000 be installed on this machine." print ! print "This appears to not be installed due to the following error:" ! print "COM Error 0x%x (%s)" % (hr, msg) ! if exc: ! print "Exception: %s" % (exc) ! print "Sorry, I can't be more help, but I can't continue while I have this error." 
sys.exit(1) Index: manager.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/manager.py,v retrieving revision 1.19 retrieving revision 1.20 diff -C2 -d -r1.19 -r1.20 *** manager.py 25 Oct 2002 01:31:42 -0000 1.19 --- manager.py 25 Oct 2002 06:58:20 -0000 1.20 *************** *** 62,70 **** self.config = self.LoadConfig() - cwd = os.getcwd() - self.mapi = win32com.client.Dispatch("MAPI.Session") - self.mapi.Logon(None, None, False, False) self.outlook = outlook - os.chdir(cwd) import_core_spambayes_stuff(self.ini_filename) --- 62,66 ---- *************** *** 165,171 **** def Close(self): - if self.mapi is not None: - self.mapi.Logoff() - self.mapi = None if self.bayes_dirty and self.bayes: print "Warning: BayesManager closed while Bayes database dirty" --- 161,164 ---- From mhammond@users.sourceforge.net Fri Oct 25 07:58:22 2002 From: mhammond@users.sourceforge.net (Mark Hammond) Date: Thu, 24 Oct 2002 23:58:22 -0700 Subject: [Spambayes-checkins] spambayes/Outlook2000/dialogs ClassifyDialog.py,1.4,1.5 FilterDialog.py,1.4,1.5 FolderSelector.py,1.2,1.3 ManagerDialog.py,1.2,1.3 RuleDialog.py,1.3,1.4TrainingDialog.py,1.3,1.4 Message-ID: Update of /cvsroot/spambayes/spambayes/Outlook2000/dialogs In directory usw-pr-cvs1:/tmp/cvs-serv11097/dialogs Modified Files: ClassifyDialog.py FilterDialog.py FolderSelector.py ManagerDialog.py RuleDialog.py TrainingDialog.py Log Message: Remove the rest of CDO/MAPI - we no longer need this for *anything* Index: ClassifyDialog.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/dialogs/ClassifyDialog.py,v retrieving revision 1.4 retrieving revision 1.5 diff -C2 -d -r1.4 -r1.5 *** ClassifyDialog.py 24 Oct 2002 13:06:39 -0000 1.4 --- ClassifyDialog.py 25 Oct 2002 06:58:20 -0000 1.5 *************** *** 54,58 **** self.classifier = classifier self.config = mgr.config.classify - self.mapi = mgr.mapi 
self.mgr = mgr AsyncDialogBase.__init__ (self, self.dt) --- 54,57 ---- *************** *** 66,78 **** def UpdateStatus(self): names = [] - cwd = os.getcwd() # mapi.GetFolder() switches to the system MAPI dir for eid in self.config.folder_ids: try: ! name = self.mapi.GetFolder(eid).Name.encode("ascii", "replace") except pythoncom.com_error: name = "" names.append(name) self.SetDlgItemText(IDC_STATIC_FOLDERS, "; ".join(names)) - os.chdir(cwd) def OnBrowse(self, id, code): --- 65,75 ---- def UpdateStatus(self): names = [] for eid in self.config.folder_ids: try: ! name = self.mgr.message_store.GetFolder(eid).name except pythoncom.com_error: name = "" names.append(name) self.SetDlgItemText(IDC_STATIC_FOLDERS, "; ".join(names)) def OnBrowse(self, id, code): *************** *** 80,84 **** import FolderSelector l = self.config.folder_ids ! d = FolderSelector.FolderSelector(self.mapi, l,checkbox_state=self.config.include_sub) if d.DoModal()==win32con.IDOK: l[:], self.config.include_sub = d.GetSelectedIDs()[:] --- 77,81 ---- import FolderSelector l = self.config.folder_ids ! d = FolderSelector.FolderSelector(self.mgr.message_store.session, l,checkbox_state=self.config.include_sub) if d.DoModal()==win32con.IDOK: l[:], self.config.include_sub = d.GetSelectedIDs()[:] Index: FilterDialog.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/dialogs/FilterDialog.py,v retrieving revision 1.4 retrieving revision 1.5 diff -C2 -d -r1.4 -r1.5 *** FilterDialog.py 20 Oct 2002 18:53:06 -0000 1.4 --- FilterDialog.py 25 Oct 2002 06:58:20 -0000 1.5 *************** *** 261,265 **** for eid in folder_ids: try: ! name = self.mgr.mapi.GetFolder(eid).Name.encode("ascii", "replace") except pythoncom.com_error: name = "" --- 261,265 ---- for eid in folder_ids: try: ! 
name = self.mgr.message_store.GetFolder(eid).name except pythoncom.com_error: name = "" *************** *** 271,275 **** import FolderSelector filter = self.mgr.config.filter ! d = FolderSelector.FolderSelector(self.mgr.mapi, filter.folder_ids,checkbox_state=filter.include_sub) if d.DoModal()==win32con.IDOK: filter.folder_ids, filter.include_sub = d.GetSelectedIDs() --- 271,275 ---- import FolderSelector filter = self.mgr.config.filter ! d = FolderSelector.FolderSelector(self.mgr.message_store.session, filter.folder_ids,checkbox_state=filter.include_sub) if d.DoModal()==win32con.IDOK: filter.folder_ids, filter.include_sub = d.GetSelectedIDs() *************** *** 334,338 **** for eid in self.mgr.config.filter_now.folder_ids: try: ! name = self.mgr.mapi.GetFolder(eid).Name.encode("ascii", "replace") except pythoncom.com_error: name = "" --- 334,338 ---- for eid in self.mgr.config.filter_now.folder_ids: try: ! name = self.mgr.message_store.GetFolder(eid).name except pythoncom.com_error: name = "" *************** *** 344,348 **** import FolderSelector filter = self.mgr.config.filter_now ! d = FolderSelector.FolderSelector(self.mgr.mapi, filter.folder_ids,checkbox_state=filter.include_sub) if d.DoModal()==win32con.IDOK: filter.folder_ids, filter.include_sub = d.GetSelectedIDs() --- 344,348 ---- import FolderSelector filter = self.mgr.config.filter_now ! d = FolderSelector.FolderSelector(self.mgr.message_store.session, filter.folder_ids,checkbox_state=filter.include_sub) if d.DoModal()==win32con.IDOK: filter.folder_ids, filter.include_sub = d.GetSelectedIDs() *************** *** 368,371 **** --- 368,372 ---- if __name__=='__main__': + # This doesnt work - still uses CDO. 
from win32com.client import Dispatch import pythoncom Index: FolderSelector.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/dialogs/FolderSelector.py,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** FolderSelector.py 19 Oct 2002 18:14:01 -0000 1.2 --- FolderSelector.py 25 Oct 2002 06:58:20 -0000 1.3 *************** *** 11,22 **** # Helpers for building the folder list class FolderSpec: ! def __init__(self, folder, name = None): ! if name is None: ! self.name = folder.Name ! else: ! self.name = name ! self.name = self.name.encode("ascii", "replace") self.children = [] - self.folder = folder def dump(self, level=0): --- 11,18 ---- # Helpers for building the folder list class FolderSpec: ! def __init__(self, folder_id, name): ! self.folder_id = folder_id ! self.name = name self.children = [] def dump(self, level=0): *************** *** 26,40 **** c.dump(level+1) ! def _BuildFolders(folders): children = [] folder = folders.GetFirst() while folder: ! spec = FolderSpec(folder) ! spec.children = _BuildFolders(folder.Folders) children.append(spec) folder = folders.GetNext() return children ! def BuildFolderTree(session): infostores = session.InfoStores root = FolderSpec(None, "root") --- 22,39 ---- c.dump(level+1) ! ######################################################################### ! ## CDO version of a folder walker. ! ######################################################################### ! def _BuildFoldersCDO(folders): children = [] folder = folders.GetFirst() while folder: ! spec = FolderSpec(folder.ID, folder.Name.encode("mbcs", "replace")) ! spec.children = _BuildFoldersCDO(folder.Folders) children.append(spec) folder = folders.GetNext() return children ! def BuildFolderTreeCDO(session): infostores = session.InfoStores root = FolderSpec(None, "root") *************** *** 43,53 **** rootFolder = infostore.RootFolder folders = rootFolder.Folders ! 
spec = FolderSpec(rootFolder, infostore.Name) ! spec.children = _BuildFolders(folders) root.children.append(spec) return root ! # ! # The dialog itself # IDs for controls we use. --- 42,87 ---- rootFolder = infostore.RootFolder folders = rootFolder.Folders ! spec = FolderSpec(rootFolder.ID, infostore.Name.encode("mbcs", "replace")) ! spec.children = _BuildFoldersCDO(folders) ! root.children.append(spec) ! return root ! ! ######################################################################### ! ## An extended MAPI version ! ######################################################################### ! from win32com.mapi import mapi ! from win32com.mapi.mapitags import * ! ! def _BuildFoldersMAPI(msgstore, folder): ! # Get the hierarchy table for it. ! table = folder.GetHierarchyTable(0) ! children = [] ! rows = mapi.HrQueryAllRows(table, (PR_ENTRYID,PR_DISPLAY_NAME_A), None, None, 0) ! for (eid_tag, eid),(name_tag, name) in rows: ! spec = FolderSpec(mapi.HexFromBin(eid), name) ! child_folder = msgstore.OpenEntry(eid, None, mapi.MAPI_DEFERRED_ERRORS) ! spec.children = _BuildFoldersMAPI(msgstore, child_folder) ! children.append(spec) ! return children ! ! def BuildFolderTreeMAPI(session): ! root = FolderSpec(None, "root") ! tab = session.GetMsgStoresTable(0) ! rows = mapi.HrQueryAllRows(tab, (PR_ENTRYID, PR_DISPLAY_NAME_A), None, None, 0) ! for row in rows: ! (eid_tag, eid), (name_tag, name) = row ! msgstore = session.OpenMsgStore(0, eid, None, mapi.MDB_NO_MAIL | mapi.MAPI_DEFERRED_ERRORS) ! hr, data = msgstore.GetProps( ( PR_IPM_SUBTREE_ENTRYID,), 0) ! subtree_eid = data[0][1] ! folder = msgstore.OpenEntry(subtree_eid, None, mapi.MAPI_DEFERRED_ERRORS) ! spec = FolderSpec(mapi.HexFromBin(subtree_eid), name) ! spec.children = _BuildFoldersMAPI(msgstore, folder) root.children.append(spec) return root ! ! ######################################################################### ! ## The dialog itself ! 
######################################################################### # IDs for controls we use. *************** *** 110,114 **** mask = state = 0 else: ! if self.selected_ids and child.folder.ID in self.selected_ids: state = INDEXTOSTATEIMAGEMASK(IIL_CHECKED) num_children_selected += 1 --- 144,148 ---- mask = state = 0 else: ! if self.selected_ids and child.folder_id in self.selected_ids: state = INDEXTOSTATEIMAGEMASK(IIL_CHECKED) num_children_selected += 1 *************** *** 118,122 **** item_id = self._MakeItemParam(child) hitem = self.list.InsertItem(hParent, 0, (None, state, mask, text, bitmapCol, bitmapSel, cItems, item_id)) ! if self.single_select and self.selected_ids and child.folder.ID in self.selected_ids: self.list.SelectItem(hitem) --- 152,156 ---- item_id = self._MakeItemParam(child) hitem = self.list.InsertItem(hParent, 0, (None, state, mask, text, bitmapCol, bitmapSel, cItems, item_id)) ! if self.single_select and self.selected_ids and child.folder_id in self.selected_ids: self.list.SelectItem(hitem) *************** *** 193,197 **** self.GetDlgItem(IDC_BUTTON_CLEARALL).ShowWindow(win32con.SW_HIDE) ! tree = BuildFolderTree(self.mapi) self._InsertSubFolders(0, tree) self.selected_ids = [] # wipe this out while we are alive. --- 227,236 ---- self.GetDlgItem(IDC_BUTTON_CLEARALL).ShowWindow(win32con.SW_HIDE) ! if hasattr(self.mapi, "_oleobj_"): # Dispatch COM object ! # CDO ! tree = BuildFolderTreeCDO(self.mapi) ! else: ! # Extended MAPI. ! tree = BuildFolderTreeMAPI(self.mapi) self._InsertSubFolders(0, tree) self.selected_ids = [] # wipe this out while we are alive. *************** *** 269,281 **** ret = [] for info, spec in self._YieldCheckedChildren(): ! ret.append(spec.folder.ID) return ret, self.GetDlgItem(IDC_BUTTON_SEARCHSUB).GetCheck() != 0 ! def TestWithMAPI(): from win32com.client import Dispatch mapi = Dispatch("MAPI.Session") mapi.Logon("", "", False, False) ids = [u'0000000071C4408983B0B24F8863EE66A8F79AFF82800000'] ! 
d=FolderSelector(mapi, ids, single_select = True) d.DoModal() print d.GetSelectedIDs() --- 308,329 ---- ret = [] for info, spec in self._YieldCheckedChildren(): ! ret.append(spec.folder_id) return ret, self.GetDlgItem(IDC_BUTTON_SEARCHSUB).GetCheck() != 0 ! def TestWithCDO(): from win32com.client import Dispatch mapi = Dispatch("MAPI.Session") mapi.Logon("", "", False, False) ids = [u'0000000071C4408983B0B24F8863EE66A8F79AFF82800000'] ! d=FolderSelector(mapi, ids, single_select = False) ! d.DoModal() ! print d.GetSelectedIDs() ! ! def TestWithMAPI(): ! mapi.MAPIInitialize(None) ! logonFlags = mapi.MAPI_NO_MAIL | mapi.MAPI_EXTENDED | mapi.MAPI_USE_DEFAULT ! session = mapi.MAPILogonEx(0, None, None, logonFlags) ! ids = [u'0000000071C4408983B0B24F8863EE66A8F79AFF82800000'] ! d=FolderSelector(session, ids, single_select = False) d.DoModal() print d.GetSelectedIDs() Index: ManagerDialog.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/dialogs/ManagerDialog.py,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** ManagerDialog.py 20 Oct 2002 18:53:06 -0000 1.2 --- ManagerDialog.py 25 Oct 2002 06:58:20 -0000 1.3 *************** *** 89,93 **** names = [] for eid in self.mgr.config.filter.folder_ids: ! names.append(self.mgr.mapi.GetFolder(eid).Name.encode("ascii", "replace")) # count enabled rules num = len([r for r in self.mgr.config.rules if r.enabled ]) --- 89,93 ---- names = [] for eid in self.mgr.config.filter.folder_ids: ! 
names.append(self.mgr.message_store.GetFolder(eid).name) # count enabled rules num = len([r for r in self.mgr.config.rules if r.enabled ]) Index: RuleDialog.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/dialogs/RuleDialog.py,v retrieving revision 1.3 retrieving revision 1.4 diff -C2 -d -r1.3 -r1.4 *** RuleDialog.py 19 Oct 2002 22:30:03 -0000 1.3 --- RuleDialog.py 25 Oct 2002 06:58:20 -0000 1.4 *************** *** 95,102 **** if not self.folder_id: name = "" - elif self.mgr.mapi is None: - name = "" else: ! name = self.mgr.mapi.GetFolder(self.folder_id).Name.encode("ascii", "replace") except pythoncom.com_error: name = "" --- 95,100 ---- if not self.folder_id: name = "" else: ! name = self.mgr.message_store.GetFolder(self.folder_id).name except pythoncom.com_error: name = "" *************** *** 122,126 **** import FolderSelector ids = [self.folder_id] ! d = FolderSelector.FolderSelector(self.mgr.mapi, ids,single_select=True,checkbox_state=None)#, allow_multi=False) if d.DoModal()==win32con.IDOK: new_ids, cb_state = d.GetSelectedIDs() --- 120,124 ---- import FolderSelector ids = [self.folder_id] ! d = FolderSelector.FolderSelector(self.mgr.message_store.session, ids,single_select=True,checkbox_state=None)#, allow_multi=False) if d.DoModal()==win32con.IDOK: new_ids, cb_state = d.GetSelectedIDs() *************** *** 190,199 **** if __name__=='__main__': ! from win32com.client import Dispatch ! try: ! mapi = Dispatch("MAPI.Session") ! mapi.Logon() ! except pythoncom.com_error: ! mapi = None class Rule: def __init__(self): --- 188,193 ---- if __name__=='__main__': ! # This doesn't work ! 
class Rule: def __init__(self): Index: TrainingDialog.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/dialogs/TrainingDialog.py,v retrieving revision 1.3 retrieving revision 1.4 diff -C2 -d -r1.3 -r1.4 *** TrainingDialog.py 20 Oct 2002 05:52:50 -0000 1.3 --- TrainingDialog.py 25 Oct 2002 06:58:20 -0000 1.4 *************** *** 6,9 **** --- 6,10 ---- import win32ui import win32api + import pythoncom #these are the atom numbers defined by Windows for basic dialog controls *************** *** 54,58 **** self.trainer = trainer self.config = mgr.config.training - self.mapi = mgr.mapi AsyncDialogBase.__init__ (self, self.dt) --- 55,58 ---- *************** *** 65,72 **** def UpdateStatus(self): names = [] - cwd = os.getcwd() # mapi.GetFolder() switches to the system MAPI dir for eid in self.config.ham_folder_ids: try: ! name = self.mapi.GetFolder(eid).Name.encode("ascii", "replace") except pythoncom.com_error: name = "" --- 65,71 ---- def UpdateStatus(self): names = [] for eid in self.config.ham_folder_ids: try: ! name = self.mgr.message_store.GetFolder(eid).name except pythoncom.com_error: name = "" *************** *** 77,86 **** for eid in self.config.spam_folder_ids: try: ! name = self.mapi.GetFolder(eid).Name.encode("ascii", "replace") except pythoncom.com_error: name = "" names.append(name) self.SetDlgItemText(IDC_STATIC_SPAM, "; ".join(names)) - os.chdir(cwd) def OnBrowse(self, id, code): --- 76,84 ---- for eid in self.config.spam_folder_ids: try: ! name = self.mgr.message_store.GetFolder(eid).name except pythoncom.com_error: name = "" names.append(name) self.SetDlgItemText(IDC_STATIC_SPAM, "; ".join(names)) def OnBrowse(self, id, code): *************** *** 94,98 **** sub_attr = "ham_include_sub" include_sub = getattr(self.config, sub_attr) ! 
d = FolderSelector.FolderSelector(self.mapi, l, checkbox_state=include_sub) if d.DoModal()==win32con.IDOK: l[:], include_sub = d.GetSelectedIDs()[:] --- 92,96 ---- sub_attr = "ham_include_sub" include_sub = getattr(self.config, sub_attr) ! d = FolderSelector.FolderSelector(self.mgr.message_store.session, l, checkbox_state=include_sub) if d.DoModal()==win32con.IDOK: l[:], include_sub = d.GetSelectedIDs()[:] From tim_one@users.sourceforge.net Fri Oct 25 05:40:44 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Thu, 24 Oct 2002 21:40:44 -0700 Subject: [Spambayes-checkins] spambayes/Outlook2000 msgstore.py,1.5,1.6 Message-ID: Update of /cvsroot/spambayes/spambayes/Outlook2000 In directory usw-pr-cvs1:/tmp/cvs-serv20866/Outlook2000 Modified Files: msgstore.py Log Message: _GetMessageText(): + Removed dead assignment to headers. + Added note about distinguishing between received items and sent items; we should probably "do something" to weed out the latter. + Sped final catenation of headers + body + html. Index: msgstore.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/msgstore.py,v retrieving revision 1.5 retrieving revision 1.6 diff -C2 -d -r1.5 -r1.6 *** msgstore.py 25 Oct 2002 01:25:56 -0000 1.5 --- msgstore.py 25 Oct 2002 04:40:42 -0000 1.6 *************** *** 251,262 **** # are for "forwarded" messages, where the forwards are actually # in an attachment. Later. self._EnsureObject() prop_ids = PR_TRANSPORT_MESSAGE_HEADERS_A, PR_BODY_A, MYPR_BODY_HTML_A hr, data = self.mapi_object.GetProps(prop_ids,0) - headers = data[0][1] headers = self._GetPotentiallyLargeStringProp(prop_ids[0], data[0]) body = self._GetPotentiallyLargeStringProp(prop_ids[1], data[1]) html = self._GetPotentiallyLargeStringProp(prop_ids[2], data[2]) ! 
return headers + "\n" + html + "\n" + body def _EnsureObject(self): --- 251,273 ---- # are for "forwarded" messages, where the forwards are actually # in an attachment. Later. + + # Note: There's no distinction made here between msgs that have + # been received, and, e.g., msgs that were sent and moved from the + # Sent Items folder. It would be good not to train on the latter, + # since it's simply not received email. An article on the web said + # the distinction can't be made with 100% certainty, but that a good + # heuristic is to believe that a msg has been received iff at least + # one of these properties has a sensible value: + # PR_RECEIVED_BY_EMAIL_ADDRESS + # PR_RECEIVED_BY_NAME + # PR_RECEIVED_BY_ENTRYID + # PR_TRANSPORT_MESSAGE_HEADERS self._EnsureObject() prop_ids = PR_TRANSPORT_MESSAGE_HEADERS_A, PR_BODY_A, MYPR_BODY_HTML_A hr, data = self.mapi_object.GetProps(prop_ids,0) headers = self._GetPotentiallyLargeStringProp(prop_ids[0], data[0]) body = self._GetPotentiallyLargeStringProp(prop_ids[1], data[1]) html = self._GetPotentiallyLargeStringProp(prop_ids[2], data[2]) ! return "%s\n%s\n%s" % (headers, html, body) def _EnsureObject(self): From mhammond@users.sourceforge.net Fri Oct 25 14:10:17 2002 From: mhammond@users.sourceforge.net (Mark Hammond) Date: Fri, 25 Oct 2002 06:10:17 -0700 Subject: [Spambayes-checkins] spambayes/Outlook2000 classify.py,1.9,1.10 manager.py,1.20,1.21 train.py,1.7,1.8 Message-ID: Update of /cvsroot/spambayes/spambayes/Outlook2000 In directory usw-pr-cvs1:/tmp/cvs-serv1158 Modified Files: classify.py manager.py train.py Log Message: Catch all errors in the classify and training loops, and remove dead code from the manager. 
Index: classify.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/classify.py,v retrieving revision 1.9 retrieving revision 1.10 diff -C2 -d -r1.9 -r1.10 *** classify.py 24 Oct 2002 13:06:39 -0000 1.9 --- classify.py 25 Oct 2002 13:10:14 -0000 1.10 *************** *** 4,8 **** # Copyright PSF, license under the PSF license ! import sys, os, os.path, getopt, cPickle, string from win32com.client import Dispatch, constants import pythoncom --- 4,8 ---- # Copyright PSF, license under the PSF license ! import sys, os, traceback from win32com.client import Dispatch, constants import pythoncom *************** *** 14,20 **** break progress.tick() ! prob = mgr.score(message) ! message.SetField(config.field_name, prob) ! message.Save() # Called back from the dialog to do the actual training. --- 14,24 ---- break progress.tick() ! try: ! prob = mgr.score(message) ! message.SetField(config.field_name, prob) ! message.Save() ! except: ! print "Error classifying message '%s'" % (message,) ! traceback.print_exc() # Called back from the dialog to do the actual training. Index: manager.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/manager.py,v retrieving revision 1.20 retrieving revision 1.21 diff -C2 -d -r1.20 -r1.21 *** manager.py 25 Oct 2002 06:58:20 -0000 1.20 --- manager.py 25 Oct 2002 13:10:14 -0000 1.21 *************** *** 82,88 **** bayes = None try: - # Ooops - Tim did it another way - checking this in before I get more conficts! - ## from Options import options - ## options.mergefiles([self.ini_filename]) bayes = cPickle.load(open(self.bayes_filename, 'rb')) print "Loaded bayes database from '%s'" % (self.bayes_filename,) --- 82,85 ---- *************** *** 131,137 **** def InitNewBayes(self): - # Ooops - Tim did it another way - checking this in before I get more conficts! 
- ## from Options import options - ## options.mergefiles([self.ini_filename]) self.bayes = bayes_classifier.Bayes() self.bayes_dirty = True --- 128,131 ---- Index: train.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/train.py,v retrieving revision 1.7 retrieving revision 1.8 diff -C2 -d -r1.7 -r1.8 *** train.py 25 Oct 2002 01:25:56 -0000 1.7 --- train.py 25 Oct 2002 13:10:15 -0000 1.8 *************** *** 4,8 **** # Copyright PSF, license under the PSF license ! import sys, os def train_folder( f, isspam, mgr, progress): --- 4,8 ---- # Copyright PSF, license under the PSF license ! import sys, os, traceback def train_folder( f, isspam, mgr, progress): *************** *** 13,18 **** break progress.tick() ! stream = message.GetEmailPackageObject() ! mgr.bayes.learn(tokenize(stream), isspam, False) num += 1 print "Trained over", num, "in folder", f.name --- 13,22 ---- break progress.tick() ! try: ! stream = message.GetEmailPackageObject() ! mgr.bayes.learn(tokenize(stream), isspam, False) ! except: ! print "Error training message '%s'" % (message,) ! traceback.print_exc() num += 1 print "Trained over", num, "in folder", f.name From tim_one@users.sourceforge.net Fri Oct 25 04:35:11 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Thu, 24 Oct 2002 20:35:11 -0700 Subject: [Spambayes-checkins] spambayes/Outlook2000 addin.py,1.15,1.16 Message-ID: Update of /cvsroot/spambayes/spambayes/Outlook2000 In directory usw-pr-cvs1:/tmp/cvs-serv25988/Outlook2000 Modified Files: addin.py Log Message: ShowClues(): Unix "From " lines aren't part of Outlook headers, so don't ask the email pkg to invent one. It seems always to come out as "From nobody " and there's no clue there worth getting. 
Index: addin.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/addin.py,v retrieving revision 1.15 retrieving revision 1.16 diff -C2 -d -r1.15 -r1.16 *** addin.py 25 Oct 2002 01:23:17 -0000 1.15 --- addin.py 25 Oct 2002 03:35:09 -0000 1.16 *************** *** 157,161 **** push("<h2>Message Stream:</h2><br>") push("<PRE>\n")
!     txt = msgstore_message.GetEmailPackageObject().as_string(unixfrom=1)
      push(escape(txt, True))
      push("</PRE>\n") --- 157,161 ---- push("<h2>Message Stream:</h2><br>") push("<PRE>\n")
!     txt = msgstore_message.GetEmailPackageObject().as_string()
      push(escape(txt, True))
      push("</PRE>
\n") From tim_one@users.sourceforge.net Fri Oct 25 18:47:39 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Fri, 25 Oct 2002 10:47:39 -0700 Subject: [Spambayes-checkins] spambayes/Outlook2000/dialogs RuleDialog.py,1.4,1.5 Message-ID: Update of /cvsroot/spambayes/spambayes/Outlook2000/dialogs In directory usw-pr-cvs1:/tmp/cvs-serv21808/Outlook2000/dialogs Modified Files: RuleDialog.py Log Message: Attempting to switch the scores from floats in 0.-1. to ints in 0-100. YOU MAY NEED TO CHANGE YOUR FILTER RULES ACCORDINGLY. The Rule dialog appears to have gotten partly broken before this -- referenced a _GetFolder method that doesn't exist. Repaired that. PROBLEM: While the Rule dialog seems to work fine now, trying to write a score to the SpamProb (Hammie, whatever) custom field doesn't appear to do anything anymore. But I didn't touch that part of the code, so I'm baffled. Perhaps it has to do with that the *type* changed from double to int, and that _MapiTypeMap can't tell the difference between an int and a bool before Python 2.3 (I'm using 2.2)? But commenting out the PT_BOOLEAN entry there didn't make any difference for me, so I doubt that's it. Index: RuleDialog.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/dialogs/RuleDialog.py,v retrieving revision 1.4 retrieving revision 1.5 diff -C2 -d -r1.4 -r1.5 *** RuleDialog.py 25 Oct 2002 06:58:20 -0000 1.4 --- RuleDialog.py 25 Oct 2002 17:47:37 -0000 1.5 *************** *** 65,70 **** rule = self.rule self.SetDlgItemText(IDC_RULE_NAME, rule.name) ! self.SetDlgItemText(IDC_EDIT_LOW, "%.2f" % rule.min) ! self.SetDlgItemText(IDC_EDIT_HIGH, "%.2f" % rule.max) self.GetDlgItem(IDC_FLAG).SetCheck(rule.flag_message) self.GetDlgItem(IDC_WRITE_FIELD).SetCheck(rule.write_field) --- 65,70 ---- rule = self.rule self.SetDlgItemText(IDC_RULE_NAME, rule.name) ! self.SetDlgItemText(IDC_EDIT_LOW, "%d" % rule.min) ! 
self.SetDlgItemText(IDC_EDIT_HIGH, "%d" % rule.max) self.GetDlgItem(IDC_FLAG).SetCheck(rule.flag_message) self.GetDlgItem(IDC_WRITE_FIELD).SetCheck(rule.write_field) *************** *** 137,141 **** assert slider.GetSafeHwnd() == lParam idc_edit = IDC_EDIT_LOW ! self.SetDlgItemText(idc_edit, "%.2f" % (slider.GetPos() / 100.0)) def _InitSlider(self, idc_slider, idc_edit): --- 137,141 ---- assert slider.GetSafeHwnd() == lParam idc_edit = IDC_EDIT_LOW ! self.SetDlgItemText(idc_edit, "%d" % slider.GetPos()) def _InitSlider(self, idc_slider, idc_edit): *************** *** 153,165 **** except ValueError: return ! slider.SetPos(int(fval*100)) def _CheckEdit(self, idc, rule, attr): try: val = float(self.GetDlgItemText(idc)) ! if val < 0 or val > 1.0: raise ValueError except ValueError: ! self.MessageBox("Please enter a number between 0 and 1") self.GetDlgItem(idc).SetFocus() return False --- 153,165 ---- except ValueError: return ! slider.SetPos(int(fval)) def _CheckEdit(self, idc, rule, attr): try: val = float(self.GetDlgItemText(idc)) ! if val < 0 or val > 100: raise ValueError except ValueError: ! self.MessageBox("Please enter a number between 0 and 100") self.GetDlgItem(idc).SetFocus() return False *************** *** 193,198 **** def __init__(self): self.name = "My Rule" ! self.min = 0.1 ! self.max = 0.9 self.action = "Move" self.flag_message = True --- 193,198 ---- def __init__(self): self.name = "My Rule" ! self.min = 10 ! 
self.max = 90 self.action = "Move" self.flag_message = True From tim_one@users.sourceforge.net Fri Oct 25 18:47:39 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Fri, 25 Oct 2002 10:47:39 -0700 Subject: [Spambayes-checkins] spambayes/Outlook2000 addin.py,1.17,1.18 manager.py,1.23,1.24 msgstore.py,1.6,1.7 rule.py,1.5,1.6 Message-ID: Update of /cvsroot/spambayes/spambayes/Outlook2000 In directory usw-pr-cvs1:/tmp/cvs-serv21808/Outlook2000 Modified Files: addin.py manager.py msgstore.py rule.py Log Message: Attempting to switch the scores from floats in 0.-1. to ints in 0-100. YOU MAY NEED TO CHANGE YOUR FILTER RULES ACCORDINGLY. The Rule dialog appears to have gotten partly broken before this -- referenced a _GetFolder method that doesn't exist. Repaired that. PROBLEM: While the Rule dialog seems to work fine now, trying to write a score to the SpamProb (Hammie, whatever) custom field doesn't appear to do anything anymore. But I didn't touch that part of the code, so I'm baffled. Perhaps it has to do with that the *type* changed from double to int, and that _MapiTypeMap can't tell the difference between an int and a bool before Python 2.3 (I'm using 2.2)? But commenting out the PT_BOOLEAN entry there didn't make any difference for me, so I doubt that's it. Index: addin.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/addin.py,v retrieving revision 1.17 retrieving revision 1.18 diff -C2 -d -r1.17 -r1.18 *** addin.py 25 Oct 2002 06:58:20 -0000 1.17 --- addin.py 25 Oct 2002 17:47:36 -0000 1.18 *************** *** 129,135 **** msgstore_message = mgr.message_store.GetMessage(item.EntryID) ! prob, clues = mgr.score(msgstore_message, evidence=True) new_msg = app.CreateItem(0) ! body = ["

<h2>Spam Score: %g</h2><br>" % prob] push = body.append # Format the clues. --- 129,135 ---- msgstore_message = mgr.message_store.GetMessage(item.EntryID) ! score, clues = mgr.score(msgstore_message, evidence=True, scale=False) new_msg = app.CreateItem(0) ! body = ["<h2>Spam Score: %g</h2><br>


" % score] push = body.append # Format the clues. Index: manager.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/manager.py,v retrieving revision 1.23 retrieving revision 1.24 diff -C2 -d -r1.23 -r1.24 *** manager.py 25 Oct 2002 15:20:25 -0000 1.23 --- manager.py 25 Oct 2002 17:47:36 -0000 1.24 *************** *** 179,188 **** self.message_store = None ! def score(self, msg, evidence=False): email = msg.GetEmailPackageObject() ! # As Tim suggested in email, score should move to range(100) ! # This is probably a good place to do it - anyone who wants the real ! # float value can look at the "clues" ! return self.bayes.spamprob(bayes_tokenize(email), evidence) _mgr = None --- 179,212 ---- self.message_store = None ! def score(self, msg, evidence=False, scale=True): ! """Score a msg. ! ! If optional arg evidence is specified and true, the result is a ! two-tuple ! ! score, clues ! ! where clues is a list of the (word, spamprob(word)) pairs that ! went into determining the score. Else just the score is returned. ! ! If optional arg scale is specified and false, the score is a float ! in 0.0 (ham) thru 1.0 (spam). Else (the default), the score is ! scaled into an integer from 0 (ham) thru 100 (spam). ! """ ! email = msg.GetEmailPackageObject() ! result = self.bayes.spamprob(bayes_tokenize(email), evidence) ! if not scale: ! return result ! # For sister-friendliness, multiply score by 100 and round to an int. ! if evidence: ! score, the_evidence = result ! else: ! score = result ! score = int(round(score * 100.0)) ! if evidence: ! return score, the_evidence ! else: ! 
return score _mgr = None Index: msgstore.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/msgstore.py,v retrieving revision 1.6 retrieving revision 1.7 diff -C2 -d -r1.6 -r1.7 *** msgstore.py 25 Oct 2002 04:40:42 -0000 1.6 --- msgstore.py 25 Oct 2002 17:47:36 -0000 1.7 *************** *** 296,299 **** --- 296,301 ---- def SetField(self, prop, val): self._EnsureObject() + print "after ensure object" + print type(prop), prop, type(0) if type(prop)!=type(0): props = ( (mapi.PS_PUBLIC_STRINGS, prop), ) Index: rule.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/rule.py,v retrieving revision 1.5 retrieving revision 1.6 diff -C2 -d -r1.5 -r1.6 *** rule.py 24 Oct 2002 13:06:39 -0000 1.5 --- rule.py 25 Oct 2002 17:47:36 -0000 1.6 *************** *** 7,12 **** self.name = "New Rule" self.enabled = True ! self.min = 0.0 ! self.max = 0.9 self.action = "None" self.flag_message = True --- 7,12 ---- self.name = "New Rule" self.enabled = True ! self.min = 30 ! self.max = 80 self.action = "None" self.flag_message = True *************** *** 27,39 **** if not self.folder_id: return "You must specify a folder for 'Move' or 'Copy'" ! if self._GetFolder(mgr) is None: return "Can not locate the destination folder" if self.write_field and not self.write_field_name: return "You must specify the field name to create" ! def Act(self, mgr, msg, prob): if mgr.verbose > 1: ! print "Rule '%s': %.2f->%.2f (%.2f) (%s)" % (self.name, self.min, self.max, prob, repr(msg)) ! if prob < self.min or prob > self.max: return False --- 27,40 ---- if not self.folder_id: return "You must specify a folder for 'Move' or 'Copy'" ! if mgr.message_store.GetFolder(self.folder_id) is None: return "Can not locate the destination folder" if self.write_field and not self.write_field_name: return "You must specify the field name to create" ! 
def Act(self, mgr, msg, score): if mgr.verbose > 1: ! print "Rule '%s': %d->%d (%d) (%s)" % ( ! self.name, self.min, self.max, score, repr(msg)) ! if score < self.min or score > self.max: return False *************** *** 44,48 **** if self.write_field: ! msg.SetField(self.write_field_name, prob) msg.Save() --- 45,49 ---- if self.write_field: ! msg.SetField(self.write_field_name, score) msg.Save() From tim_one@users.sourceforge.net Fri Oct 25 18:57:17 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Fri, 25 Oct 2002 10:57:17 -0700 Subject: [Spambayes-checkins] spambayes/Outlook2000 msgstore.py,1.7,1.8 Message-ID: Update of /cvsroot/spambayes/spambayes/Outlook2000 In directory usw-pr-cvs1:/tmp/cvs-serv27837/Outlook2000 Modified Files: msgstore.py Log Message: Removed debugging prints checked in by mistake. Index: msgstore.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/msgstore.py,v retrieving revision 1.7 retrieving revision 1.8 diff -C2 -d -r1.7 -r1.8 *** msgstore.py 25 Oct 2002 17:47:36 -0000 1.7 --- msgstore.py 25 Oct 2002 17:57:15 -0000 1.8 *************** *** 296,301 **** def SetField(self, prop, val): self._EnsureObject() - print "after ensure object" - print type(prop), prop, type(0) if type(prop)!=type(0): props = ( (mapi.PS_PUBLIC_STRINGS, prop), ) --- 296,299 ---- From tim_one@users.sourceforge.net Fri Oct 25 19:47:10 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Fri, 25 Oct 2002 11:47:10 -0700 Subject: [Spambayes-checkins] spambayes/Outlook2000 manager.py,1.24,1.25 msgstore.py,1.8,1.9 Message-ID: Update of /cvsroot/spambayes/spambayes/Outlook2000 In directory usw-pr-cvs1:/tmp/cvs-serv14813 Modified Files: manager.py msgstore.py Log Message: BayesManager.score(): temporarily (I hope) changed this to return a double score, as I've been unable to get an integer score to "show up" in the SpamProb (Hammie, whatever) field. 
MAPIMsgStoreMsg.SetField(): XXX comments detailing my woes in getting a PT_I4 property to work. Index: manager.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/manager.py,v retrieving revision 1.24 retrieving revision 1.25 diff -C2 -d -r1.24 -r1.25 *** manager.py 25 Oct 2002 17:47:36 -0000 1.24 --- manager.py 25 Oct 2002 18:47:07 -0000 1.25 *************** *** 205,208 **** --- 205,216 ---- score = result score = int(round(score * 100.0)) + + # XXX If I actually return an int, the SpamProb (Hammie, whatever) + # XXX custom field shows up as blank. I haven't been able to figure + # XXX out why. Returning a float here sucks, as it gets displayed + # XXX with a useless ".00" tacked on to the end -- but at least it + # XXX isn't blank! + score = float(score) + if evidence: return score, the_evidence Index: msgstore.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/msgstore.py,v retrieving revision 1.8 retrieving revision 1.9 diff -C2 -d -r1.8 -r1.9 *** msgstore.py 25 Oct 2002 17:57:15 -0000 1.8 --- msgstore.py 25 Oct 2002 18:47:08 -0000 1.9 *************** *** 295,298 **** --- 295,306 ---- def SetField(self, prop, val): + # XXX If the SpamProb (Hammie, whatever) property is passed in as an + # XXX int, Outlook displays the field as all blanks, and sorting on + # XXX it doesn't do anything, etc. I don't know why. Since I'm + # XXX running Python 2.2.2, the _MapiTypeMap above confuses ints + # XXX with bools, but the problem persists even if I comment out the + # XXX PT_BOOLEAN entry from that dict. Dumping in prints below show + # XXX that type_tag is 3 then, and that matches the defn of PT_I4 in + # XXX my system header files. 
self._EnsureObject() if type(prop)!=type(0): From tim_one@users.sourceforge.net Fri Oct 25 21:06:19 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Fri, 25 Oct 2002 13:06:19 -0700 Subject: [Spambayes-checkins] spambayes/Outlook2000 addin.py,1.18,1.19 msgstore.py,1.9,1.10 Message-ID: Update of /cvsroot/spambayes/spambayes/Outlook2000 In directory usw-pr-cvs1:/tmp/cvs-serv11173 Modified Files: addin.py msgstore.py Log Message: Gave GetEmailPackageObject() an optional strip_mime_headers arg, defaulting to True. This allows ShowClues() to display *all* the original headers. I don't know that this is going to be useful over time, but for right now I really want to see the info we're throwing away. Index: addin.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/addin.py,v retrieving revision 1.18 retrieving revision 1.19 diff -C2 -d -r1.18 -r1.19 *** addin.py 25 Oct 2002 17:47:36 -0000 1.18 --- addin.py 25 Oct 2002 20:06:06 -0000 1.19 *************** *** 143,148 **** push("

Message Stream:


") push("
\n")
!     txt = msgstore_message.GetEmailPackageObject().as_string()
!     push(escape(txt, True))
      push("
\n") body = ''.join(body) --- 143,148 ---- push("

Message Stream:


") push("
\n")
!     msg = msgstore_message.GetEmailPackageObject(strip_mime_headers=False)
!     push(escape(msg.as_string(), True))
      push("
\n") body = ''.join(body) *************** *** 153,157 **** new_msg.HTMLBody = "" + body + "" # Attach the source message to it ! new_msg.Attachments.Add(item, constants.olByValue, DisplayName="Original Message") new_msg.Display() --- 153,158 ---- new_msg.HTMLBody = "" + body + "" # Attach the source message to it ! new_msg.Attachments.Add(item, constants.olByValue, ! DisplayName="Original Message") new_msg.Display() Index: msgstore.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/msgstore.py,v retrieving revision 1.9 retrieving revision 1.10 diff -C2 -d -r1.9 -r1.10 *** msgstore.py 25 Oct 2002 18:47:08 -0000 1.9 --- msgstore.py 25 Oct 2002 20:06:10 -0000 1.10 *************** *** 43,47 **** def __init__(self): self.unread = False ! def GetEmailPackageObject(self): # Return a "read-only" Python email package object # "read-only" in that changes will never be reflected to the real store. --- 43,47 ---- def __init__(self): self.unread = False ! def GetEmailPackageObject(self, strip_mime_headers=True): # Return a "read-only" Python email package object # "read-only" in that changes will never be reflected to the real store. *************** *** 275,279 **** self.mapi_object = self.msgstore.mapi_msgstore.OpenEntry(self.id, None, mapi.MAPI_MODIFY | USE_DEFERRED_ERRORS) ! def GetEmailPackageObject(self): import email # XXX If this was originally a MIME msg, we're hosed at this point -- --- 275,279 ---- self.mapi_object = self.msgstore.mapi_msgstore.OpenEntry(self.id, None, mapi.MAPI_MODIFY | USE_DEFERRED_ERRORS) ! def GetEmailPackageObject(self, strip_mime_headers=True): import email # XXX If this was originally a MIME msg, we're hosed at this point -- *************** *** 288,295 **** print "FAILED to create email.message from: ", `text` raise ! if msg.has_key('content-type'): ! del msg['content-type'] ! if msg.has_key('content-transfer-encoding'): ! 
del msg['content-transfer-encoding'] return msg --- 288,306 ---- print "FAILED to create email.message from: ", `text` raise ! ! if strip_mime_headers: ! # If we're going to pass this to a scoring function, the MIME ! # headers must be stripped, else the email pkg will run off ! # looking for MIME boundaries that don't exist. The charset ! # info from the original MIME armor is also lost, and we don't ! # want the email pkg to try decoding the msg a second time ! # (assuming Outlook is in fact already decoding text originally ! # in base64 and quoted-printable). ! # We want to retain the MIME headers if we're just displaying ! # the msg stream. ! if msg.has_key('content-type'): ! del msg['content-type'] ! if msg.has_key('content-transfer-encoding'): ! del msg['content-transfer-encoding'] return msg From popiel@users.sourceforge.net Sat Oct 26 03:07:09 2002 From: popiel@users.sourceforge.net (T. Alexander Popiel) Date: Fri, 25 Oct 2002 19:07:09 -0700 Subject: [Spambayes-checkins] spambayes table.py,1.2,1.3 Message-ID: Update of /cvsroot/spambayes/spambayes In directory usw-pr-cvs1:/tmp/cvs-serv29445 Modified Files: table.py Log Message: Fixed table.py to stagger ham:spam ratios across two lines if the entries are 8 characters or longer. Index: table.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/table.py,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** table.py 21 Oct 2002 21:18:55 -0000 1.2 --- table.py 26 Oct 2002 02:07:06 -0000 1.3 *************** *** 104,107 **** --- 104,108 ---- ratio = "ham:spam: " + rat2 = " " fptot = "fp total: " fpper = "fp %: " *************** *** 124,128 **** (htest, stest, fp, fn, un, fpp, fnp, unp, cost, bestcost, hamdevall, spamdevall) = suck(file(filename)) ! 
ratio += "%8s" % ("%d:%d" % (htest, stest)) fptot += "%8d" % fp fpper += "%8.2f" % fpp --- 125,136 ---- (htest, stest, fp, fn, un, fpp, fnp, unp, cost, bestcost, hamdevall, spamdevall) = suck(file(filename)) ! if len(ratio) > len(rat2): ! ratio += " " ! ratio = ratio[0:(len(rat2) + 8)] ! rat2 += " %7s" % ("%d:%d" % (htest, stest)) ! else: ! rat2 += " " ! rat2 = rat2[0:(len(ratio) + 8)] ! ratio += " %7s" % ("%d:%d" % (htest, stest)) fptot += "%8d" % fp fpper += "%8.2f" % fpp *************** *** 142,145 **** --- 150,155 ---- print ratio + if ":"[0] in rat2: + print rat2 print fptot print fpper From tim_one@users.sourceforge.net Sat Oct 26 04:41:17 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Fri, 25 Oct 2002 20:41:17 -0700 Subject: [Spambayes-checkins] spambayes Options.py,1.54,1.55 classifier.py,1.41,1.42 Message-ID: Update of /cvsroot/spambayes/spambayes In directory usw-pr-cvs1:/tmp/cvs-serv20721 Modified Files: Options.py classifier.py Log Message: Gave a named option (use_gary_combining) to the still-default combining scheme. Index: Options.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Options.py,v retrieving revision 1.54 retrieving revision 1.55 diff -C2 -d -r1.54 -r1.55 *** Options.py 25 Oct 2002 16:34:16 -0000 1.54 --- Options.py 26 Oct 2002 03:41:15 -0000 1.55 *************** *** 248,251 **** --- 248,258 ---- robinson_minimum_prob_strength: 0.1 + # The combining scheme currently detailed on Gary Robinon's web page. + # The middle ground here is touchy, varying across corpus, and within + # a corpus across amounts of training data. It almost never gives extreme + # scores (near 0.0 or 1.0), but the tail ends of the ham and spam + # distributions overlap. + use_gary_combining: True + # For vectors of random, uniformly distributed probabilities, -2*sum(ln(p_i)) # follows the chi-squared distribution with 2*n degrees of freedom. 
That's *************** *** 262,266 **** # In practice, it appears that setting ham_cutoff=0.05, and spam_cutoff=0.95, # does well across test sets; while these cutoffs are rarely optimal, they ! # get close to optimal. use_chi_squared_combining: False --- 269,275 ---- # In practice, it appears that setting ham_cutoff=0.05, and spam_cutoff=0.95, # does well across test sets; while these cutoffs are rarely optimal, they ! # get close to optimal. With more training data, Tim has had good luck ! # with ham_cutoff=0.30 and spam_cutoff=0.80 across three test data sets ! # (original c.l.p data, his own email, and newer general python.org traffic). use_chi_squared_combining: False *************** *** 319,322 **** --- 328,332 ---- 'robinson_probability_s': float_cracker, 'robinson_minimum_prob_strength': float_cracker, + 'use_gary_combining': boolean_cracker, 'use_chi_squared_combining': boolean_cracker, Index: classifier.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/classifier.py,v retrieving revision 1.41 retrieving revision 1.42 diff -C2 -d -r1.41 -r1.42 *** classifier.py 18 Oct 2002 21:38:16 -0000 1.41 --- classifier.py 26 Oct 2002 03:41:15 -0000 1.42 *************** *** 20,24 **** # 1, whether or not the classification was correct. The false positives # and false negatives under Gary's scheme generally score in a narrow range ! # around the corpus's best spam_cutoff value # # This implementation is due to Tim Peters et alia. --- 20,29 ---- # 1, whether or not the classification was correct. The false positives # and false negatives under Gary's scheme generally score in a narrow range ! # around the corpus's best spam_cutoff value. ! # ! # THe chi-combining scheme here gets closer to the theoretical basis of ! # Gary's combining scheme, and does give extreme scores, but also has a ! # very useful middle ground (small # of msgs spread across a large range ! # of scores). 
# # This implementation is due to Tim Peters et alia. *************** *** 169,173 **** return prob ! spamprob = gary_spamprob # may be replaced by one of the next ones # Across vectors of length n, containing random uniformly-distributed --- 174,179 ---- return prob ! if options.use_gary_combining: ! spamprob = gary_spamprob # Across vectors of length n, containing random uniformly-distributed From tim_one@users.sourceforge.net Sat Oct 26 06:30:41 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Fri, 25 Oct 2002 22:30:41 -0700 Subject: [Spambayes-checkins] spambayes/Outlook2000 manager.py,1.25,1.26 msgstore.py,1.10,1.11 Message-ID: Update of /cvsroot/spambayes/spambayes/Outlook2000 In directory usw-pr-cvs1:/tmp/cvs-serv9773/Outlook2000 Modified Files: manager.py msgstore.py Log Message: I'm out of my depth here. Creating an PT_I4 integer score does work after all (provided I remove the 2.3-ism relying on "bool is not int"), BUT Outlook displays using it just don't work unless I run around manually adding a custom Integer field of the same name to every folder in which I want to see the thing. I still can't make sense of this. Index: manager.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/manager.py,v retrieving revision 1.25 retrieving revision 1.26 diff -C2 -d -r1.25 -r1.26 *** manager.py 25 Oct 2002 18:47:07 -0000 1.25 --- manager.py 26 Oct 2002 05:30:39 -0000 1.26 *************** *** 205,216 **** score = result score = int(round(score * 100.0)) - - # XXX If I actually return an int, the SpamProb (Hammie, whatever) - # XXX custom field shows up as blank. I haven't been able to figure - # XXX out why. Returning a float here sucks, as it gets displayed - # XXX with a useless ".00" tacked on to the end -- but at least it - # XXX isn't blank! 
- score = float(score) - if evidence: return score, the_evidence --- 205,208 ---- Index: msgstore.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/msgstore.py,v retrieving revision 1.10 retrieving revision 1.11 diff -C2 -d -r1.10 -r1.11 *** msgstore.py 25 Oct 2002 20:06:10 -0000 1.10 --- msgstore.py 26 Oct 2002 05:30:39 -0000 1.11 *************** *** 168,172 **** type(''): PT_STRING8, type(u''): PT_UNICODE, ! type(1==1): PT_BOOLEAN, } --- 168,173 ---- type(''): PT_STRING8, type(u''): PT_UNICODE, ! # In Python 2.2.2, bool isn't a distinct type (type(1==1) is type(0)). ! # type(1==1): PT_BOOLEAN, } *************** *** 314,319 **** # XXX that type_tag is 3 then, and that matches the defn of PT_I4 in # XXX my system header files. self._EnsureObject() ! if type(prop)!=type(0): props = ( (mapi.PS_PUBLIC_STRINGS, prop), ) propIds = self.mapi_object.GetIDsFromNames(props, mapi.MAPI_CREATE) --- 315,323 ---- # XXX that type_tag is 3 then, and that matches the defn of PT_I4 in # XXX my system header files. + # XXX Later: This works after all, but the field shows up as all + # XXX blanks unless I *first* modify the view (like Messages) in + # XXX Outlook to define a custom Integer field of the same name. self._EnsureObject() ! if type(prop) != type(0): props = ( (mapi.PS_PUBLIC_STRINGS, prop), ) propIds = self.mapi_object.GetIDsFromNames(props, mapi.MAPI_CREATE) From popiel@users.sourceforge.net Sat Oct 26 16:30:26 2002 From: popiel@users.sourceforge.net (T. Alexander Popiel) Date: Sat, 26 Oct 2002 08:30:26 -0700 Subject: [Spambayes-checkins] spambayes table.py,1.3,1.4 Message-ID: Update of /cvsroot/spambayes/spambayes In directory usw-pr-cvs1:/tmp/cvs-serv15541 Modified Files: table.py Log Message: Added filename headers to the table. This will be useful for tables where the variables are things like classifier or tokenizer schemes. 
Index: table.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/table.py,v retrieving revision 1.3 retrieving revision 1.4 diff -C2 -d -r1.3 -r1.4 *** table.py 26 Oct 2002 02:07:06 -0000 1.3 --- table.py 26 Oct 2002 15:30:23 -0000 1.4 *************** *** 103,106 **** --- 103,108 ---- return fn + fname = "filename: " + fnam2 = " " ratio = "ham:spam: " rat2 = " " *************** *** 125,128 **** --- 127,142 ---- (htest, stest, fp, fn, un, fpp, fnp, unp, cost, bestcost, hamdevall, spamdevall) = suck(file(filename)) + if filename.endswith('.txt'): + filename = filename[:-4] + filename = filename[filename.rfind('/')+1:] + filename = filename[filename.rfind("\\")+1:] + if len(fname) > len(fnam2): + fname += " " + fname = fname[0:(len(fnam2) + 8)] + fnam2 += " %7s" % filename + else: + fnam2 += " " + fnam2 = fnam2[0:(len(fname) + 8)] + fname += " %7s" % filename if len(ratio) > len(rat2): ratio += " " *************** *** 149,154 **** kval += "%8.2f" % k print ratio ! if ":"[0] in rat2: print rat2 print fptot --- 163,171 ---- kval += "%8.2f" % k + print fname + if len(fnam2.strip()) > 0: + print fnam2 print ratio ! if len(rat2.strip()) > 0: print rat2 print fptot From tim_one@users.sourceforge.net Sat Oct 26 17:01:17 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Sat, 26 Oct 2002 09:01:17 -0700 Subject: [Spambayes-checkins] spambayes Options.py,1.55,1.56 classifier.py,1.42,1.43 Message-ID: Update of /cvsroot/spambayes/spambayes In directory usw-pr-cvs1:/tmp/cvs-serv2382 Modified Files: Options.py classifier.py Log Message: Removed option use_mixed_combining and its supporting option mixed_combining_chi_weight It hasn't worked better for anyone. This leaves us with just gary- and chi-combining. 
Index: Options.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Options.py,v retrieving revision 1.55 retrieving revision 1.56 diff -C2 -d -r1.55 -r1.56 *** Options.py 26 Oct 2002 03:41:15 -0000 1.55 --- Options.py 26 Oct 2002 16:01:13 -0000 1.56 *************** *** 273,280 **** # (original c.l.p data, his own email, and newer general python.org traffic). use_chi_squared_combining: False - - # Use a weighted average of chi-combining and gary-combining. - use_mixed_combining: False - mixed_combining_chi_weight: 0.9 """ --- 273,276 ---- *************** *** 330,336 **** 'use_gary_combining': boolean_cracker, 'use_chi_squared_combining': boolean_cracker, - - 'use_mixed_combining': boolean_cracker, - 'mixed_combining_chi_weight': float_cracker, }, } --- 326,329 ---- Index: classifier.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/classifier.py,v retrieving revision 1.42 retrieving revision 1.43 diff -C2 -d -r1.42 -r1.43 *** classifier.py 26 Oct 2002 03:41:15 -0000 1.42 --- classifier.py 26 Oct 2002 16:01:14 -0000 1.43 *************** *** 22,26 **** # around the corpus's best spam_cutoff value. # ! # THe chi-combining scheme here gets closer to the theoretical basis of # Gary's combining scheme, and does give extreme scores, but also has a # very useful middle ground (small # of msgs spread across a large range --- 22,26 ---- # around the corpus's best spam_cutoff value. # ! # The chi-combining scheme here gets closer to the theoretical basis of # Gary's combining scheme, and does give extreme scores, but also has a # very useful middle ground (small # of msgs spread across a large range *************** *** 35,39 **** from Options import options ! if options.use_chi_squared_combining or options.use_mixed_combining: from chi2 import chi2Q LN2 = math.log(2) --- 35,39 ---- from Options import options ! 
if options.use_chi_squared_combining: from chi2 import chi2Q LN2 = math.log(2) *************** *** 264,338 **** if options.use_chi_squared_combining: spamprob = chi2_spamprob - - # This is a weighted average of the other two. In extreme cases, they - # often seem to disagree on how "certain" they are. Mixing softens - # the extremes, pushing even some very hard cases into the middle ground. - def mixed_spamprob(self, wordstream, evidence=False): - """Return best-guess probability that wordstream is spam. - - wordstream is an iterable object producing words. - The return value is a float in [0.0, 1.0]. - - If optional arg evidence is True, the return value is a pair - probability, evidence - where evidence is a list of (word, probability) pairs. - """ - - from math import frexp, log as ln - - H = S = 1.0 - Hexp = Sexp = 0 - - clues = self._getclues(wordstream) - for prob, word, record in clues: - if record is not None: # else wordinfo doesn't know about it - record.killcount += 1 - S *= 1.0 - prob - H *= prob - if S < 1e-200: # prevent underflow - S, e = frexp(S) - Sexp += e - if H < 1e-200: # prevent underflow - H, e = frexp(H) - Hexp += e - - n = len(clues) - if n: - nrecip = 1.0 / n - P = 1.0 - S**nrecip * 2.0**(Sexp * nrecip) - Q = 1.0 - H**nrecip * 2.0**(Hexp * nrecip) - - S = ln(S) + Sexp * LN2 - H = ln(H) + Hexp * LN2 - S = 1.0 - chi2Q(-2.0 * S, 2*n) - H = 1.0 - chi2Q(-2.0 * H, 2*n) - - else: - P = Q = S = H = 1.0 - - gary_score = P/(P+Q) - chi_score = (S-H + 1.0) / 2.0 - - w = options.mixed_combining_chi_weight - prob = w * chi_score + (1.0 - w) * gary_score - - if evidence: - clues = [(w, p) for p, w, r in clues] - clues.sort(lambda a, b: cmp(a[1], b[1])) - extra = [('*chi_score*', chi_score), - ('*gary_score*', gary_score), - ('*S*', S), - ('*H*', H), - ('*P*', P), - ('*Q*', Q), - ('*n*', n), - ] - clues[0:0] = extra - return prob, clues - else: - return prob - - if options.use_mixed_combining: - spamprob = mixed_spamprob def learn(self, wordstream, 
is_spam, update_probabilities=True): --- 264,267 ---- From tim_one@users.sourceforge.net Sat Oct 26 17:15:46 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Sat, 26 Oct 2002 09:15:46 -0700 Subject: [Spambayes-checkins] spambayes Options.py,1.56,1.57 tokenizer.py,1.48,1.49 Message-ID: Update of /cvsroot/spambayes/spambayes In directory usw-pr-cvs1:/tmp/cvs-serv11212 Modified Files: Options.py tokenizer.py Log Message: Removed option ignore_redundant_html. This made some kind of sense in the early c.l.py tests, before we stripped HTML tags; it doesn't make sense anymore. Index: Options.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Options.py,v retrieving revision 1.56 retrieving revision 1.57 diff -C2 -d -r1.56 -r1.57 *** Options.py 26 Oct 2002 16:01:13 -0000 1.56 --- Options.py 26 Oct 2002 16:15:38 -0000 1.57 *************** *** 38,53 **** # sign of HTML is so despised on tech lists; however, the advantage # of setting it true eventually vanishes even there given enough ! # training data. If you set this true, you should almost certainly set ! # ignore_redundant_html true too. retain_pure_html_tags: False - # If true, when a multipart/alternative has both text/plain and text/html - # sections, the text/html section is ignored. That's likely a dubious - # idea in general, so false is likely a better idea here. In the c.l.py - # tests, it helped a lot when retain_pure_html_tags was true (in that case, - # keeping the HTML tags in the "redundant" HTML was almost certain to score - # the multipart/alternative as spam, regardless of content). - ignore_redundant_html: False - # If true, the first few characters of application/octet-stream sections # are used, undecoded. What 'few' means is decided by octet_prefix_size. --- 38,44 ---- # sign of HTML is so despised on tech lists; however, the advantage # of setting it true eventually vanishes even there given enough ! # training data. 
retain_pure_html_tags: False # If true, the first few characters of application/octet-stream sections # are used, undecoded. What 'few' means is decided by octet_prefix_size. *************** *** 282,286 **** all_options = { 'Tokenizer': {'retain_pure_html_tags': boolean_cracker, - 'ignore_redundant_html': boolean_cracker, 'safe_headers': ('get', lambda s: Set(s.split())), 'count_all_header_lines': boolean_cracker, --- 273,276 ---- Index: tokenizer.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/tokenizer.py,v retrieving revision 1.48 retrieving revision 1.49 diff -C2 -d -r1.48 -r1.49 *** tokenizer.py 25 Oct 2002 16:34:19 -0000 1.48 --- tokenizer.py 26 Oct 2002 16:15:40 -0000 1.49 *************** *** 490,494 **** # the text/plain and text/html alternatives may have entirely different # content. options.ignore_redundant_html was introduced to control this, ! # and it defaults to False. ############################################################################## --- 490,494 ---- # the text/plain and text/html alternatives may have entirely different # content. options.ignore_redundant_html was introduced to control this, ! # and it defaults to False. Later: ignore_redundant_html was removed. ############################################################################## *************** *** 514,562 **** # textparts(msg) returns a set containing all the text components of msg. ! # There's no point decoding binary blobs (like images). ! ! if options.ignore_redundant_html: ! # If a multipart/alternative has both plain text and HTML versions of a ! # msg, ignore the HTML part: HTML decorations have monster-high spam ! # probabilities, and innocent newbies often post using HTML. ! def textparts(msg): ! text = Set() ! redundant_html = Set() ! for part in msg.walk(): ! if part.get_type() == 'multipart/alternative': ! # Descend this part of the tree, adding any redundant HTML text ! # part to redundant_html. ! 
htmlpart = textpart = None ! stack = part.get_payload()[:] ! while stack: ! subpart = stack.pop() ! ctype = subpart.get_type('text/plain') ! if ctype == 'text/plain': ! textpart = subpart ! elif ctype == 'text/html': ! htmlpart = subpart ! elif ctype == 'multipart/related': ! stack.extend(subpart.get_payload()) ! ! if textpart is not None: ! text.add(textpart) ! if htmlpart is not None: ! redundant_html.add(htmlpart) ! elif htmlpart is not None: ! text.add(htmlpart) ! ! elif part.get_content_maintype() == 'text': ! text.add(part) ! ! return text - redundant_html ! ! else: ! # Use all text parts. If a text/plain and text/html part happen to ! # have redundant content, so it goes. ! def textparts(msg): ! return Set(filter(lambda part: part.get_content_maintype() == 'text', ! msg.walk())) def octetparts(msg): return Set(filter(lambda part: part.get_type() == 'application/octet-stream', --- 514,528 ---- # textparts(msg) returns a set containing all the text components of msg. ! # There's no point decoding binary blobs (like images). If a text/plain ! # and text/html part happen to have redundant content, it doesn't matter ! # to results, since training and scoring are done on the set of all ! # words in the msg, without regard to how many times a given word appears. ! def textparts(msg): ! """Return a set of all msg parts with content maintype 'text'.""" ! return Set(filter(lambda part: part.get_content_maintype() == 'text', ! msg.walk())) def octetparts(msg): + """Return a set of all msg parts with type 'application/octet-stream'.""" return Set(filter(lambda part: part.get_type() == 'application/octet-stream', *************** *** 1056,1064 **** it's recommended to leave that at its default of false. - If a multipart/alternative section has both text/plain and text/html - sections, options.ignore_redundant_html controls whether the HTML - part is ignored. Except in special cases, it's recommended to - leave that at its default of false. 
- If options.check_octets is True, the first few undecoded characters of application/octet-stream parts of the message body become tokens. --- 1022,1025 ---- *************** *** 1067,1071 **** if options.check_octets: # Find, decode application/octet-stream parts of the body, ! # tokenizing the first few characters of each chunk for part in octetparts(msg): text = part.get_payload(decode=False) --- 1028,1032 ---- if options.check_octets: # Find, decode application/octet-stream parts of the body, ! # tokenizing the first few characters of each chunk. for part in octetparts(msg): text = part.get_payload(decode=False) From tim_one@users.sourceforge.net Sat Oct 26 18:11:39 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Sat, 26 Oct 2002 10:11:39 -0700 Subject: [Spambayes-checkins] spambayes tokenizer.py,1.49,1.50 Message-ID: Update of /cvsroot/spambayes/spambayes In directory usw-pr-cvs1:/tmp/cvs-serv13112 Modified Files: tokenizer.py Log Message: tokenize_body(): The check_octets option wasn't actually decoding the octet-stream. Repaired that and beefed it up. In my python.org corpus, it does generate a few useful clues, but there were enough other clues in those cases that it didn't change the error or unsure rates. Index: tokenizer.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/tokenizer.py,v retrieving revision 1.49 retrieving revision 1.50 diff -C2 -d -r1.49 -r1.50 *** tokenizer.py 26 Oct 2002 16:15:40 -0000 1.49 --- tokenizer.py 26 Oct 2002 17:11:37 -0000 1.50 *************** *** 1030,1034 **** # tokenizing the first few characters of each chunk. for part in octetparts(msg): ! text = part.get_payload(decode=False) yield "octet:%s" % text[:options.octet_prefix_size] --- 1030,1043 ---- # tokenizing the first few characters of each chunk. for part in octetparts(msg): ! try: ! text = part.get_payload(decode=True) ! except: ! yield "control: couldn't decode octet" ! 
text = part.get_payload(decode=False) ! ! if text is None: ! yield "control: octet payload is None" ! continue ! yield "octet:%s" % text[:options.octet_prefix_size] From tim_one@users.sourceforge.net Sun Oct 27 03:43:00 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Sat, 26 Oct 2002 20:43:00 -0700 Subject: [Spambayes-checkins] spambayes Options.py,1.57,1.58 classifier.py,1.43,1.44 Message-ID: Update of /cvsroot/spambayes/spambayes In directory usw-pr-cvs1:/tmp/cvs-serv22765 Modified Files: Options.py classifier.py Log Message: Make chi-combining the default. Add [Classifier] use_chi_squared_combining: False use_gary_combining: True if you want to use the former default for scoring. The combining scheme is purely a scoring-time decision. It has no effect on training; there's no need to retrain your database. Index: Options.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Options.py,v retrieving revision 1.57 retrieving revision 1.58 diff -C2 -d -r1.57 -r1.58 *** Options.py 26 Oct 2002 16:15:38 -0000 1.57 --- Options.py 27 Oct 2002 03:42:58 -0000 1.58 *************** *** 111,120 **** # ham_cutoff > spam_cutoff doesn't make sense. # ! # The defaults are for the all-default Robinson scheme, which makes a ! # binary decision with no middle ground. The precise value that works ! # best is corpus-dependent, and values into the .600's have been known ! # to work best on some data. ! ham_cutoff: 0.560 ! spam_cutoff: 0.560 # Number of buckets in histograms. --- 111,128 ---- # ham_cutoff > spam_cutoff doesn't make sense. # ! # The defaults here (.2 and .9) may be appropriate for the default chi- ! # combining scheme. Cutoffs for chi-combining typically aren't touchy, ! # provided you're willing to settle for "really good" instead of "optimal". ! # Tim found that .3 and .8 worked very well for well-trained systems on ! # his personal email, and his large comp.lang.python test. If just beginning !
# training, or extremely fearful of mistakes, 0.05 and 0.95 may be more ! # appropriate for you. ! # ! # Picking good values for gary-combining is much harder, and appears to be ! # corpus-dependent, and within a single corpus dependent on how much ! # training has been done. Values from 0.50 thru the low 0.60's have been ! # reported to work best by various testers on their data. ! ham_cutoff: 0.20 ! spam_cutoff: 0.90 # Number of buckets in histograms. *************** *** 244,248 **** # scores (near 0.0 or 1.0), but the tail ends of the ham and spam # distributions overlap. ! use_gary_combining: True # For vectors of random, uniformly distributed probabilities, -2*sum(ln(p_i)) --- 252,256 ---- # scores (near 0.0 or 1.0), but the tail ends of the ham and spam # distributions overlap. ! use_gary_combining: False # For vectors of random, uniformly distributed probabilities, -2*sum(ln(p_i)) *************** *** 263,267 **** # with ham_cutoff=0.30 and spam_cutoff=0.80 across three test data sets # (original c.l.p data, his own email, and newer general python.org traffic). ! use_chi_squared_combining: False """ --- 271,275 ---- # with ham_cutoff=0.30 and spam_cutoff=0.80 across three test data sets # (original c.l.p data, his own email, and newer general python.org traffic). ! use_chi_squared_combining: True """ Index: classifier.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/classifier.py,v retrieving revision 1.43 retrieving revision 1.44 diff -C2 -d -r1.43 -r1.44 *** classifier.py 26 Oct 2002 16:01:14 -0000 1.43 --- classifier.py 27 Oct 2002 03:42:58 -0000 1.44 *************** *** 9,14 **** # rates over Paul's original description. # ! # This code implements Gary Robinson's suggestions, which are well explained ! # on his webpage: # # http://radio.weblogs.com/0101454/stories/2002/09/16/spamDetection.html --- 9,14 ---- # rates over Paul's original description. # ! 
# This code implements Gary Robinson's suggestions, the core of which are ! # well explained on his webpage: # # http://radio.weblogs.com/0101454/stories/2002/09/16/spamDetection.html *************** *** 19,29 **** # the scores under Paul's scheme were almost always very near 0 or very near # 1, whether or not the classification was correct. The false positives ! # and false negatives under Gary's scheme generally score in a narrow range ! # around the corpus's best spam_cutoff value. # ! # The chi-combining scheme here gets closer to the theoretical basis of ! # Gary's combining scheme, and does give extreme scores, but also has a ! # very useful middle ground (small # of msgs spread across a large range ! # of scores). # # This implementation is due to Tim Peters et alia. --- 19,31 ---- # the scores under Paul's scheme were almost always very near 0 or very near # 1, whether or not the classification was correct. The false positives ! # and false negatives under Gary's basic scheme (use_gary_combining) generally ! # score in a narrow range around the corpus's best spam_cutoff value. ! # However, it doesn't appear possible to guess the best spam_cutoff value in ! # advance, and it's touchy. # ! # The chi-combining scheme used by default here gets closer to the theoretical ! # basis of Gary's combining scheme, and does give extreme scores, but also ! # has a very useful middle ground (small # of msgs spread across a large range ! # of scores, and good cutoff values aren't touchy). # # This implementation is due to Tim Peters et alia. *************** *** 34,45 **** from Options import options ! ! if options.use_chi_squared_combining: ! from chi2 import chi2Q ! LN2 = math.log(2) ! ! # The maximum number of extreme words to look at in a msg, where "extreme" ! # means with spamprob farthest away from 0.5. ! MAX_DISCRIMINATORS = options.max_discriminators # 150 PICKLE_VERSION = 1 --- 36,41 ---- from Options import options ! from chi2 import chi2Q ! 
LN2 = math.log(2) # used frequently by chi-combining PICKLE_VERSION = 1 From tim_one@users.sourceforge.net Sun Oct 27 03:43:00 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Sat, 26 Oct 2002 20:43:00 -0700 Subject: [Spambayes-checkins] spambayes/Outlook2000 default_bayes_customize.ini,1.3,1.4 Message-ID: Update of /cvsroot/spambayes/spambayes/Outlook2000 In directory usw-pr-cvs1:/tmp/cvs-serv22765/Outlook2000 Modified Files: default_bayes_customize.ini Log Message: Make chi-combining the default. Add [Classifier] use_chi_combining: False use_gary_combining: True if you want to use the former default for scoring. The combining scheme is purely a scoring-time decision. It has no effect on training; there's no need to retrain your database. Index: default_bayes_customize.ini =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/default_bayes_customize.ini,v retrieving revision 1.3 retrieving revision 1.4 diff -C2 -d -r1.3 -r1.4 *** default_bayes_customize.ini 25 Oct 2002 16:34:23 -0000 1.3 --- default_bayes_customize.ini 27 Oct 2002 03:42:58 -0000 1.4 *************** *** 15,17 **** [Classifier] ! #use_chi_squared_combining: True --- 15,20 ---- [Classifier] ! # Uncomment the next lines if you want to use the former default for ! # scoring. ! #use_chi_squared_combining: False ! #use_gary_combining: True From tim_one@users.sourceforge.net Sun Oct 27 03:59:54 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Sat, 26 Oct 2002 20:59:54 -0700 Subject: [Spambayes-checkins] spambayes hammie.py,1.29,1.30 Message-ID: Update of /cvsroot/spambayes/spambayes In directory usw-pr-cvs1:/tmp/cvs-serv32422 Modified Files: hammie.py Log Message: A patch from Rob Hooft, teaching hammie about ham_cutoff too, and introducing an Unsure X-Hammie-Disposition. I haven't used hammie, so someone who does should check this. 
I felt it was important to get this in ASAP, since I just changed the default scheme to chi-combining, and it's important to be aware of its middle ground. Index: hammie.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/hammie.py,v retrieving revision 1.29 retrieving revision 1.30 diff -C2 -d -r1.29 -r1.30 *** hammie.py 6 Oct 2002 23:07:23 -0000 1.29 --- hammie.py 27 Oct 2002 03:59:52 -0000 1.30 *************** *** 60,63 **** --- 60,64 ---- # Probability at which a message is considered spam SPAM_THRESHOLD = options.spam_cutoff + HAM_THRESHOLD = options.ham_cutoff # Tim's tokenizer kicks far more booty than anything I would have *************** *** 228,232 **** traceback.print_exc() ! def filter(self, msg, header=DISPHEADER, cutoff=SPAM_THRESHOLD): """Score (judge) a message and add a disposition header. --- 229,234 ---- traceback.print_exc() ! def filter(self, msg, header=DISPHEADER, spam_cutoff=SPAM_THRESHOLD, ! ham_cutoff=HAM_THRESHOLD): """Score (judge) a message and add a disposition header. *************** *** 246,253 **** msg = email.message_from_string(msg) prob, clues = self._scoremsg(msg, True) ! if prob < cutoff: disp = "No" ! else: disp = "Yes" disp += "; %.2f" % prob disp += "; " + self.formatclues(clues) --- 248,257 ---- msg = email.message_from_string(msg) prob, clues = self._scoremsg(msg, True) ! if prob < ham_cutoff: disp = "No" ! elif prob > spam_cutoff: disp = "Yes" + else: + disp = "Unsure" disp += "; %.2f" % prob disp += "; " + self.formatclues(clues) From tim_one@users.sourceforge.net Sun Oct 27 05:13:57 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Sat, 26 Oct 2002 22:13:57 -0700 Subject: [Spambayes-checkins] spambayes hammiecli.py,1.1,1.2 hammiesrv.py,1.7,1.8 Message-ID: Update of /cvsroot/spambayes/spambayes In directory usw-pr-cvs1:/tmp/cvs-serv11299 Modified Files: hammiecli.py hammiesrv.py Log Message: Whitespace normalization. 
Index: hammiecli.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/hammiecli.py,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** hammiecli.py 17 Oct 2002 18:19:41 -0000 1.1 --- hammiecli.py 27 Oct 2002 05:13:54 -0000 1.2 *************** *** 28,30 **** if __name__ == "__main__": main() - --- 28,29 ---- Index: hammiesrv.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/hammiesrv.py,v retrieving revision 1.7 retrieving revision 1.8 diff -C2 -d -r1.7 -r1.8 *** hammiesrv.py 17 Oct 2002 21:30:13 -0000 1.7 --- hammiesrv.py 27 Oct 2002 05:13:55 -0000 1.8 *************** *** 72,76 **** response = (response,) except: ! traceback.print_exc() # report exception back to server response = xmlrpclib.dumps( --- 72,76 ---- response = (response,) except: ! traceback.print_exc() # report exception back to server response = xmlrpclib.dumps( From tim_one@users.sourceforge.net Sun Oct 27 05:13:57 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Sat, 26 Oct 2002 22:13:57 -0700 Subject: [Spambayes-checkins] spambayes/Outlook2000/dialogs FolderSelector.py,1.3,1.4 RuleDialog.py,1.5,1.6 Message-ID: Update of /cvsroot/spambayes/spambayes/Outlook2000/dialogs In directory usw-pr-cvs1:/tmp/cvs-serv11299/Outlook2000/dialogs Modified Files: FolderSelector.py RuleDialog.py Log Message: Whitespace normalization. Index: FolderSelector.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/dialogs/FolderSelector.py,v retrieving revision 1.3 retrieving revision 1.4 diff -C2 -d -r1.3 -r1.4 *** FolderSelector.py 25 Oct 2002 06:58:20 -0000 1.3 --- FolderSelector.py 27 Oct 2002 05:13:55 -0000 1.4 *************** *** 64,68 **** children.append(spec) return children ! 
def BuildFolderTreeMAPI(session): root = FolderSpec(None, "root") --- 64,68 ---- children.append(spec) return children ! def BuildFolderTreeMAPI(session): root = FolderSpec(None, "root") Index: RuleDialog.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/dialogs/RuleDialog.py,v retrieving revision 1.5 retrieving revision 1.6 diff -C2 -d -r1.5 -r1.6 *** RuleDialog.py 25 Oct 2002 17:47:37 -0000 1.5 --- RuleDialog.py 27 Oct 2002 05:13:55 -0000 1.6 *************** *** 189,193 **** if __name__=='__main__': # This doesn't work ! class Rule: def __init__(self): --- 189,193 ---- if __name__=='__main__': # This doesn't work ! class Rule: def __init__(self): From tim_one@users.sourceforge.net Sun Oct 27 05:13:57 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Sat, 26 Oct 2002 22:13:57 -0700 Subject: [Spambayes-checkins] spambayes/Outlook2000 train.py,1.9,1.10 Message-ID: Update of /cvsroot/spambayes/spambayes/Outlook2000 In directory usw-pr-cvs1:/tmp/cvs-serv11299/Outlook2000 Modified Files: train.py Log Message: Whitespace normalization. Index: train.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/train.py,v retrieving revision 1.9 retrieving revision 1.10 diff -C2 -d -r1.9 -r1.10 *** train.py 25 Oct 2002 15:20:25 -0000 1.9 --- train.py 27 Oct 2002 05:13:55 -0000 1.10 *************** *** 30,34 **** mgr.bayes_dirty = True return True ! def train_folder( f, isspam, mgr, progress): num = num_added = 0 --- 30,34 ---- mgr.bayes_dirty = True return True ! 
def train_folder( f, isspam, mgr, progress): num = num_added = 0 From montanaro@users.sourceforge.net Sun Oct 27 05:26:03 2002 From: montanaro@users.sourceforge.net (Skip Montanaro) Date: Sat, 26 Oct 2002 22:26:03 -0700 Subject: [Spambayes-checkins] spambayes Options.py,1.58,1.59 tokenizer.py,1.50,1.51 Message-ID: Update of /cvsroot/spambayes/spambayes In directory usw-pr-cvs1:/tmp/cvs-serv15741 Modified Files: Options.py tokenizer.py Log Message: Add two new options: extract_dow and generate_time_buckets. I see an ever-so-slight improvement using a 10x10 cv grid w/ 200 ham and spam in each set. Checking in so others can test it. The extract_dow option causes tokens like dow:0 (which is Monday) to be generated. The generate_time_buckets option causes tokens like time:12:3 to be generated. (12 is the hour and 3 indicates the minutes was in the fourth six-minute bucket.) I've done nothing to test the bucket granularity. Lawyers bill in six-minute increments. Seemed good enough for me. ;-) Index: Options.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Options.py,v retrieving revision 1.58 retrieving revision 1.59 diff -C2 -d -r1.58 -r1.59 *** Options.py 27 Oct 2002 03:42:58 -0000 1.58 --- Options.py 27 Oct 2002 05:26:01 -0000 1.59 *************** *** 91,94 **** --- 91,101 ---- generate_long_skips: True + # Generate tokens which resemble the posting time in 6-minute buckets: + # int((h*60+m)/10). + generate_time_buckets: False + + # Extract day of the week tokens from the Date: header. + extract_dow: False + # If true, replace high-bit characters (ord(c) >= 128) and control characters # with question marks. 
This allows non-ASCII character strings to be *************** *** 284,287 **** --- 291,296 ---- 'count_all_header_lines': boolean_cracker, 'generate_long_skips': boolean_cracker, + 'extract_dow': boolean_cracker, + 'generate_time_buckets': boolean_cracker, 'mine_received_headers': boolean_cracker, 'check_octets': boolean_cracker, Index: tokenizer.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/tokenizer.py,v retrieving revision 1.50 retrieving revision 1.51 diff -C2 -d -r1.50 -r1.51 *** tokenizer.py 26 Oct 2002 17:11:37 -0000 1.50 --- tokenizer.py 27 Oct 2002 05:26:01 -0000 1.51 *************** *** 9,12 **** --- 9,13 ---- import re import math + import time from sets import Set *************** *** 839,842 **** --- 840,852 ---- class Tokenizer: + date_hms_re = re.compile(r' (?P[0-9][0-9]):' + r'(?P[0-9][0-9]):' + r'(?P[0-9][0-9]) ') + + date_formats = ("%a, %d %b %Y %H:%M:%S (%Z)", + "%a, %d %b %Y %H:%M:%S %Z", + "%d %b %Y %H:%M:%S (%Z)", + "%d %b %Y %H:%M:%S %Z") + def __init__(self): if options.basic_header_tokenize: *************** *** 982,985 **** --- 992,1020 ---- for tok in breakdown(m.group(1).lower()): yield 'received:' + tok + + # Date: + if options.generate_time_buckets: + for header in msg.get_all("date", ()): + mat = self.date_hms_re.search(header) + # return the time in Date: headers arranged in + # six-minute buckets + if mat is not None: + h = int(mat.group('hour')) + bucket = int(mat.group('minute')) // 10 + yield 'time:%02d:%d' % (h, bucket) + + if options.extract_dow: + for header in msg.get_all("date", ()): + # extract the day of the week + for fmt in self.date_formats: + try: + timetuple = time.strptime(header, fmt) + except ValueError: + pass + else: + yield 'dow:%d' % timetuple[6] + else: + # if nothing matches, declare the Date: header invalid + yield 'dow:invalid' # Message-Id: This seems to be a small win and should not From tim_one@users.sourceforge.net Sun Oct 27 16:55:04 2002 
From: tim_one@users.sourceforge.net (Tim Peters) Date: Sun, 27 Oct 2002 08:55:04 -0800 Subject: [Spambayes-checkins] spambayes README.txt,1.38,1.39 setup.py,1.7,1.8 cvcost.py,1.5,NONE Message-ID: Update of /cvsroot/spambayes/spambayes In directory usw-pr-cvs1:/tmp/cvs-serv13311 Modified Files: README.txt setup.py Removed Files: cvcost.py Log Message: Added info about table.py. Removed cvcost.py (histogram analysis does this now, and with finer resolution). Updated assorted parts of README. Index: README.txt =================================================================== RCS file: /cvsroot/spambayes/spambayes/README.txt,v retrieving revision 1.38 retrieving revision 1.39 diff -C2 -d -r1.38 -r1.39 *** README.txt 18 Oct 2002 05:44:04 -0000 1.38 --- README.txt 27 Oct 2002 16:55:00 -0000 1.39 *************** *** 55,73 **** the classifier trains on or predicts against. - Tester.py - A test-driver class that feeds streams of msgs to a classifier - instance, and keeps track of right/wrong percentages, and lists - of false positives and false negatives. - - TestDriver.py - A flexible higher layer of test helpers, building on Tester above. - For example, it's usable for building simple test drivers, NxN test - grids, and N-fold cross validation drivers. See also rates.py and - cmp.py below. - - msgs.py - Some simple classes to wrap raw msgs, and to produce streams of - msgs. The test drivers use these. - chi2.py A collection of statistics functions. --- 55,58 ---- *************** *** 103,106 **** --- 88,109 ---- + Test Driver Core + ================ + Tester.py + A test-driver class that feeds streams of msgs to a classifier + instance, and keeps track of right/wrong percentages, and lists + of false positives and false negatives. + + TestDriver.py + A flexible higher layer of test helpers, building on Tester above. + For example, it's usable for building simple test drivers, NxN test + grids, and N-fold cross-validation drivers. 
See also rates.py, + cmp.py, and table.py below. + + msgs.py + Some simple classes to wrap raw msgs, and to produce streams of + msgs. The test drivers use these. + + Concrete Test Drivers ===================== *************** *** 113,117 **** directory setup (see below)) rather than the specialized mboxtest setup. ! N classifiers are built 1 run is done with each classifier. Each classifier is trained on N-1 sets, and predicts against the sole --- 116,120 ---- directory setup (see below)) rather than the specialized mboxtest setup. ! N classifiers are built. 1 run is done with each classifier. Each classifier is trained on N-1 sets, and predicts against the sole *************** *** 150,153 **** --- 153,160 ---- and the change in average f-p and f-n rates. + table.py + Summarizes the high-order bits from any number of summary files, + in a compact table. + fpfn.py Given one or more TestDriver output files, prints list of false *************** *** 196,209 **** structure (below), you can run this thing, go have some tea while it works, then paste the output to the spambayes list for good karma. - - - Experimental Files - ================== - cvcost.py - A program that analyzes the output of timcv.py (the final histograms) - and optimizes the cost of handling the mail body by defining a "ham" - zone, a "spam" zone and a "grey" zone. It can be tuned by choosing - pseudo-realistic costs to handle a fp, a fn and to handle a message - in the grey zone. --- 203,206 ---- Index: setup.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/setup.py,v retrieving revision 1.7 retrieving revision 1.8 diff -C2 -d -r1.7 -r1.8 *** setup.py 27 Sep 2002 22:30:23 -0000 1.7 --- setup.py 27 Oct 2002 16:55:01 -0000 1.8 *************** *** 17,21 **** 'neiltrain.py', 'cmp.py', ! 'rates.py'], py_modules=['classifier', 'tokenizer', --- 17,23 ---- 'neiltrain.py', 'cmp.py', ! 'table.py', ! 'rates.py', ! 
], py_modules=['classifier', 'tokenizer', *************** *** 25,28 **** 'Tester', 'TestDriver', ! 'mboxutils'] ) --- 27,31 ---- 'Tester', 'TestDriver', ! 'mboxutils', ! ] ) --- cvcost.py DELETED --- From tim_one@users.sourceforge.net Sun Oct 27 17:11:06 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Sun, 27 Oct 2002 09:11:06 -0800 Subject: [Spambayes-checkins] spambayes classifier.py,1.44,1.45 Message-ID: Update of /cvsroot/spambayes/spambayes In directory usw-pr-cvs1:/tmp/cvs-serv18039 Modified Files: classifier.py Log Message: Removed __slots__ from class Bayes, to ease Jeremy's attempts to hook this up to ZODB. Old pickles should continue to load without trouble. Index: classifier.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/classifier.py,v retrieving revision 1.44 retrieving revision 1.45 diff -C2 -d -r1.44 -r1.45 *** classifier.py 27 Oct 2002 03:42:58 -0000 1.44 --- classifier.py 27 Oct 2002 17:11:00 -0000 1.45 *************** *** 81,88 **** class Bayes(object): ! __slots__ = ('wordinfo', # map word to WordInfo record ! 'nspam', # number of spam messages learn() has seen ! 'nham', # number of non-spam messages learn() has seen ! ) def __init__(self): --- 81,94 ---- class Bayes(object): ! # Defining __slots__ here made Jeremy's life needlessly difficult when ! # trying to hook this all up to ZODB as a persistent object. There's ! # no space benefit worth getting from slots in this class; slots were ! # used solely to help catch errors earlier, when this code was changing ! # rapidly. ! ! #__slots__ = ('wordinfo', # map word to WordInfo record ! # 'nspam', # number of spam messages learn() has seen ! # 'nham', # number of non-spam messages learn() has seen ! 
# ) def __init__(self): From tim_one@users.sourceforge.net Sun Oct 27 21:35:02 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Sun, 27 Oct 2002 13:35:02 -0800 Subject: [Spambayes-checkins] spambayes mboxutils.py,1.2,1.3 Message-ID: Update of /cvsroot/spambayes/spambayes In directory usw-pr-cvs1:/tmp/cvs-serv7428 Modified Files: mboxutils.py Log Message: Patch from Toby Dickenson to recognize Maildir directories. Untested! Index: mboxutils.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/mboxutils.py,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** mboxutils.py 4 Oct 2002 19:41:36 -0000 1.2 --- mboxutils.py 27 Oct 2002 21:35:00 -0000 1.3 *************** *** 11,14 **** --- 11,16 ---- /foo/bar/ -- (existing directory) a directory full of .txt and .lorien files + /foo/bar/ -- (existing directory with a cur/ subdirectory) + Maildir mailbox /foo/Mail/bar/ -- (existing directory with /Mail/ in its path) alternative way of spelling an MH mailbox *************** *** 80,86 **** if os.path.isdir(name): ! # XXX Bogus: use an MHMailbox if the pathname contains /Mail/, ! # else a DirOfTxtFileMailbox. ! if name.find("/Mail/") >= 0: mbox = mailbox.MHMailbox(name, _factory) else: --- 82,90 ---- if os.path.isdir(name): ! # XXX Bogus: use a Maildir if /cur is a subdirectory, else a MHMailbox ! # if the pathname contains /Mail/, else a DirOfTxtFileMailbox. ! if os.path.exists(os.path.join(name, 'cur')): ! mbox = mailbox.Maildir(name, _factory) ! 
elif name.find("/Mail/") >= 0: mbox = mailbox.MHMailbox(name, _factory) else: From tim_one@users.sourceforge.net Sun Oct 27 21:40:27 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Sun, 27 Oct 2002 13:40:27 -0800 Subject: [Spambayes-checkins] spambayes hammie.py,1.30,1.31 Message-ID: Update of /cvsroot/spambayes/spambayes In directory usw-pr-cvs1:/tmp/cvs-serv9777 Modified Files: hammie.py Log Message: Patch from Alexander Leidinger to deal w/ multiple -u options correctly. Untested! Index: hammie.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/hammie.py,v retrieving revision 1.30 retrieving revision 1.31 diff -C2 -d -r1.30 -r1.31 *** hammie.py 27 Oct 2002 03:59:52 -0000 1.30 --- hammie.py 27 Oct 2002 21:40:25 -0000 1.31 *************** *** 443,449 **** if len(unknown) > 1: print "Scoring", u ! s, h = score(h, u, reverse) spams += s ! hams += h print "Total %d spam, %d ham" % (spams, hams) --- 443,449 ---- if len(unknown) > 1: print "Scoring", u ! s, g = score(h, u, reverse) spams += s ! 
hams += g print "Total %d spam, %d ham" % (spams, hams) From montanaro@users.sourceforge.net Sun Oct 27 21:59:57 2002 From: montanaro@users.sourceforge.net (Skip Montanaro) Date: Sun, 27 Oct 2002 13:59:57 -0800 Subject: [Spambayes-checkins] spambayes setup.py,1.8,1.9 Message-ID: Update of /cvsroot/spambayes/spambayes In directory usw-pr-cvs1:/tmp/cvs-serv18657 Modified Files: setup.py Log Message: + hammiecli.py to scripts + chi2, Histogram to modules Index: setup.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/setup.py,v retrieving revision 1.8 retrieving revision 1.9 diff -C2 -d -r1.8 -r1.9 *** setup.py 27 Oct 2002 16:55:01 -0000 1.8 --- setup.py 27 Oct 2002 21:59:55 -0000 1.9 *************** *** 5,8 **** --- 5,9 ---- scripts=['unheader.py', 'hammie.py', + 'hammiecli.py', 'hammiesrv.py', 'loosecksum.py', *************** *** 24,27 **** --- 25,30 ---- 'hammie', 'msgs', + 'chi2', + 'Histogram', 'Options', 'Tester', From montanaro@users.sourceforge.net Sun Oct 27 22:04:34 2002 From: montanaro@users.sourceforge.net (Skip Montanaro) Date: Sun, 27 Oct 2002 14:04:34 -0800 Subject: [Spambayes-checkins] spambayes README.txt,1.39,1.40 Message-ID: Update of /cvsroot/spambayes/spambayes In directory usw-pr-cvs1:/tmp/cvs-serv21052 Modified Files: README.txt Log Message: fix a couple typos and indicate that rebal helps rebalance either spam or ham directory trees. Index: README.txt =================================================================== RCS file: /cvsroot/spambayes/spambayes/README.txt,v retrieving revision 1.39 retrieving revision 1.40 diff -C2 -d -r1.39 -r1.40 *** README.txt 27 Oct 2002 16:55:00 -0000 1.39 --- README.txt 27 Oct 2002 22:04:32 -0000 1.40 *************** *** 229,232 **** --- 229,233 ---- Set9/ "" Set10/ "" + reservoir/ (contains "backup spam") Ham/ Set1/ (contains 2000 ham .txt files) *************** *** 243,247 **** Every file at the deepest level is used (not just files with .txt ! 
extenstions). The files may bot don't need to have a "Unix From" header before the RFC-822 message (i.e. a line of the form "From
<address> <date>"). --- 244,248 ---- Every file at the deepest level is used (not just files with .txt ! extensions). The files don't need to have a "Unix From" header before the RFC-822 message (i.e. a line of the form "From
<address> <date>"). *************** *** 249,256 **** If you use the same names and structure, huge mounds of the tedious testing code will work as-is. The more Set directories the merrier, although you ! want at least a few hundred messages in each one. The "reservoir" directory ! contains a few thousand other random hams. When a ham is found that's ! really spam, move into a spam directory, and then the rebal.py utility ! moves in a random message from the reservoir to replace it. The hams are 20,000 msgs selected at random from a python-list archive. --- 250,259 ---- If you use the same names and structure, huge mounds of the tedious testing code will work as-is. The more Set directories the merrier, although you ! want at least a few hundred messages in each one. The "reservoir" ! directories contain a few thousand other random hams and spams. When a ham ! is found that's really spam, move it into a spam directory, then use the ! rebal.py utility to rebalance the Set directories moving random message(s) ! into and/or out of the reservoir directories. The reverse works as well ! (finding ham in your spam directories). The hams are 20,000 msgs selected at random from a python-list archive. From hooft@users.sourceforge.net Sun Oct 27 22:07:45 2002 From: hooft@users.sourceforge.net (Rob W.W.
Hooft) Date: Sun, 27 Oct 2002 14:07:45 -0800 Subject: [Spambayes-checkins] spambayes hammie.py,1.31,1.32 Options.py,1.59,1.60 Message-ID: Update of /cvsroot/spambayes/spambayes In directory usw-pr-cvs1:/tmp/cvs-serv21039 Modified Files: hammie.py Options.py Log Message: * Make hammie.py use Options.py for its configurables * Change quoting and apostrophes in default text to make python-mode.el parse Options.py OK * Add option to hammie.py that reduces the number of clues shown in the X-Hammie-Disposition header * Add -D command line option to hammie.py to allow "unsetting" the database option, necessary now that database mode can be set to be the default Index: hammie.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/hammie.py,v retrieving revision 1.31 retrieving revision 1.32 diff -C2 -d -r1.31 -r1.32 *** hammie.py 27 Oct 2002 21:40:25 -0000 1.31 --- hammie.py 27 Oct 2002 22:07:42 -0000 1.32 *************** *** 23,31 **** -p FILE use file as the persistent store. loads data from this file if it ! exists, and saves data to this file at the end. Default: %(DEFAULTDB)s -d use the DBM store instead of cPickle. The file is larger and creating it is slower, but checking against it is much faster, ! especially for large word databases. -f run as a filter: read a single message from stdin, add an --- 23,34 ---- -p FILE use file as the persistent store. loads data from this file if it ! exists, and saves data to this file at the end. ! Default: %(DEFAULTDB)s -d use the DBM store instead of cPickle. The file is larger and creating it is slower, but checking against it is much faster, ! especially for large word databases. Default: %(USEDB)s ! -D ! the reverse of -d: use the cPickle instead of DBM -f run as a filter: read a single message from stdin, add an *************** *** 53,60 **** # Name of the header to add in filter mode ! DISPHEADER = "X-Hammie-Disposition" # Default database name ! 
DEFAULTDB = "hammie.db" # Probability at which a message is considered spam --- 56,63 ---- # Name of the header to add in filter mode ! DISPHEADER = options.hammie_header_name # Default database name ! DEFAULTDB = options.persistant_storage_file # Probability at which a message is considered spam *************** *** 62,65 **** --- 65,74 ---- HAM_THRESHOLD = options.ham_cutoff + # Probability limit for a clue to be added to the DISPHEADER + SHOWCLUE = options.clue_mailheader_cutoff + + # Use a database? If False, use a pickle + USEDB = options.persistant_use_database + # Tim's tokenizer kicks far more booty than anything I would have # written. Score one for analysis ;) *************** *** 209,213 **** """Format the clues into something readable.""" ! return sep.join(["%r: %.2f" % (word, prob) for word, prob in clues]) def score(self, msg, evidence=False): --- 218,225 ---- """Format the clues into something readable.""" ! return sep.join(["%r: %.2f" % (word, prob) ! for word, prob in clues ! if (word[0] == '*' or ! prob <= SHOWCLUE or prob >= 1.0 - SHOWCLUE)]) def score(self, msg, evidence=False): *************** *** 378,382 **** """Main program; parse options and go.""" try: ! opts, args = getopt.getopt(sys.argv[1:], 'hdfg:s:p:u:r') except getopt.error, msg: usage(2, msg) --- 390,394 ---- """Main program; parse options and go.""" try: ! opts, args = getopt.getopt(sys.argv[1:], 'hdDfg:s:p:u:r') except getopt.error, msg: usage(2, msg) *************** *** 390,394 **** unknown = [] reverse = 0 ! do_filter = usedb = False for opt, arg in opts: if opt == '-h': --- 402,407 ---- unknown = [] reverse = 0 ! do_filter = False ! 
usedb = USEDB for opt, arg in opts: if opt == '-h': *************** *** 402,405 **** --- 415,420 ---- elif opt == "-d": usedb = True + elif opt == "-D": + usedb = False elif opt == "-f": do_filter = True Index: Options.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Options.py,v retrieving revision 1.59 retrieving revision 1.60 diff -C2 -d -r1.59 -r1.60 *** Options.py 27 Oct 2002 05:26:01 -0000 1.59 --- Options.py 27 Oct 2002 22:07:42 -0000 1.60 *************** *** 49,53 **** # header line, in a case-sensitive way. # ! # Depending on data collection, some headers aren't safe to count. # For example, if ham is collected from a mailing list but spam from your # regular inbox traffic, the presence of a header like List-Info will be a --- 49,53 ---- # header line, in a case-sensitive way. # ! # Depending on data collection, some headers are not safe to count. # For example, if ham is collected from a mailing list but spam from your # regular inbox traffic, the presence of a header like List-Info will be a *************** *** 151,155 **** # The idea is that if something scores < hamc, it's called ham; if # something scores >= spamc, it's called spam; and everything else is ! # called "I'm not sure" -- the middle ground. # # Note that cvcost.py does a similar analysis. --- 151,155 ---- # The idea is that if something scores < hamc, it's called ham; if # something scores >= spamc, it's called spam; and everything else is ! # called 'I am not sure' -- the middle ground. # # Note that cvcost.py does a similar analysis. *************** *** 170,174 **** # Display spam when # show_spam_lo <= spamprob <= show_spam_hi ! # and likewise for ham. The defaults here don't show anything. show_spam_lo: 1.0 show_spam_hi: 0.0 --- 170,174 ---- # Display spam when # show_spam_lo <= spamprob <= show_spam_hi ! # and likewise for ham. The defaults here do not show anything. 
show_spam_lo: 1.0 show_spam_hi: 0.0 *************** *** 180,185 **** show_unsure: False ! # Near the end of Driver.test(), you can get a listing of the 'best ! # discriminators' in the words from the training sets. These are the # words whose WordInfo.killcount values are highest, meaning they most # often were among the most extreme clues spamprob() found. The number --- 180,185 ---- show_unsure: False ! # Near the end of Driver.test(), you can get a listing of the best ! # discriminators in the words from the training sets. These are the # words whose WordInfo.killcount values are highest, meaning they most # often were among the most extreme clues spamprob() found. The number *************** *** 197,201 **** # appended to pickle_basename. By default (if save_trained_pickles is # true), the filenames are class1.pik, class2.pik, ... If a file of that ! # name already exists, it's overwritten. pickle_basename is ignored when # save_trained_pickles is false. --- 197,201 ---- # appended to pickle_basename. By default (if save_trained_pickles is # true), the filenames are class1.pik, class2.pik, ... If a file of that ! # name already exists, it is overwritten. pickle_basename is ignored when # save_trained_pickles is false. *************** *** 219,225 **** # on. By default, it does this in a clever way, learning *and* unlearning # sets as it goes along, so that it never needs to train on N-1 sets in one ! # gulp after the first time. Setting this option true forces "one gulp ! # from-scratch" training every time. There used to be a set of combining ! # schemes that needed this, but now it's just in case you're paranoid . build_each_classifier_from_scratch: False --- 219,225 ---- # on. By default, it does this in a clever way, learning *and* unlearning # sets as it goes along, so that it never needs to train on N-1 sets in one ! # gulp after the first time. Setting this option true forces ''one gulp ! # from-scratch'' training every time. 
There used to be a set of combining ! # schemes that needed this, but now it is just in case you are paranoid . build_each_classifier_from_scratch: False *************** *** 231,235 **** # These two control the prior assumption about word probabilities. ! # "x" is essentially the probability given to a word that's never been # seen before. Nobody has reported an improvement via moving it away # from 1/2. --- 231,235 ---- # These two control the prior assumption about word probabilities. ! # "x" is essentially the probability given to a word that has never been # seen before. Nobody has reported an improvement via moving it away # from 1/2. *************** *** 237,243 **** # the probabilities estimated by counting. At s=0, the counting estimates # are believed 100%, even to the extent of assigning certainty (0 or 1) ! # to a word that's appeared in only ham or only spam. This is a disaster. # As s tends toward infintity, all probabilities tend toward x. All ! # reports were that a value near 0.4 worked best, so this doesn't seem to # be corpus-dependent. # NOTE: Gary Robinson previously used a different formula involving 'a' --- 237,243 ---- # the probabilities estimated by counting. At s=0, the counting estimates # are believed 100%, even to the extent of assigning certainty (0 or 1) ! # to a word that has appeared in only ham or only spam. This is a disaster. # As s tends toward infintity, all probabilities tend toward x. All ! # reports were that a value near 0.4 worked best, so this does not seem to # be corpus-dependent. # NOTE: Gary Robinson previously used a different formula involving 'a' *************** *** 250,258 **** # abs(word.spamprob - 0.5) < robinson_minimum_prob_strength. # This may be a hack, but it has proved to reduce error rates in many ! # tests over Robinson's base scheme. 0.1 appeared to work well across # all corpora. robinson_minimum_prob_strength: 0.1 ! # The combining scheme currently detailed on Gary Robinon's web page. 
# The middle ground here is touchy, varying across corpus, and within # a corpus across amounts of training data. It almost never gives extreme --- 250,258 ---- # abs(word.spamprob - 0.5) < robinson_minimum_prob_strength. # This may be a hack, but it has proved to reduce error rates in many ! # tests over Robinsons base scheme. 0.1 appeared to work well across # all corpora. robinson_minimum_prob_strength: 0.1 ! # The combining scheme currently detailed on Gary Robinons web page. # The middle ground here is touchy, varying across corpus, and within # a corpus across amounts of training data. It almost never gives extreme *************** *** 262,274 **** # For vectors of random, uniformly distributed probabilities, -2*sum(ln(p_i)) ! # follows the chi-squared distribution with 2*n degrees of freedom. That's ! # the "provably most-sensitive" test Gary's original scheme was monotonic # with. Getting closer to the theoretical basis appears to give an excellent # combining method, usually very extreme in its judgment, yet finding a tiny # (in # of msgs, spread across a huge range of scores) middle ground where ! # lots of the mistakes live. This is the best method so far on Tim's data. ! # One systematic benefit is that it's immune to "cancellation disease". One ! # systematic drawback is that it's sensitive to *any* deviation from a ! # uniform distribution, regardless of whether that's actually evidence of # ham or spam. Rob Hooft alleviated that by combining the final S and H # measures via (S-H+1)/2 instead of via S/(S+H)). --- 262,274 ---- # For vectors of random, uniformly distributed probabilities, -2*sum(ln(p_i)) ! # follows the chi-squared distribution with 2*n degrees of freedom. That is ! # the "provably most-sensitive" test Garys original scheme was monotonic # with. 
Getting closer to the theoretical basis appears to give an excellent # combining method, usually very extreme in its judgment, yet finding a tiny # (in # of msgs, spread across a huge range of scores) middle ground where ! # lots of the mistakes live. This is the best method so far on Tims data. ! # One systematic benefit is that it is immune to "cancellation disease". One ! # systematic drawback is that it is sensitive to *any* deviation from a ! # uniform distribution, regardless of whether that is actually evidence of # ham or spam. Rob Hooft alleviated that by combining the final S and H # measures via (S-H+1)/2 instead of via S/(S+H)). *************** *** 279,282 **** --- 279,302 ---- # (original c.l.p data, his own email, and newer general python.org traffic). use_chi_squared_combining: True + + [Hammie] + # The name of the header that hammie adds to an E-mail in filter mode + hammie_header_name: X-Hammie-Disposition + + # The default database path used by hammie + persistant_storage_file: hammie.db + + # The range of clues that are added to the "hammie" header in the E-mail + # All clues that have their probability smaller than this number, or larger + # than one minus this number are added to the header such that you can see + # why spambayes thinks this is ham/spam or why it is unsure. The default is + # to show all clues, but you can reduce that by setting showclue to a lower + # value, such as 0.1 (which Rob is using) + clue_mailheader_cutoff: 0.5 + + # hammie can use either a database (quick to score one message) or a pickle + # (quick to train on huge amounts of messages). Set this to True to use a + # database by default. 
+ persistant_use_database: False """ *************** *** 334,337 **** --- 354,363 ---- 'use_chi_squared_combining': boolean_cracker, }, + 'Hammie': {'hammie_header_name': string_cracker, + 'persistant_storage_file': string_cracker, + 'clue_mailheader_cutoff': float_cracker, + 'persistant_use_database': boolean_cracker, + }, + } From montanaro@users.sourceforge.net Sun Oct 27 22:10:30 2002 From: montanaro@users.sourceforge.net (Skip Montanaro) Date: Sun, 27 Oct 2002 14:10:30 -0800 Subject: [Spambayes-checkins] spambayes unheader.py,1.7,1.8 Message-ID: Update of /cvsroot/spambayes/spambayes In directory usw-pr-cvs1:/tmp/cvs-serv24030 Modified Files: unheader.py Log Message: keep replace_header in better sync with what Barry's got in the 2.3 source Index: unheader.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/unheader.py,v retrieving revision 1.7 retrieving revision 1.8 diff -C2 -d -r1.7 -r1.8 *** unheader.py 25 Sep 2002 17:56:09 -0000 1.7 --- unheader.py 27 Oct 2002 22:10:28 -0000 1.8 *************** *** 17,28 **** del msg[hdr] class Message(email.Message.Message): ! def replace_header(self, hdr, newval): ! """replace first value for hdr with newval""" ! hdr = hdr.lower() ! for i in range(len(self._headers)): ! k, v = self._headers[i] ! if k.lower() == hdr: ! self._headers[i] = (k, newval) class Parser(email.Parser.HeaderParser): --- 17,36 ---- del msg[hdr] + # remain compatible with 2.2.1 - steal replace_header from 2.3 source class Message(email.Message.Message): ! def replace_header(self, _name, _value): ! """Replace a header. ! ! Replace the first matching header found in the message, retaining ! header order and case. If no matching header was found, a ! KeyError is raised. ! """ ! _name = _name.lower() ! for i, (k, v) in zip(range(len(self._headers)), self._headers): ! if k.lower() == _name: ! self._headers[i] = (k, _value) ! break ! else: ! 
raise KeyError, _name class Parser(email.Parser.HeaderParser): From tim_one@users.sourceforge.net Sun Oct 27 22:34:11 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Sun, 27 Oct 2002 14:34:11 -0800 Subject: [Spambayes-checkins] spambayes tokenizer.py,1.51,1.52 Message-ID: Update of /cvsroot/spambayes/spambayes In directory usw-pr-cvs1:/tmp/cvs-serv1522 Modified Files: tokenizer.py Log Message: The extract_dow option generated 'dow:invalid' for every msg with a Date header, due to an else clause on a loop that had no early exit. Supplied the intended early exit. Index: tokenizer.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/tokenizer.py,v retrieving revision 1.51 retrieving revision 1.52 diff -C2 -d -r1.51 -r1.52 *** tokenizer.py 27 Oct 2002 05:26:01 -0000 1.51 --- tokenizer.py 27 Oct 2002 22:34:08 -0000 1.52 *************** *** 1014,1017 **** --- 1014,1018 ---- else: yield 'dow:%d' % timetuple[6] + break else: # if nothing matches, declare the Date: header invalid From tim_one@users.sourceforge.net Sun Oct 27 22:56:18 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Sun, 27 Oct 2002 14:56:18 -0800 Subject: [Spambayes-checkins] spambayes Options.py,1.60,1.61 hammie.py,1.32,1.33 Message-ID: Update of /cvsroot/spambayes/spambayes In directory usw-pr-cvs1:/tmp/cvs-serv18295 Modified Files: Options.py hammie.py Log Message: s/persistant/persistent/g in new code. Index: Options.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Options.py,v retrieving revision 1.60 retrieving revision 1.61 diff -C2 -d -r1.60 -r1.61 *** Options.py 27 Oct 2002 22:07:42 -0000 1.60 --- Options.py 27 Oct 2002 22:56:15 -0000 1.61 *************** *** 285,289 **** # The default database path used by hammie ! 
persistant_storage_file: hammie.db # The range of clues that are added to the "hammie" header in the E-mail --- 285,289 ---- # The default database path used by hammie ! persistent_storage_file: hammie.db # The range of clues that are added to the "hammie" header in the E-mail *************** *** 298,302 **** # (quick to train on huge amounts of messages). Set this to True to use a # database by default. ! persistant_use_database: False """ --- 298,302 ---- # (quick to train on huge amounts of messages). Set this to True to use a # database by default. ! persistent_use_database: False """ *************** *** 355,361 **** }, 'Hammie': {'hammie_header_name': string_cracker, ! 'persistant_storage_file': string_cracker, 'clue_mailheader_cutoff': float_cracker, ! 'persistant_use_database': boolean_cracker, }, --- 355,361 ---- }, 'Hammie': {'hammie_header_name': string_cracker, ! 'persistent_storage_file': string_cracker, 'clue_mailheader_cutoff': float_cracker, ! 'persistent_use_database': boolean_cracker, }, Index: hammie.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/hammie.py,v retrieving revision 1.32 retrieving revision 1.33 diff -C2 -d -r1.32 -r1.33 *** hammie.py 27 Oct 2002 22:07:42 -0000 1.32 --- hammie.py 27 Oct 2002 22:56:15 -0000 1.33 *************** *** 59,63 **** # Default database name ! DEFAULTDB = options.persistant_storage_file # Probability at which a message is considered spam --- 59,63 ---- # Default database name ! DEFAULTDB = options.persistent_storage_file # Probability at which a message is considered spam *************** *** 69,73 **** # Use a database? If False, use a pickle ! USEDB = options.persistant_use_database # Tim's tokenizer kicks far more booty than anything I would have --- 69,73 ---- # Use a database? If False, use a pickle ! 
USEDB = options.persistent_use_database # Tim's tokenizer kicks far more booty than anything I would have From tim_one@users.sourceforge.net Mon Oct 28 05:15:53 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Sun, 27 Oct 2002 21:15:53 -0800 Subject: [Spambayes-checkins] spambayes/Outlook2000 msgstore.py,1.11,1.12 Message-ID: Update of /cvsroot/spambayes/spambayes/Outlook2000 In directory usw-pr-cvs1:/tmp/cvs-serv15254/Outlook2000 Modified Files: msgstore.py Log Message: Split lots of long lines, and added (probably too many!) comments to _FindDefaultMessageStore(). I'm trying to figure out why something doesn't work: the folder trees displayed in the training and classify dialogs allow selecting folders from any number of .pst files, but selecting one from anything other than "my main" .pst file doesn't work (a thread dies with an exception upon hitting the "Train now" or "Classify now" button, and before that "" shows up in the display). I don't really know what "my main .pst file" means at the code level, but *suspect* it's what _FindDefaultMessageStore() is figuring out, and that the MAPIMsgStore class is *always* trying to use that .pst file as a result, and that the folder IDs found for folders in other .pst files by the dialogs simply don't make any sense when passed to a MAPIMsgStore instance later. I'm keen to get this to work for two reasons: 1. So I can move 40MB of old spam out of my main .pst file and into a new Training.pst file (I won't have to burn hours transferring that one between my desktop and laptop). 2. I have a dozen other .pst files with archives of older email, and I'd like to access that for training too. I suspect this is more difficult than I imagine . 
Index: msgstore.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/msgstore.py,v retrieving revision 1.11 retrieving revision 1.12 diff -C2 -d -r1.11 -r1.12 *** msgstore.py 26 Oct 2002 05:30:39 -0000 1.11 --- msgstore.py 28 Oct 2002 05:15:50 -0000 1.12 *************** *** 92,96 **** cwd = os.getcwd() mapi.MAPIInitialize(None) ! logonFlags = mapi.MAPI_NO_MAIL | mapi.MAPI_EXTENDED | mapi.MAPI_USE_DEFAULT self.session = mapi.MAPILogonEx(0, None, None, logonFlags) self._FindDefaultMessageStore() --- 92,98 ---- cwd = os.getcwd() mapi.MAPIInitialize(None) ! logonFlags = (mapi.MAPI_NO_MAIL | ! mapi.MAPI_EXTENDED | ! mapi.MAPI_USE_DEFAULT) self.session = mapi.MAPILogonEx(0, None, None, logonFlags) self._FindDefaultMessageStore() *************** *** 99,103 **** def Close(self): self.mapi_msgstore = None ! self.session.Logoff(0,0,0) self.session = None mapi.MAPIUninitialize() --- 101,105 ---- def Close(self): self.mapi_msgstore = None ! self.session.Logoff(0, 0, 0) self.session = None mapi.MAPIUninitialize() *************** *** 105,125 **** def _FindDefaultMessageStore(self): tab = self.session.GetMsgStoresTable(0) ! # restriction for the table. ! restriction = mapi.RES_PROPERTY, (mapi.RELOP_EQ, PR_DEFAULT_STORE, (PR_DEFAULT_STORE, True)) ! rows = mapi.HrQueryAllRows(tab, (PR_ENTRYID,), restriction, None, 0) ! # get first entry row = rows[0] eid_tag, eid = row[0] # Open the store. ! self.mapi_msgstore = self.session.OpenMsgStore(0, eid, None, mapi.MDB_WRITE | mapi.MDB_NO_MAIL | USE_DEFERRED_ERRORS ) def _GetSubFolderIter(self, folder): table = folder.GetHierarchyTable(0) ! rows = mapi.HrQueryAllRows(table, (PR_ENTRYID,PR_DISPLAY_NAME_A), None, None, 0) for (eid_tag, eid),(name_tag, name) in rows: ! sub = self.mapi_msgstore.OpenEntry(eid, None, mapi.MAPI_MODIFY | USE_DEFERRED_ERRORS) table = sub.GetContentsTable(0) yield MAPIMsgStoreFolder(self, eid, name, table.GetRowCount(0)) ! 
folder = self.mapi_msgstore.OpenEntry(eid, None, mapi.MAPI_MODIFY | USE_DEFERRED_ERRORS) for store_folder in self._GetSubFolderIter(folder): yield store_folder --- 107,153 ---- def _FindDefaultMessageStore(self): tab = self.session.GetMsgStoresTable(0) ! # Restriction for the table: get rows where PR_DEFAULT_STORE is true. ! # There should be only one. ! restriction = (mapi.RES_PROPERTY, # a property restriction ! (mapi.RELOP_EQ, # check for equality ! PR_DEFAULT_STORE, # of the PR_DEFAULT_STORE prop ! (PR_DEFAULT_STORE, True))) # with True ! rows = mapi.HrQueryAllRows(tab, ! (PR_ENTRYID,), # columns to retrieve ! restriction, # only these rows ! None, # any sort order is fine ! 0) # any # of results is fine ! # get first entry, a (property_tag, value) pair, for PR_ENTRYID row = rows[0] eid_tag, eid = row[0] # Open the store. ! self.mapi_msgstore = self.session.OpenMsgStore( ! 0, # no parent window ! eid, # msg store to open ! None, # IID; accept default IMsgStore ! # need write access to add score fields ! mapi.MDB_WRITE | ! # we won't send or receive email ! mapi.MDB_NO_MAIL | ! USE_DEFERRED_ERRORS) def _GetSubFolderIter(self, folder): table = folder.GetHierarchyTable(0) ! rows = mapi.HrQueryAllRows(table, ! (PR_ENTRYID, PR_DISPLAY_NAME_A), ! None, ! None, ! 0) for (eid_tag, eid),(name_tag, name) in rows: ! sub = self.mapi_msgstore.OpenEntry(eid, ! None, ! mapi.MAPI_MODIFY | ! USE_DEFERRED_ERRORS) table = sub.GetContentsTable(0) yield MAPIMsgStoreFolder(self, eid, name, table.GetRowCount(0)) ! folder = self.mapi_msgstore.OpenEntry(eid, ! None, ! mapi.MAPI_MODIFY | ! USE_DEFERRED_ERRORS) for store_folder in self._GetSubFolderIter(folder): yield store_folder *************** *** 128,135 **** for folder_id in folder_ids: folder_id = mapi.BinFromHex(folder_id) ! folder = self.mapi_msgstore.OpenEntry(folder_id, None, mapi.MAPI_MODIFY | USE_DEFERRED_ERRORS) table = folder.GetContentsTable(0) rc, props = folder.GetProps( (PR_DISPLAY_NAME_A,), 0) ! 
yield MAPIMsgStoreFolder(self, folder_id, props[0][1], table.GetRowCount(0)) if include_sub: for f in self._GetSubFolderIter(folder): --- 156,167 ---- for folder_id in folder_ids: folder_id = mapi.BinFromHex(folder_id) ! folder = self.mapi_msgstore.OpenEntry(folder_id, ! None, ! mapi.MAPI_MODIFY | ! USE_DEFERRED_ERRORS) table = folder.GetContentsTable(0) rc, props = folder.GetProps( (PR_DISPLAY_NAME_A,), 0) ! yield MAPIMsgStoreFolder(self, folder_id, props[0][1], ! table.GetRowCount(0)) if include_sub: for f in self._GetSubFolderIter(folder): *************** *** 139,146 **** # Return a single folder given the ID. folder_id = mapi.BinFromHex(folder_id) ! folder = self.mapi_msgstore.OpenEntry(folder_id, None, mapi.MAPI_MODIFY | USE_DEFERRED_ERRORS) table = folder.GetContentsTable(0) rc, props = folder.GetProps( (PR_DISPLAY_NAME_A,), 0) ! return MAPIMsgStoreFolder(self, folder_id, props[0][1], table.GetRowCount(0)) def GetMessage(self, message_id): --- 171,182 ---- # Return a single folder given the ID. folder_id = mapi.BinFromHex(folder_id) ! folder = self.mapi_msgstore.OpenEntry(folder_id, ! None, ! mapi.MAPI_MODIFY | ! USE_DEFERRED_ERRORS) table = folder.GetContentsTable(0) rc, props = folder.GetProps( (PR_DISPLAY_NAME_A,), 0) ! return MAPIMsgStoreFolder(self, folder_id, props[0][1], ! table.GetRowCount(0)) def GetMessage(self, message_id): *************** *** 148,156 **** message_id = mapi.BinFromHex(message_id) prop_ids = PR_PARENT_ENTRYID, PR_CONTENT_UNREAD ! mapi_object = self.mapi_msgstore.OpenEntry(message_id, None, mapi.MAPI_MODIFY | USE_DEFERRED_ERRORS) hr, data = mapi_object.GetProps(prop_ids,0) folder_eid = data[0][1] unread = data[1][1] ! folder = MAPIMsgStoreFolder(self, folder_eid, "Unknown - temp message", -1) return MAPIMsgStoreMsg(self, folder, message_id, unread) --- 184,196 ---- message_id = mapi.BinFromHex(message_id) prop_ids = PR_PARENT_ENTRYID, PR_CONTENT_UNREAD ! mapi_object = self.mapi_msgstore.OpenEntry(message_id, ! None, ! 
mapi.MAPI_MODIFY | ! USE_DEFERRED_ERRORS) hr, data = mapi_object.GetProps(prop_ids,0) folder_eid = data[0][1] unread = data[1][1] ! folder = MAPIMsgStoreFolder(self, folder_eid, ! "Unknown - temp message", -1) return MAPIMsgStoreMsg(self, folder, message_id, unread) *************** *** 180,184 **** def __repr__(self): ! return "<%s '%s' (%d items), id=%s>" % (self.__class__.__name__, self.name, self.count, mapi.HexFromBin(self.id)) def GetOutlookEntryID(self): --- 220,227 ---- def __repr__(self): ! return "<%s '%s' (%d items), id=%s>" % (self.__class__.__name__, ! self.name, ! self.count, ! mapi.HexFromBin(self.id)) def GetOutlookEntryID(self): *************** *** 186,200 **** def GetMessageGenerator(self): ! folder = self.msgstore.mapi_msgstore.OpenEntry(self.id, None, mapi.MAPI_MODIFY | USE_DEFERRED_ERRORS) table = folder.GetContentsTable(0) prop_ids = PR_ENTRYID, PR_CONTENT_UNREAD table.SetColumns(prop_ids, 0) while 1: ! # Getting 70 at a time was the random number that gave best perf for me ;) rows = table.QueryRows(70, 0) ! if len(rows)==0: break for row in rows: ! yield MAPIMsgStoreMsg(self.msgstore, self, row[0][1], row[1][1]) --- 229,248 ---- def GetMessageGenerator(self): ! folder = self.msgstore.mapi_msgstore.OpenEntry(self.id, ! None, ! mapi.MAPI_MODIFY | ! USE_DEFERRED_ERRORS) table = folder.GetContentsTable(0) prop_ids = PR_ENTRYID, PR_CONTENT_UNREAD table.SetColumns(prop_ids, 0) while 1: ! # Getting 70 at a time was the random number that gave best ! # perf for me ;) rows = table.QueryRows(70, 0) ! if len(rows) == 0: break for row in rows: ! yield MAPIMsgStoreMsg(self.msgstore, self, ! row[0][1], row[1][1]) *************** *** 213,217 **** else: urs = "unread" ! return "<%s, (%s) id=%s>" % (self.__class__.__name__, urs, mapi.HexFromBin(self.id)) def GetOutlookEntryID(self): --- 261,267 ---- else: urs = "unread" ! return "<%s, (%s) id=%s>" % (self.__class__.__name__, ! urs, ! 
mapi.HexFromBin(self.id)) def GetOutlookEntryID(self): *************** *** 220,228 **** def _GetPropFromStream(self, prop_id): try: ! stream = self.mapi_object.OpenProperty(prop_id, pythoncom.IID_IStream, 0, 0) chunks = [] while 1: chunk = stream.Read(1024) ! if not chunk: break chunks.append(chunk) return "".join(chunks) --- 270,281 ---- def _GetPropFromStream(self, prop_id): try: ! stream = self.mapi_object.OpenProperty(prop_id, ! pythoncom.IID_IStream, ! 0, 0) chunks = [] while 1: chunk = stream.Read(1024) ! if not chunk: ! break chunks.append(chunk) return "".join(chunks) *************** *** 233,241 **** def _GetPotentiallyLargeStringProp(self, prop_id, row): got_tag, got_val = row ! if PROP_TYPE(got_tag)==PT_ERROR: ret = "" ! if got_val==mapi.MAPI_E_NOT_FOUND: pass # No body for this message. ! elif got_val==mapi.MAPI_E_NOT_ENOUGH_MEMORY: # Too big for simple properties - get via a stream ret = self._GetPropFromStream(prop_id) --- 286,294 ---- def _GetPotentiallyLargeStringProp(self, prop_id, row): got_tag, got_val = row ! if PROP_TYPE(got_tag) == PT_ERROR: ret = "" ! if got_val == mapi.MAPI_E_NOT_FOUND: pass # No body for this message. ! elif got_val == mapi.MAPI_E_NOT_ENOUGH_MEMORY: # Too big for simple properties - get via a stream ret = self._GetPropFromStream(prop_id) *************** *** 243,247 **** tag_name = mapiutil.GetPropTagName(prop_id) err_string = mapiutil.GetScodeString(got_val) ! print "Warning - failed to get property %s: %s" % (tag_name, err_string) else: ret = got_val --- 296,301 ---- tag_name = mapiutil.GetPropTagName(prop_id) err_string = mapiutil.GetScodeString(got_val) ! print "Warning - failed to get property %s: %s" % (tag_name, ! err_string) else: ret = got_val *************** *** 274,278 **** def _EnsureObject(self): if self.mapi_object is None: ! 
self.mapi_object = self.msgstore.mapi_msgstore.OpenEntry(self.id, None, mapi.MAPI_MODIFY | USE_DEFERRED_ERRORS) def GetEmailPackageObject(self, strip_mime_headers=True): --- 328,335 ---- def _EnsureObject(self): if self.mapi_object is None: ! self.mapi_object = self.msgstore.mapi_msgstore.OpenEntry( ! self.id, ! None, ! mapi.MAPI_MODIFY | USE_DEFERRED_ERRORS) def GetEmailPackageObject(self, strip_mime_headers=True): *************** *** 324,329 **** type_tag = _MapiTypeMap.get(type(val)) if type_tag is None: ! raise ValueError, "Dont know what to do with '%r' ('%s')" % (val, type(val)) ! prop = PROP_TAG( type_tag, PROP_ID(propIds[0])) if val is None: # Delete the property --- 381,387 ---- type_tag = _MapiTypeMap.get(type(val)) if type_tag is None: ! raise ValueError, "Don't know what to do with '%r' ('%s')" % ( ! val, type(val)) ! prop = PROP_TAG(type_tag, PROP_ID(propIds[0])) if val is None: # Delete the property *************** *** 340,349 **** def _DoCopyMode(self, folder, isMove): ## self.mapi_object = None # release the COM pointer ! assert not self.dirty, "asking me to move a dirty message - later saves will fail!" ! dest_folder = self.msgstore.mapi_msgstore.OpenEntry(folder.id, None, mapi.MAPI_MODIFY | USE_DEFERRED_ERRORS) ! source_folder = self.msgstore.mapi_msgstore.OpenEntry(self.folder.id, None, mapi.MAPI_MODIFY | USE_DEFERRED_ERRORS) flags = 0 if isMove: flags |= MESSAGE_MOVE ! source_folder.CopyMessages( (self.id,), None, dest_folder, 0, None, flags) self.folder = self.msgstore.GetFolder(mapi.HexFromBin(folder.id)) --- 398,419 ---- def _DoCopyMode(self, folder, isMove): ## self.mapi_object = None # release the COM pointer ! assert not self.dirty, \ ! "asking me to move a dirty message - later saves will fail!" ! dest_folder = self.msgstore.mapi_msgstore.OpenEntry( ! folder.id, ! None, ! mapi.MAPI_MODIFY | USE_DEFERRED_ERRORS) ! source_folder = self.msgstore.mapi_msgstore.OpenEntry( ! self.folder.id, ! None, ! 
mapi.MAPI_MODIFY | USE_DEFERRED_ERRORS) flags = 0 if isMove: flags |= MESSAGE_MOVE ! source_folder.CopyMessages((self.id,), ! None, ! dest_folder, ! 0, ! None, ! flags) self.folder = self.msgstore.GetFolder(mapi.HexFromBin(folder.id)) From anthonybaxter@users.sourceforge.net Mon Oct 28 07:04:15 2002 From: anthonybaxter@users.sourceforge.net (Anthony Baxter) Date: Sun, 27 Oct 2002 23:04:15 -0800 Subject: [Spambayes-checkins] spambayes Options.py,1.61,1.62 tokenizer.py,1.52,1.53 Message-ID: Update of /cvsroot/spambayes/spambayes In directory usw-pr-cvs1:/tmp/cvs-serv23052 Modified Files: Options.py tokenizer.py Log Message: Added skip_max_word_size as an option, to specify how long a word has to be before it's skipped. I find that boosting from 12 (the default) to 20 makes a significant improvement in the number of 'unsure' messages. see my post to the list for more. Index: Options.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Options.py,v retrieving revision 1.61 retrieving revision 1.62 diff -C2 -d -r1.61 -r1.62 *** Options.py 27 Oct 2002 22:56:15 -0000 1.61 --- Options.py 28 Oct 2002 07:04:12 -0000 1.62 *************** *** 91,94 **** --- 91,100 ---- generate_long_skips: True + # + # Length of words that triggers 'long skips'. Longer than this + # triggers a skip. + # + skip_max_word_size: 12 + # Generate tokens which resemble the posting time in 6-minute buckets: # int((h*60+m)/10). *************** *** 170,174 **** # Display spam when # show_spam_lo <= spamprob <= show_spam_hi ! # and likewise for ham. The defaults here do not show anything. show_spam_lo: 1.0 show_spam_hi: 0.0 --- 176,180 ---- # Display spam when # show_spam_lo <= spamprob <= show_spam_hi ! # and likewise for ham. The defaults here do not show anything. 
show_spam_lo: 1.0 show_spam_hi: 0.0 *************** *** 311,314 **** --- 317,321 ---- 'count_all_header_lines': boolean_cracker, 'generate_long_skips': boolean_cracker, + 'skip_max_word_size': int_cracker, 'extract_dow': boolean_cracker, 'generate_time_buckets': boolean_cracker, Index: tokenizer.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/tokenizer.py,v retrieving revision 1.52 retrieving revision 1.53 diff -C2 -d -r1.52 -r1.53 *** tokenizer.py 27 Oct 2002 22:34:08 -0000 1.52 --- tokenizer.py 28 Oct 2002 07:04:12 -0000 1.53 *************** *** 589,596 **** yield "fname piece:" + piece ! def tokenize_word(word, _len=len): n = _len(word) # Make sure this range matches in tokenize(). ! if 3 <= n <= 12: yield word --- 589,596 ---- yield "fname piece:" + piece ! def tokenize_word(word, _len=len, maxword=options.skip_max_word_size): n = _len(word) # Make sure this range matches in tokenize(). ! if 3 <= n <= maxword: yield word From tim.one@comcast.net Mon Oct 28 07:20:26 2002 From: tim.one@comcast.net (Tim Peters) Date: Mon, 28 Oct 2002 02:20:26 -0500 Subject: [Spambayes-checkins] spambayes Options.py,1.61,1.62tokenizer.py,1.52,1.53 In-Reply-To: Message-ID: [Anthony Baxter] > Modified Files: > Options.py tokenizer.py > Log Message: > Added skip_max_word_size as an option, to specify how long a word > has to be before it's skipped. I find that boosting from 12 (the default) > to 20 makes a significant improvement in the number of 'unsure' messages. > see my post to the list for more. This patch is confused, as it left the hardcoded 12 in tokenize_body() alone. You should also update the "How big should 'a word' be?" comment block if you've found that boosting this helped. 
From anthonybaxter@users.sourceforge.net Mon Oct 28 07:42:05 2002 From: anthonybaxter@users.sourceforge.net (Anthony Baxter) Date: Sun, 27 Oct 2002 23:42:05 -0800 Subject: [Spambayes-checkins] spambayes tokenizer.py,1.53,1.54 Message-ID: Update of /cvsroot/spambayes/spambayes In directory usw-pr-cvs1:/tmp/cvs-serv32174 Modified Files: tokenizer.py Log Message: Whoops: CVS merge mess-up. This is the other half of the patch. Thanks, Tim :) Index: tokenizer.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/tokenizer.py,v retrieving revision 1.53 retrieving revision 1.54 diff -C2 -d -r1.53 -r1.54 *** tokenizer.py 28 Oct 2002 07:04:12 -0000 1.53 --- tokenizer.py 28 Oct 2002 07:42:03 -0000 1.54 *************** *** 512,515 **** --- 512,519 ---- # worse idea: f-p and f-n rates both suffered significantly then. I didn't # try testing with lower bound 2. + # + # Anthony Baxter found that boosting the option skip_max_word_size to 20 + # from it's default of 12 produced a quite dramatic decrease in the number + # of 'unsure' messages. *************** *** 1050,1054 **** yield "header:%s:%d" % x ! def tokenize_body(self, msg): """Generate a stream of tokens from an email Message. --- 1054,1058 ---- yield "header:%s:%d" % x ! def tokenize_body(self, msg, maxword=options.skip_max_word_size): """Generate a stream of tokens from an email Message. *************** *** 1118,1122 **** n = len(w) # Make sure this range matches in tokenize_word(). ! if 3 <= n <= 12: yield w --- 1122,1126 ---- n = len(w) # Make sure this range matches in tokenize_word(). ! 
if 3 <= n <= maxword: yield w From tim_one@users.sourceforge.net Mon Oct 28 17:07:36 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Mon, 28 Oct 2002 09:07:36 -0800 Subject: [Spambayes-checkins] spambayes tokenizer.py,1.54,1.55 Message-ID: Update of /cvsroot/spambayes/spambayes In directory usw-pr-cvs1:/tmp/cvs-serv8772 Modified Files: tokenizer.py Log Message: Record my and Rob's skip_max_word_size results. We need more info from Anthony -- this doesn't look like a winner. Index: tokenizer.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/tokenizer.py,v retrieving revision 1.54 retrieving revision 1.55 diff -C2 -d -r1.54 -r1.55 *** tokenizer.py 28 Oct 2002 07:42:03 -0000 1.54 --- tokenizer.py 28 Oct 2002 17:07:28 -0000 1.55 *************** *** 512,519 **** # worse idea: f-p and f-n rates both suffered significantly then. I didn't # try testing with lower bound 2. ! # # Anthony Baxter found that boosting the option skip_max_word_size to 20 ! # from it's default of 12 produced a quite dramatic decrease in the number ! # of 'unsure' messages. --- 512,583 ---- # worse idea: f-p and f-n rates both suffered significantly then. I didn't # try testing with lower bound 2. ! # # Anthony Baxter found that boosting the option skip_max_word_size to 20 ! # from its default of 12 produced a quite dramatic decrease in the number ! # of 'unsure' messages. However, this was coupled with a large increase ! # in the FN rate, and it remains unclear whether simply shifting cutoffs ! # would have given the same tradeoff (not enough data was posted to tell). ! # ! # On Tim's c.l.py test, 10-fold CV, ham_cutoff=0.20 and spam_cutoff=0.80: ! # ! # -> tested 2000 hams & 1400 spams against 18000 hams & 12600 spams ! # [ditto] ! # ! # filename: max12 max20 ! # ham:spam: 20000:14000 ! # 20000:14000 ! # fp total: 2 2 the same ! # fp %: 0.01 0.01 ! # fn total: 0 0 the same ! # fn %: 0.00 0.00 ! 
# unsure t: 103 100 slight decrease ! # unsure %: 0.30 0.29 ! # real cost: $40.60 $40.00 slight improvement with these cutoffs ! # best cost: $27.00 $27.40 best possible got slightly worse ! # h mean: 0.28 0.27 ! # h sdev: 2.99 2.92 ! # s mean: 99.94 99.93 ! # s sdev: 1.41 1.47 ! # mean diff: 99.66 99.66 ! # k: 22.65 22.70 ! # ! # "Best possible" in max20 would have been to boost ham_cutoff to 0.50(!), ! # and drop spam_cutoff a little to 0.78. This would have traded away most ! # of the unsures in return for letting 3 spam through: ! # ! # -> smallest ham & spam cutoffs 0.5 & 0.78 ! # -> fp 2; fn 3; unsure ham 11; unsure spam 11 ! # -> fp rate 0.01%; fn rate 0.0214%; unsure rate 0.0647% ! # ! # Best possible in max12 was much the same: ! # ! # -> largest ham & spam cutoffs 0.5 & 0.78 ! # -> fp 2; fn 3; unsure ham 12; unsure spam 8 ! # -> fp rate 0.01%; fn rate 0.0214%; unsure rate 0.0588% ! # ! # The classifier pickle size increased by about 1.5 MB (~8.4% bigger). ! # ! # Rob Hooft's results were worse: ! # ! # -> tested 1600 hams & 580 spams against 14400 hams & 5220 spams ! # [...] ! # -> tested 1600 hams & 580 spams against 14400 hams & 5220 spams ! # filename: skip12 skip20 ! # ham:spam: 16000:5800 ! # 16000:5800 ! # fp total: 12 13 ! # fp %: 0.07 0.08 ! # fn total: 7 7 ! # fn %: 0.12 0.12 ! # unsure t: 178 184 ! # unsure %: 0.82 0.84 ! # real cost: $162.60 $173.80 ! # best cost: $106.20 $109.60 ! # h mean: 0.51 0.52 ! # h sdev: 4.87 4.92 ! # s mean: 99.42 99.39 ! # s sdev: 5.22 5.34 ! # mean diff: 98.91 98.87 ! # k: 9.80 9.64 From tim_one@users.sourceforge.net Mon Oct 28 20:19:52 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Mon, 28 Oct 2002 12:19:52 -0800 Subject: [Spambayes-checkins] spambayes Options.py,1.62,1.63 tokenizer.py,1.55,1.56 Message-ID: Update of /cvsroot/spambayes/spambayes In directory usw-pr-cvs1:/tmp/cvs-serv14627 Modified Files: Options.py tokenizer.py Log Message: Repaired comments about what generate_time_buckets does. 
Purged reference to now-gone cvcost.py. Index: Options.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Options.py,v retrieving revision 1.62 retrieving revision 1.63 diff -C2 -d -r1.62 -r1.63 *** Options.py 28 Oct 2002 07:04:12 -0000 1.62 --- Options.py 28 Oct 2002 20:19:46 -0000 1.63 *************** *** 97,102 **** skip_max_word_size: 12 ! # Generate tokens which resemble the posting time in 6-minute buckets: ! # int((h*60+m)/10). generate_time_buckets: False --- 97,102 ---- skip_max_word_size: 12 ! # Generate tokens which resemble the posting time in 10-minute buckets: ! # 'time:' hour ':' minute//10 generate_time_buckets: False *************** *** 158,163 **** # something scores >= spamc, it's called spam; and everything else is # called 'I am not sure' -- the middle ground. - # - # Note that cvcost.py does a similar analysis. # # Note: You may wish to increase nbuckets, to give this scheme more cutoff --- 158,161 ---- Index: tokenizer.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/tokenizer.py,v retrieving revision 1.55 retrieving revision 1.56 diff -C2 -d -r1.55 -r1.56 *** tokenizer.py 28 Oct 2002 17:07:28 -0000 1.55 --- tokenizer.py 28 Oct 2002 20:19:47 -0000 1.56 *************** *** 1066,1070 **** mat = self.date_hms_re.search(header) # return the time in Date: headers arranged in ! # six-minute buckets if mat is not None: h = int(mat.group('hour')) --- 1066,1070 ---- mat = self.date_hms_re.search(header) # return the time in Date: headers arranged in ! 
# 10-minute buckets if mat is not None: h = int(mat.group('hour')) From tim_one@users.sourceforge.net Tue Oct 29 03:44:00 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Mon, 28 Oct 2002 19:44:00 -0800 Subject: [Spambayes-checkins] spambayes tokenizer.py,1.56,1.57 Message-ID: Update of /cvsroot/spambayes/spambayes In directory usw-pr-cvs1:/tmp/cvs-serv24468 Modified Files: tokenizer.py Log Message: Try to repair the case where legit base64 is followed by random plain text. Python's base64 decoder is actually extremely forgiving, so much so that it skips over any number of garbage lines looking for more base64 to decode. So when the base64 part *doesn't* end with '=' padding, it goes nuts, effectively being too forgiving. Index: tokenizer.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/tokenizer.py,v retrieving revision 1.56 retrieving revision 1.57 diff -C2 -d -r1.56 -r1.57 *** tokenizer.py 28 Oct 2002 20:19:47 -0000 1.56 --- tokenizer.py 29 Oct 2002 03:43:58 -0000 1.57 *************** *** 814,817 **** --- 814,859 ---- yield 'content-transfer-encoding:' + x.lower() + # The base64 decoder is actually very forgiving, but flubs one case: + # if no padding is required (no trailing '='), it continues to read + # following lines as if they were still part of the base64 part. We're + # actually stricter here. The *point* is that some mailers tack plain + # text on to the end of base64-encoded text sections. + + # Match a line of base64, up to & including the trailing newline. + # We allow for optional leading and trailing whitespace, and don't care + # about line length, but other than that are strict. Group 1 is non-empty + # after a match iff the last significant char on the line is '='; in that + # case, it must be the last line of the base64 section. + base64_re = re.compile(r""" + [ \t]* + [a-zA-Z0-9+/]* + (=*) + [ \t]* + \r?
+ \n + """, re.VERBOSE) + + def try_to_repair_damaged_base64(text): + import binascii + i = 0 + while True: + # text[:i] looks like base64. Does the line starting at i also? + m = base64_re.match(text, i) + if not m: + break + i = m.end() + if m.group(1): + # This line has a trailing '=' -- the base64 part is done. + break + base64text = '' + if i: + base64 = text[:i] + try: + base64text = binascii.a2b_base64(base64) + except: + # There's no point in tokenizing raw base64 gibberish. + pass + return base64text + text[i:] + def breakdown_host(host): parts = host.split('.') *************** *** 1154,1157 **** --- 1196,1201 ---- yield "control: couldn't decode" text = part.get_payload(decode=False) + if text is not None: + text = try_to_repair_damaged_base64(text) if text is None: From richiehindle@users.sourceforge.net Tue Oct 29 21:02:46 2002 From: richiehindle@users.sourceforge.net (Richie Hindle) Date: Tue, 29 Oct 2002 13:02:46 -0800 Subject: [Spambayes-checkins] spambayes pop3proxy.py,1.6,1.7 Message-ID: Update of /cvsroot/spambayes/spambayes In directory usw-pr-cvs1:/tmp/cvs-serv7810 Modified Files: pop3proxy.py Log Message: Don't introduce a bogus \r character before the X-Hammie-Disposition header. Index: pop3proxy.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/pop3proxy.py,v retrieving revision 1.6 retrieving revision 1.7 diff -C2 -d -r1.6 -r1.7 *** pop3proxy.py 30 Sep 2002 20:13:39 -0000 1.6 --- pop3proxy.py 29 Oct 2002 21:02:40 -0000 1.7 *************** *** 350,354 **** disposition = "No " headers, body = re.split(r'\n\r?\n', response, 1) ! headers = headers + "\r\n" + HEADER_FORMAT % disposition + "\r\n" return headers + body else: --- 350,354 ---- disposition = "No " headers, body = re.split(r'\n\r?\n', response, 1) ! 
headers = headers + "\n" + HEADER_FORMAT % disposition + "\r\n" return headers + body else: From tim_one@users.sourceforge.net Thu Oct 31 06:42:51 2002 From: tim_one@users.sourceforge.net (Tim Peters) Date: Wed, 30 Oct 2002 22:42:51 -0800 Subject: [Spambayes-checkins] spambayes tokenizer.py,1.57,1.58 Message-ID: Update of /cvsroot/spambayes/spambayes In directory usw-pr-cvs1:/tmp/cvs-serv30231 Modified Files: tokenizer.py Log Message: A new mini-phase of body tokenization scours HTML for common virus clues, variations of