From montanaro at users.sourceforge.net Tue Nov 25 02:28:43 2008 From: montanaro at users.sourceforge.net (montanaro at users.sourceforge.net) Date: Tue, 25 Nov 2008 01:28:43 +0000 Subject: [Spambayes-checkins] SF.net SVN: spambayes:[3179] trunk/spambayes/spambayes/CoreUI.py Message-ID: Revision: 3179 http://spambayes.svn.sourceforge.net/spambayes/?rev=3179&view=rev Author: montanaro Date: 2008-11-25 01:28:43 +0000 (Tue, 25 Nov 2008) Log Message: ----------- abs import (pylint) Modified Paths: -------------- trunk/spambayes/spambayes/CoreUI.py Modified: trunk/spambayes/spambayes/CoreUI.py =================================================================== --- trunk/spambayes/spambayes/CoreUI.py 2008-06-24 22:52:54 UTC (rev 3178) +++ trunk/spambayes/spambayes/CoreUI.py 2008-11-25 01:28:43 UTC (rev 3179) @@ -70,7 +70,7 @@ except ImportError: from spambayes.compatsets import Set -import UserInterface +from spambayes import UserInterface from spambayes.Options import options, load_options, get_pathname_option, _ ## no i18n yet... ##from spambayes import i18n This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. From montanaro at users.sourceforge.net Tue Nov 25 02:34:26 2008 From: montanaro at users.sourceforge.net (montanaro at users.sourceforge.net) Date: Tue, 25 Nov 2008 01:34:26 +0000 Subject: [Spambayes-checkins] SF.net SVN: spambayes:[3180] trunk/spambayes/spambayes/safepickle.py Message-ID: Revision: 3180 http://spambayes.svn.sourceforge.net/spambayes/?rev=3180&view=rev Author: montanaro Date: 2008-11-25 01:34:26 +0000 (Tue, 25 Nov 2008) Log Message: ----------- new module implementing locking pickle reads and writes. 
Added Paths: ----------- trunk/spambayes/spambayes/safepickle.py Added: trunk/spambayes/spambayes/safepickle.py =================================================================== --- trunk/spambayes/spambayes/safepickle.py (rev 0) +++ trunk/spambayes/spambayes/safepickle.py 2008-11-25 01:34:26 UTC (rev 3180) @@ -0,0 +1,55 @@ +"""Lock pickle files for reading and writing.""" + +import sys +import os +import cPickle as pickle + +import lockfile + +from spambayes.Options import options + +def pickle_read(filename): + """Read pickle file contents with a lock.""" + lock = lockfile.FileLock(filename) + lock.acquire(timeout=20) + try: + return pickle.load(open(filename, 'rb')) + finally: + lock.release() + +def pickle_write(filename, value, protocol=0): + '''Store value as a pickle without creating corruption''' + + lock = lockfile.FileLock(filename) + lock.acquire(timeout=20) + + try: + # Be as defensive as possible. Always keep a safe copy. + tmp = filename + '.tmp' + fp = None + try: + fp = open(tmp, 'wb') + pickle.dump(value, fp, protocol) + fp.close() + except IOError, e: + if options["globals", "verbose"]: + print >> sys.stderr, 'Failed update: ' + str(e) + if fp is not None: + os.remove(tmp) + raise + try: + # With *nix we can just rename, and (as long as permissions + # are correct) the old file will vanish. With win32, this + # won't work - the Python help says that there may not be + # a way to do an atomic replace, so we rename the old one, + # put the new one there, and then delete the old one. If + # something goes wrong, there is at least a copy of the old + # one. 
+ os.rename(tmp, filename) + except OSError: + os.rename(filename, filename + '.bak') + os.rename(tmp, filename) + os.remove(filename + '.bak') + finally: + lock.release() + Property changes on: trunk/spambayes/spambayes/safepickle.py ___________________________________________________________________ Added: svn:eol-style + native This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. From montanaro at users.sourceforge.net Tue Nov 25 02:46:17 2008 From: montanaro at users.sourceforge.net (montanaro at users.sourceforge.net) Date: Tue, 25 Nov 2008 01:46:17 +0000 Subject: [Spambayes-checkins] SF.net SVN: spambayes:[3181] trunk/spambayes/spambayes/smtpproxy.py Message-ID: Revision: 3181 http://spambayes.svn.sourceforge.net/spambayes/?rev=3181&view=rev Author: montanaro Date: 2008-11-25 01:46:17 +0000 (Tue, 25 Nov 2008) Log Message: ----------- stop using string module. Modified Paths: -------------- trunk/spambayes/spambayes/smtpproxy.py Modified: trunk/spambayes/spambayes/smtpproxy.py =================================================================== --- trunk/spambayes/spambayes/smtpproxy.py 2008-11-25 01:34:26 UTC (rev 3180) +++ trunk/spambayes/spambayes/smtpproxy.py 2008-11-25 01:46:17 UTC (rev 3181) @@ -119,7 +119,6 @@ is set, and if view all headers is true. """ -import string import re import socket import sys @@ -317,8 +316,8 @@ getting FROM: addresses. """ if '<' in address: - start = string.index(address, '<') + 1 - end = string.index(address, '>') + start = address.index('<') + 1 + end = address.index('>') return address[start:end] else: return address @@ -368,7 +367,7 @@ rv = "%s:%s" % (command, ' '.join(args)) return rv - def onUnknown(self, command, args): + def onUnknown(self, _command, _args): """Default handler.""" return self.request This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. 
From montanaro at users.sourceforge.net Tue Nov 25 02:47:48 2008 From: montanaro at users.sourceforge.net (montanaro at users.sourceforge.net) Date: Tue, 25 Nov 2008 01:47:48 +0000 Subject: [Spambayes-checkins] SF.net SVN: spambayes:[3182] trunk/spambayes/spambayes/XMLRPCPlugin.py Message-ID: Revision: 3182 http://spambayes.svn.sourceforge.net/spambayes/?rev=3182&view=rev Author: montanaro Date: 2008-11-25 01:47:48 +0000 (Tue, 25 Nov 2008) Log Message: ----------- pylint nits Modified Paths: -------------- trunk/spambayes/spambayes/XMLRPCPlugin.py Modified: trunk/spambayes/spambayes/XMLRPCPlugin.py =================================================================== --- trunk/spambayes/spambayes/XMLRPCPlugin.py 2008-11-25 01:46:17 UTC (rev 3181) +++ trunk/spambayes/spambayes/XMLRPCPlugin.py 2008-11-25 01:47:48 UTC (rev 3182) @@ -45,6 +45,7 @@ import threading import xmlrpclib +import time from email import Message, message_from_string from SimpleXMLRPCServer import SimpleXMLRPCServer @@ -52,6 +53,8 @@ from spambayes.Options import _, options from spambayes.tokenizer import tokenize import spambayes.message +from spambayes import storage +from spambayes import FileCorpus class XMLRPCUI(PluginUI): plugin_map = ( @@ -82,9 +85,9 @@ raise xmlrpclib.Fault(404, '"%s" is not supported' % method) def train(self, form_dict, extra_tokens, attachments, is_spam=True): - newdict={} + newdict = {} for (i, k) in form_dict.items(): - if type(k)==unicode: + if isinstance(k, unicode): k = k.encode("utf-8") newdict[i] = k mime_message = form_to_mime(newdict, extra_tokens, attachments) @@ -104,7 +107,6 @@ msg_text = msg_text.encode("utf-8") msg = message_from_string(msg_text, _class=spambayes.message.SBHeaderMessage) - tokens = tokenize(msg) if is_spam: desired_corpus = "spamCorpus" else: @@ -117,7 +119,7 @@ setattr(self, desired_corpus, corpus) self.msg_name_func = self.state.getNewMessageName else: - if isSpam: + if is_spam: fn = storage.get_pathname_option("Storage", "spam_cache") 
else: @@ -130,7 +132,8 @@ factory = FileCorpus.FileMessageFactory() age = options["Storage", "cache_expiry_days"]*24*60*60 corpus = FileCorpus.ExpiryFileCorpus(age, factory, fn, - '[0123456789\-]*', cacheSize=20) + '[0123456789\-]*', + cacheSize=20) setattr(self, desired_corpus, corpus) class UniqueNamer(object): count = -1 @@ -159,9 +162,9 @@ def score(self, form_dict, extra_tokens, attachments): """Score a dictionary + extra tokens.""" - newdict={} + newdict = {} for (i, k) in form_dict.items(): - if isinstance(k,unicode): + if isinstance(k, unicode): k = k.encode("utf-8") newdict[i] = k mime_message = form_to_mime(newdict, extra_tokens, attachments) This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. From montanaro at users.sourceforge.net Tue Nov 25 02:48:34 2008 From: montanaro at users.sourceforge.net (montanaro at users.sourceforge.net) Date: Tue, 25 Nov 2008 01:48:34 +0000 Subject: [Spambayes-checkins] SF.net SVN: spambayes:[3183] trunk/spambayes/spambayes/hammie.py Message-ID: Revision: 3183 http://spambayes.svn.sourceforge.net/spambayes/?rev=3183&view=rev Author: montanaro Date: 2008-11-25 01:48:34 +0000 (Tue, 25 Nov 2008) Log Message: ----------- pylint nits Modified Paths: -------------- trunk/spambayes/spambayes/hammie.py Modified: trunk/spambayes/spambayes/hammie.py =================================================================== --- trunk/spambayes/spambayes/hammie.py 2008-11-25 01:47:48 UTC (rev 3182) +++ trunk/spambayes/spambayes/hammie.py 2008-11-25 01:48:34 UTC (rev 3183) @@ -1,5 +1,6 @@ #! 
/usr/bin/env python +import math from spambayes import mboxutils from spambayes import storage @@ -122,14 +123,13 @@ basic_disp = disp disp += "; %.*f" % (options["Headers", "header_score_digits"], prob) if options["Headers", "header_score_logarithm"]: - if prob<=0.005 and prob>0.0: + if prob <= 0.005 and prob > 0.0: import math - x=-math.log10(prob) - disp += " (%d)"%x - if prob>=0.995 and prob<1.0: - import math - x=-math.log10(1.0-prob) - disp += " (%d)"%x + x = -math.log10(prob) + disp += " (%d)" % x + if prob >= 0.995 and prob < 1.0: + x = -math.log10(1.0-prob) + disp += " (%d)" % x del msg[header] msg.add_header(header, disp) @@ -151,7 +151,7 @@ def filter(self, msg, header=None, spam_cutoff=None, ham_cutoff=None, debugheader=None, debug=None, train=None): - prob, result = self.score_and_filter( + _prob, result = self.score_and_filter( msg, header, spam_cutoff, ham_cutoff, debugheader, debug, train) return result @@ -281,6 +281,6 @@ if __name__ == "__main__": # Everybody's used to running hammie.py. Why mess with success? ;) - import hammiebulk + from spambayes import hammiebulk hammiebulk.main() This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. 
From montanaro at users.sourceforge.net Tue Nov 25 02:49:56 2008 From: montanaro at users.sourceforge.net (montanaro at users.sourceforge.net) Date: Tue, 25 Nov 2008 01:49:56 +0000 Subject: [Spambayes-checkins] SF.net SVN: spambayes:[3184] trunk/spambayes/spambayes/dbmstorage.py Message-ID: Revision: 3184 http://spambayes.svn.sourceforge.net/spambayes/?rev=3184&view=rev Author: montanaro Date: 2008-11-25 01:49:56 +0000 (Tue, 25 Nov 2008) Log Message: ----------- pylint nits Modified Paths: -------------- trunk/spambayes/spambayes/dbmstorage.py Modified: trunk/spambayes/spambayes/dbmstorage.py =================================================================== --- trunk/spambayes/spambayes/dbmstorage.py 2008-11-25 01:48:34 UTC (rev 3183) +++ trunk/spambayes/spambayes/dbmstorage.py 2008-11-25 01:49:56 UTC (rev 3184) @@ -38,7 +38,7 @@ # Note that Python 2.3 and later ship with the new bsddb interface # as the default bsddb module - so 2.3 can use the old name safely. funcs = [open_db3hash, open_gdbm] - if sys.version_info >= (2,3): + if sys.version_info >= (2, 3): funcs.insert(0, open_dbhash) else: funcs = [open_db3hash, open_dbhash, open_gdbm, open_db185hash] @@ -65,8 +65,9 @@ dbm_type = whichdb.whichdb(db_name) # if we are using Windows and Python < 2.3, then we need to use # db3hash, not dbhash. - if sys.platform == "win32" and sys.version_info < (2,3) and \ - dbm_type == "dbhash": + if (sys.platform == "win32" and + sys.version_info < (2, 3) and + dbm_type == "dbhash"): dbm_type = "db3hash" else: # fresh file or overridden - open with what the user specified This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. 
From montanaro at users.sourceforge.net Tue Nov 25 02:50:45 2008 From: montanaro at users.sourceforge.net (montanaro at users.sourceforge.net) Date: Tue, 25 Nov 2008 01:50:45 +0000 Subject: [Spambayes-checkins] SF.net SVN: spambayes:[3185] trunk/spambayes/spambayes/ProxyUI.py Message-ID: Revision: 3185 http://spambayes.svn.sourceforge.net/spambayes/?rev=3185&view=rev Author: montanaro Date: 2008-11-25 01:50:44 +0000 (Tue, 25 Nov 2008) Log Message: ----------- pylint nits Modified Paths: -------------- trunk/spambayes/spambayes/ProxyUI.py Modified: trunk/spambayes/spambayes/ProxyUI.py =================================================================== --- trunk/spambayes/spambayes/ProxyUI.py 2008-11-25 01:49:56 UTC (rev 3184) +++ trunk/spambayes/spambayes/ProxyUI.py 2008-11-25 01:50:44 UTC (rev 3185) @@ -70,7 +70,7 @@ except ImportError: from spambayes.compatsets import Set -import UserInterface +from spambayes import UserInterface from spambayes.Options import options, _ state = None This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. 
From montanaro at users.sourceforge.net Tue Nov 25 02:51:55 2008 From: montanaro at users.sourceforge.net (montanaro at users.sourceforge.net) Date: Tue, 25 Nov 2008 01:51:55 +0000 Subject: [Spambayes-checkins] SF.net SVN: spambayes:[3186] trunk/spambayes/spambayes/cdb.py Message-ID: Revision: 3186 http://spambayes.svn.sourceforge.net/spambayes/?rev=3186&view=rev Author: montanaro Date: 2008-11-25 01:51:55 +0000 (Tue, 25 Nov 2008) Log Message: ----------- pylint nits Modified Paths: -------------- trunk/spambayes/spambayes/cdb.py Modified: trunk/spambayes/spambayes/cdb.py =================================================================== --- trunk/spambayes/spambayes/cdb.py 2008-11-25 01:50:44 UTC (rev 3185) +++ trunk/spambayes/spambayes/cdb.py 2008-11-25 01:51:55 UTC (rev 3186) @@ -66,10 +66,10 @@ return self.__iter__() def iterkeys(self): - return self.__iter__(lambda k,v: k) + return self.__iter__(lambda k, v: k) def itervalues(self): - return self.__iter__(lambda k,v: v) + return self.__iter__(lambda k, v: v) def items(self): ret = [] @@ -150,7 +150,7 @@ def cdb_dump(infile): """dump a database in djb's cdbdump format""" db = Cdb(infile) - for key,value in db.iteritems(): + for key, value in db.iteritems(): print "+%d,%d:%s->%s" % (len(key), len(value), key, value) print This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. From montanaro at users.sourceforge.net Tue Nov 25 02:53:23 2008 From: montanaro at users.sourceforge.net (montanaro at users.sourceforge.net) Date: Tue, 25 Nov 2008 01:53:23 +0000 Subject: [Spambayes-checkins] SF.net SVN: spambayes:[3187] trunk/spambayes/spambayes/classifier.py Message-ID: Revision: 3187 http://spambayes.svn.sourceforge.net/spambayes/?rev=3187&view=rev Author: montanaro Date: 2008-11-25 01:53:22 +0000 (Tue, 25 Nov 2008) Log Message: ----------- Use safepickle ops. pylint nits. 
Modified Paths: -------------- trunk/spambayes/spambayes/classifier.py Modified: trunk/spambayes/spambayes/classifier.py =================================================================== --- trunk/spambayes/spambayes/classifier.py 2008-11-25 01:51:55 UTC (rev 3186) +++ trunk/spambayes/spambayes/classifier.py 2008-11-25 01:53:22 UTC (rev 3187) @@ -58,7 +58,6 @@ import os import sys import socket -import pickle import urllib2 from email import message_from_string @@ -78,6 +77,7 @@ from spambayes.Options import options from spambayes.chi2 import chi2Q +from spambayes.safepickle import pickle_read, pickle_write try: True, False @@ -226,7 +226,7 @@ prob = 0.5 if evidence: - clues = [(w, p) for p, w, r in clues] + clues = [(w, p) for p, w, _r in clues] clues.sort(lambda a, b: cmp(a[1], b[1])) clues.insert(0, ('*S*', S)) clues.insert(0, ('*H*', H)) @@ -250,7 +250,7 @@ if len(clues) < options["Classifier", "max_discriminators"] and \ prob > h_cut and prob < s_cut and slurp_wordstream: slurp_tokens = list(self._generate_slurp()) - slurp_tokens.extend([w for (w,p) in clues]) + slurp_tokens.extend([w for (w, _p) in clues]) sprob, sclues = self.chi2_spamprob(slurp_tokens, True) if sprob < h_cut or sprob > s_cut: prob = sprob @@ -602,7 +602,7 @@ if not os.path.exists(dir): # Create the directory. if options["globals", "verbose"]: - print >>sys.stderr, "Creating URL cache directory" + print >> sys.stderr, "Creating URL cache directory" os.makedirs(dir) self.urlCorpus = ExpiryFileCorpus(age, FileMessageFactory(), @@ -614,18 +614,16 @@ self.bad_url_cache_name = os.path.join(dir, "bad_urls.pck") self.http_error_cache_name = os.path.join(dir, "http_error_urls.pck") if os.path.exists(self.bad_url_cache_name): - b_file = file(self.bad_url_cache_name, "r") try: - self.bad_urls = pickle.load(b_file) - except IOError, ValueError: + self.bad_urls = pickle_read(self.bad_url_cache_name) + except (IOError, ValueError): # Something went wrong loading it (bad pickle, # probably). 
Start afresh. if options["globals", "verbose"]: - print >>sys.stderr, "Bad URL pickle, using new." + print >> sys.stderr, "Bad URL pickle, using new." self.bad_urls = {"url:non_resolving": (), "url:non_html": (), "url:unknown_error": ()} - b_file.close() else: if options["globals", "verbose"]: print "URL caches don't exist: creating" @@ -633,16 +631,14 @@ "url:non_html": (), "url:unknown_error": ()} if os.path.exists(self.http_error_cache_name): - h_file = file(self.http_error_cache_name, "r") try: - self.http_error_urls = pickle.load(h_file) + self.http_error_urls = pickle_read(self.http_error_cache_name) except IOError, ValueError: # Something went wrong loading it (bad pickle, # probably). Start afresh. if options["globals", "verbose"]: - print >>sys.stderr, "Bad HHTP error pickle, using new." + print >> sys.stderr, "Bad HHTP error pickle, using new." self.http_error_urls = {} - h_file.close() else: self.http_error_urls = {} @@ -652,8 +648,7 @@ # XXX becomes valid, for example). for name, data in [(self.bad_url_cache_name, self.bad_urls), (self.http_error_cache_name, self.http_error_urls),]: - from storage import safe_pickle - safe_pickle(name, data) + pickle_write(name, data) def slurp(self, proto, url): # We generate these tokens: @@ -694,7 +689,7 @@ else: port = mo.group(3) try: - not_used = socket.getaddrinfo(domain, port) + _unused = socket.getaddrinfo(domain, port) except socket.error: self.bad_urls["url:non_resolving"] += (url,) return ["url:non_resolving"] @@ -724,7 +719,7 @@ pass try: if options["globals", "verbose"]: - print >>sys.stderr, "Slurping", url + print >> sys.stderr, "Slurping", url f = urllib2.urlopen("%s://%s" % (proto, url)) except (urllib2.URLError, socket.error), details: mo = HTTP_ERROR_RE.match(str(details)) @@ -792,7 +787,7 @@ # would become http://massey.ac.nz and http://id.example.com # would become http://example.com url += '/' - domain, garbage = url.split('/', 1) + domain = url.split('/', 1)[0] parts = domain.split('.') if 
len(parts) > 2: base_domain = parts[-2] + '.' + parts[-1] This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. From montanaro at users.sourceforge.net Tue Nov 25 02:53:59 2008 From: montanaro at users.sourceforge.net (montanaro at users.sourceforge.net) Date: Tue, 25 Nov 2008 01:53:59 +0000 Subject: [Spambayes-checkins] SF.net SVN: spambayes:[3188] trunk/spambayes/spambayes/UserInterface.py Message-ID: Revision: 3188 http://spambayes.svn.sourceforge.net/spambayes/?rev=3188&view=rev Author: montanaro Date: 2008-11-25 01:53:58 +0000 (Tue, 25 Nov 2008) Log Message: ----------- pylint nits Modified Paths: -------------- trunk/spambayes/spambayes/UserInterface.py Modified: trunk/spambayes/spambayes/UserInterface.py =================================================================== --- trunk/spambayes/spambayes/UserInterface.py 2008-11-25 01:53:22 UTC (rev 3187) +++ trunk/spambayes/spambayes/UserInterface.py 2008-11-25 01:53:58 UTC (rev 3188) @@ -81,15 +81,16 @@ import StringIO from email.Iterators import typed_subpart_iterator -import oe_mailbox +from spambayes import oe_mailbox -import PyMeldLite -import Dibbler -import tokenizer +from spambayes import PyMeldLite +from spambayes import Dibbler +from spambayes import tokenizer from spambayes import Version from spambayes import storage from spambayes import FileCorpus -from Options import options, optionsPathname, defaults, OptionsClass, _ +from spambayes.Options import options, optionsPathname, defaults, \ + OptionsClass, _ IMAGES = ('helmet', 'status', 'config', 'help', 'message', 'train', 'classify', 'query') @@ -994,7 +995,7 @@ for section, option in parm_map: if option is not None: if not options.no_restore(section, option): - options.set(section, option, d.get(section,option)) + options.set(section, option, d.get(section, option)) options.update_file(optionsPathname) @@ -1058,7 +1059,7 @@ remote_servers = options["pop3proxy", "remote_servers"] 
if remote_servers: domain_guess = remote_servers[0] - for pre in ["pop.", "pop3.", "mail.",]: + for pre in ["pop.", "pop3.", "mail."]: if domain_guess.startswith(pre): domain_guess = domain_guess[len(pre):] else: @@ -1074,7 +1075,7 @@ else: if hasattr(sys, "frozen"): temp_dir = win32api.GetTempPath() - for name in ["SpamBayesService", "SpamBayesServer",]: + for name in ["SpamBayesService", "SpamBayesServer"]: for i in xrange(3): pn = os.path.join(temp_dir, "%s%d.log" % (name, (i+1))) @@ -1212,19 +1213,16 @@ try: from textwrap import fill except ImportError: + # No textwrap module, so do the same stuff (more-or-less) + # ourselves. + def fill(text, width): + if len(text) <= width: + return text + wordsep_re = re.compile(r'(-*\w{2,}-(?=\w{2,})|' # hyphenated words + r'(?<=\S)-{2,}(?=\w))') # em-dash + chunks = wordsep_re.split(text) + chunks = filter(None, chunks) pass - else: - return "\n".join([fill(paragraph, width) \ - for paragraph in text.split('\n')]) - # No textwrap module, so do the same stuff (more-or-less) ourselves. - def fill(text, width): - if len(text) <= width: - return text - wordsep_re = re.compile(r'(-*\w{2,}-(?=\w{2,})|' # hyphenated words - r'(?<=\S)-{2,}(?=\w))') # em-dash - chunks = wordsep_re.split(text) - chunks = filter(None, chunks) - return '\n'.join(self._wrap_chunks(chunks, width)) return "\n".join([fill(paragraph, width) \ for paragraph in text.split('\n')]) This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. 
From montanaro at users.sourceforge.net Tue Nov 25 02:54:19 2008 From: montanaro at users.sourceforge.net (montanaro at users.sourceforge.net) Date: Tue, 25 Nov 2008 01:54:19 +0000 Subject: [Spambayes-checkins] SF.net SVN: spambayes:[3189] trunk/spambayes/spambayes/Tester.py Message-ID: Revision: 3189 http://spambayes.svn.sourceforge.net/spambayes/?rev=3189&view=rev Author: montanaro Date: 2008-11-25 01:54:19 +0000 (Tue, 25 Nov 2008) Log Message: ----------- pylint nits Modified Paths: -------------- trunk/spambayes/spambayes/Tester.py Modified: trunk/spambayes/spambayes/Tester.py =================================================================== --- trunk/spambayes/spambayes/Tester.py 2008-11-25 01:53:58 UTC (rev 3188) +++ trunk/spambayes/spambayes/Tester.py 2008-11-25 01:54:19 UTC (rev 3189) @@ -42,10 +42,10 @@ # The number of test instances correctly and incorrectly classified. self.nham_right = 0 self.nham_wrong = 0 - self.nham_unsure = 0; + self.nham_unsure = 0 self.nspam_right = 0 self.nspam_wrong = 0 - self.nspam_unsure = 0; + self.nspam_unsure = 0 # Lists of bad predictions. self.ham_wrong_examples = [] # False positives: ham called spam. @@ -194,9 +194,6 @@ __test__ = {'easy': _easy_test} -def _test(): - import doctest, Tester - doctest.testmod(Tester) - if __name__ == '__main__': - _test() + import doctest + doctest.testmod() This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. 
From montanaro at users.sourceforge.net Tue Nov 25 02:55:21 2008 From: montanaro at users.sourceforge.net (montanaro at users.sourceforge.net) Date: Tue, 25 Nov 2008 01:55:21 +0000 Subject: [Spambayes-checkins] SF.net SVN: spambayes:[3190] trunk/spambayes/spambayes/TestToolsUI.py Message-ID: Revision: 3190 http://spambayes.svn.sourceforge.net/spambayes/?rev=3190&view=rev Author: montanaro Date: 2008-11-25 01:55:20 +0000 (Tue, 25 Nov 2008) Log Message: ----------- pylint nits Modified Paths: -------------- trunk/spambayes/spambayes/TestToolsUI.py Modified: trunk/spambayes/spambayes/TestToolsUI.py =================================================================== --- trunk/spambayes/spambayes/TestToolsUI.py 2008-11-25 01:54:19 UTC (rev 3189) +++ trunk/spambayes/spambayes/TestToolsUI.py 2008-11-25 01:55:20 UTC (rev 3190) @@ -38,8 +38,8 @@ import random import StringIO -import ProxyUI -import oe_mailbox +from spambayes import ProxyUI +from spambayes import oe_mailbox from spambayes import msgs from spambayes import TestDriver from spambayes import OptionsClass @@ -60,9 +60,9 @@ # Dynamically add any current experimental/deprecated options. 
for opt in options.options(True): - sect, opt = opt[1:].split(']', 1) + _sect, _opt = opt[1:].split(']', 1) if opt[:2].lower() == "x-": - testtools_ini_map += ((sect, opt),) + testtools_ini_map += ((_sect, _opt),) class TestToolsUserInterface(ProxyUI.ProxyUserInterface): """Serves the HTML user interface for the test tools.""" @@ -93,8 +93,8 @@ ('TestToolsUI', 'source'), ('TestToolsUI', 'n'),) - option_choice = self._buildConfigPageBody(\ - configTable, testtools_ini_map) + option_choice = self._buildConfigPageBody(configTable, + testtools_ini_map) option_choice.action_page.action = "cvresults" option_choice.introduction = "Select the options for your test " \ "(these will be run against the " \ @@ -529,15 +529,15 @@ set_num, nsets = portion.split('/') - all = os.listdir(directory) - random.seed(hash(max(all)) ^ msgs.SEED) - random.shuffle(all) + files = os.listdir(directory) + random.seed(hash(max(files)) ^ msgs.SEED) + random.shuffle(files) - set_size = len(all) // int(nsets) + set_size = len(files) // int(nsets) set_num = int(set_num) - set = all[set_num*set_size:((set_num+1)*set_size)-1] - set.sort() - for fname in set: + fileset = files[set_num*set_size:((set_num+1)*set_size)-1] + fileset.sort() + for fname in fileset: yield msgs.Msg(directory, fname) class HamCacheStream(CacheStream): This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. From montanaro at users.sourceforge.net Tue Nov 25 02:56:15 2008 From: montanaro at users.sourceforge.net (montanaro at users.sourceforge.net) Date: Tue, 25 Nov 2008 01:56:15 +0000 Subject: [Spambayes-checkins] SF.net SVN: spambayes:[3191] trunk/spambayes/spambayes/TestDriver.py Message-ID: Revision: 3191 http://spambayes.svn.sourceforge.net/spambayes/?rev=3191&view=rev Author: montanaro Date: 2008-11-25 01:56:15 +0000 (Tue, 25 Nov 2008) Log Message: ----------- Use safepickle funcs. pylint nits. 
Modified Paths: -------------- trunk/spambayes/spambayes/TestDriver.py Modified: trunk/spambayes/spambayes/TestDriver.py =================================================================== --- trunk/spambayes/spambayes/TestDriver.py 2008-11-25 01:55:20 UTC (rev 3190) +++ trunk/spambayes/spambayes/TestDriver.py 2008-11-25 01:56:15 UTC (rev 3191) @@ -33,17 +33,11 @@ except ImportError: from spambayes.compatsets import Set -import cPickle as pickle - -try: - from heapq import heapreplace -except ImportError: - from spambayes.compatheapq import heapreplace - from spambayes.Options import options from spambayes import Tester from spambayes import classifier from spambayes.Histogram import Hist +from spambayes.safepickle import pickle_write try: True, False @@ -134,7 +128,7 @@ num_fp*1e2 / ham.n, num_fn*1e2 / spam.n, (num_unh + num_uns)*1e2 / (ham.n + spam.n)) - return float(bests[0][0])/n,float(bests[0][1])/n + return float(bests[0][0])/n, float(bests[0][1])/n def printmsg(msg, prob, clues): print msg.tag @@ -159,7 +153,7 @@ self.ntimes_finishtest_called = 0 self.new_classifier() from spambayes import CostCounter - self.cc=CostCounter.default() + self.cc = CostCounter.default() def new_classifier(self): """Create and use a new, virgin classifier.""" @@ -200,15 +194,13 @@ fname = "%s%d.pik" % (options["TestDriver", "pickle_basename"], self.ntimes_finishtest_called) print " saving pickle to", fname - fp = file(fname, 'wb') - pickle.dump(self.classifier, fp, 1) - fp.close() + pickle_write(fname, self.classifier, 1) def alldone(self): if options["TestDriver", "show_histograms"]: - besthamcut,bestspamcut = printhist("all runs:", - self.global_ham_hist, - self.global_spam_hist) + besthamcut, bestspamcut = printhist("all runs:", + self.global_ham_hist, + self.global_spam_hist) else: besthamcut = options["Categorization", "ham_cutoff"] bestspamcut = options["Categorization", "spam_cutoff"] @@ -239,10 +231,8 @@ ('spam', self.global_spam_hist)): fname = "%s_%shist.pik" % 
(options["TestDriver", "pickle_basename"], f) - print " saving %s histogram pickle to %s" %(f, fname) - fp = file(fname, 'wb') - pickle.dump(h, fp, 1) - fp.close() + print " saving %s histogram pickle to %s" % (f, fname) + pickle_write(fname, h, 1) def test(self, ham, spam): c = self.classifier This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. From montanaro at users.sourceforge.net Tue Nov 25 02:57:18 2008 From: montanaro at users.sourceforge.net (montanaro at users.sourceforge.net) Date: Tue, 25 Nov 2008 01:57:18 +0000 Subject: [Spambayes-checkins] SF.net SVN: spambayes:[3192] trunk/spambayes/spambayes/mboxutils.py Message-ID: Revision: 3192 http://spambayes.svn.sourceforge.net/spambayes/?rev=3192&view=rev Author: montanaro Date: 2008-11-25 01:57:18 +0000 (Tue, 25 Nov 2008) Log Message: ----------- better doctest running Modified Paths: -------------- trunk/spambayes/spambayes/mboxutils.py Modified: trunk/spambayes/spambayes/mboxutils.py =================================================================== --- trunk/spambayes/spambayes/mboxutils.py 2008-11-25 01:56:15 UTC (rev 3191) +++ trunk/spambayes/spambayes/mboxutils.py 2008-11-25 01:57:18 UTC (rev 3192) @@ -131,7 +131,7 @@ message_db = message.Message().message_info_db stats = Stats.Stats(options, message_db) - mboxes = [ IMAPFolder(n,session,stats) for n in names ] + mboxes = [IMAPFolder(n, session, stats) for n in names] if len(mboxes) == 1: return full_messages(mboxes[0]) @@ -268,9 +268,6 @@ text = "" return text -def _test(): - import doctest, mboxutils - return doctest.testmod(mboxutils) - if __name__ == "__main__": - _test() + import doctest + doctest.testmod() This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. 
From montanaro at users.sourceforge.net Tue Nov 25 02:57:51 2008 From: montanaro at users.sourceforge.net (montanaro at users.sourceforge.net) Date: Tue, 25 Nov 2008 01:57:51 +0000 Subject: [Spambayes-checkins] SF.net SVN: spambayes:[3193] trunk/spambayes/spambayes/Stats.py Message-ID: Revision: 3193 http://spambayes.svn.sourceforge.net/spambayes/?rev=3193&view=rev Author: montanaro Date: 2008-11-25 01:57:51 +0000 (Tue, 25 Nov 2008) Log Message: ----------- pylint Modified Paths: -------------- trunk/spambayes/spambayes/Stats.py Modified: trunk/spambayes/spambayes/Stats.py =================================================================== --- trunk/spambayes/spambayes/Stats.py 2008-11-25 01:57:18 UTC (rev 3192) +++ trunk/spambayes/spambayes/Stats.py 2008-11-25 01:57:51 UTC (rev 3193) @@ -403,6 +403,6 @@ return chunks -if __name__=='__main__': +if __name__ == '__main__': s = Stats() print "\n".join(s.GetStats()) This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. From montanaro at users.sourceforge.net Tue Nov 25 02:59:43 2008 From: montanaro at users.sourceforge.net (montanaro at users.sourceforge.net) Date: Tue, 25 Nov 2008 01:59:43 +0000 Subject: [Spambayes-checkins] SF.net SVN: spambayes:[3194] trunk/spambayes/spambayes/Dibbler.py Message-ID: Revision: 3194 http://spambayes.svn.sourceforge.net/spambayes/?rev=3194&view=rev Author: montanaro Date: 2008-11-25 01:59:43 +0000 (Tue, 25 Nov 2008) Log Message: ----------- Use hashlib.md5 if available. pylint nits. errors go to stderr. 
Modified Paths: -------------- trunk/spambayes/spambayes/Dibbler.py Modified: trunk/spambayes/spambayes/Dibbler.py =================================================================== --- trunk/spambayes/spambayes/Dibbler.py 2008-11-25 01:57:51 UTC (rev 3193) +++ trunk/spambayes/spambayes/Dibbler.py 2008-11-25 01:59:43 UTC (rev 3194) @@ -169,10 +169,14 @@ except ImportError: import StringIO -import sys, re, time, traceback, md5, base64 +import sys, re, time, traceback, base64 import socket, asyncore, asynchat, cgi, urlparse, webbrowser try: + from hashlib import md5 +except ImportError: + from md5 import new as md5 +try: True, False except NameError: # Maintain compatibility with Python 2.2 @@ -283,7 +287,11 @@ self.set_reuse_addr() if type(port) != type(()): port = ('', port) - self.bind(port) + try: + self.bind(port) + except socket.error: + print >> sys.stderr, "port", port, "in use" + raise self.listen(5) def handle_accept(self): @@ -456,7 +464,7 @@ elif authenticationMode == HTTPServer.DIGEST_AUTHENTICATION: authResult = self._digestAuthentication(login, method) else: - print >>sys.stdout, "Unknown mode: %s" % authenticationMode + print >> sys.stderr, "Unknown mode: %s" % authenticationMode if not authResult: self.writeUnauthorizedAccess(serverAuthMode) @@ -625,7 +633,8 @@ """Check if the specified nonce is still valid. A nonce is invalid when its time converted value is lower than current time.""" padAmount = len(nonce) % 4 - if padAmount > 0: padAmount = 4 - padAmount + if padAmount > 0: + padAmount = 4 - padAmount nonce += '=' * (len(nonce) + padAmount) decoded = base64.decodestring(nonce) @@ -650,9 +659,9 @@ # The following computations are based upon RFC 2617. 
A1 = "%s:%s:%s" % (userName, self._server.getRealm(), password) - HA1 = md5.new(A1).hexdigest() + HA1 = md5(A1).hexdigest() A2 = "%s:%s" % (method, stripQuotes(options["uri"])) - HA2 = md5.new(A2).hexdigest() + HA2 = md5(A2).hexdigest() unhashedDigest = "" if options.has_key("qop"): @@ -669,7 +678,7 @@ stripQuotes(options["qop"]), HA2) else: unhashedDigest = "%s:%s:%s" % (HA1, nonce, HA2) - hashedDigest = md5.new(unhashedDigest).hexdigest() + hashedDigest = md5(unhashedDigest).hexdigest() return (stripQuotes(options["response"]) == hashedDigest and self._isValidNonce(nonce)) @@ -735,8 +744,8 @@ def runTestServer(readyEvent=None): """Runs the calendar server example, with an added `/shutdown` URL.""" - import Dibbler, calendar - class Calendar(Dibbler.HTTPPlugin): + import calendar + class Calendar(HTTPPlugin): _form = '''

Calendar Server

Year: @@ -757,12 +766,12 @@ self.close() sys.exit() - httpServer = Dibbler.HTTPServer(8888) + httpServer = HTTPServer(8888) httpServer.register(Calendar()) if readyEvent: # Tell the self-test code that the test server is up and running. readyEvent.set() - Dibbler.run(launchBrowser=True) + run(launchBrowser=True) def test(): """Run a self-test.""" This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. From montanaro at users.sourceforge.net Tue Nov 25 03:00:10 2008 From: montanaro at users.sourceforge.net (montanaro at users.sourceforge.net) Date: Tue, 25 Nov 2008 02:00:10 +0000 Subject: [Spambayes-checkins] SF.net SVN: spambayes:[3195] trunk/spambayes/spambayes/CostCounter.py Message-ID: Revision: 3195 http://spambayes.svn.sourceforge.net/spambayes/?rev=3195&view=rev Author: montanaro Date: 2008-11-25 02:00:09 +0000 (Tue, 25 Nov 2008) Log Message: ----------- pylint nits Modified Paths: -------------- trunk/spambayes/spambayes/CostCounter.py Modified: trunk/spambayes/spambayes/CostCounter.py =================================================================== --- trunk/spambayes/spambayes/CostCounter.py 2008-11-25 01:59:43 UTC (rev 3194) +++ trunk/spambayes/spambayes/CostCounter.py 2008-11-25 02:00:09 UTC (rev 3195) @@ -16,7 +16,7 @@ return "%s: $%.4f" % (self.name, self.total) class CompositeCostCounter: - def __init__(self,cclist): + def __init__(self, cclist): self.clients = cclist def spam(self, scr): @@ -34,10 +34,10 @@ return '\n'.join(s) class DelayedCostCounter(CompositeCostCounter): - def __init__(self,cclist): - CompositeCostCounter.__init__(self,cclist) - self.spamscr=[] - self.hamscr=[] + def __init__(self, cclist): + CompositeCostCounter.__init__(self, cclist) + self.spamscr = [] + self.hamscr = [] def spam(self, scr): self.spamscr.append(scr) @@ -47,10 +47,10 @@ def __str__(self): for scr in self.spamscr: - CompositeCostCounter.spam(self,scr) + CompositeCostCounter.spam(self, scr) for 
scr in self.hamscr: - CompositeCostCounter.ham(self,scr) - s=[] + CompositeCostCounter.ham(self, scr) + s = [] for line in CompositeCostCounter.__str__(self).split('\n'): s.append('Delayed-'+line) return '\n'.join(s) @@ -108,7 +108,7 @@ zd(100.*(self._fp+self._fn),self._total), zd(100.*self._unsure,self._total))) -def zd(x,y): +def zd(x, y): if y > 0: return x / y else: @@ -180,12 +180,12 @@ Flex2CostCounter(), ]) -if __name__=="__main__": - cc=default() +if __name__ == "__main__": + cc = default() cc.ham(0) cc.spam(1) cc.ham(0.5) cc.spam(0.5) - options["Categorization", "spam_cutoff"]=0.7 - options["Categorization", "ham_cutoff"]=0.4 + options["Categorization", "spam_cutoff"] = 0.7 + options["Categorization", "ham_cutoff"] = 0.4 print cc This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. From montanaro at users.sourceforge.net Tue Nov 25 03:02:35 2008 From: montanaro at users.sourceforge.net (montanaro at users.sourceforge.net) Date: Tue, 25 Nov 2008 02:02:35 +0000 Subject: [Spambayes-checkins] SF.net SVN: spambayes:[3196] trunk/spambayes/spambayes/ImageStripper.py Message-ID: Revision: 3196 http://spambayes.svn.sourceforge.net/spambayes/?rev=3196&view=rev Author: montanaro Date: 2008-11-25 02:02:34 +0000 (Tue, 25 Nov 2008) Log Message: ----------- use hashlib.md5 if possible use safepickle functions raise SystemError if ocr engine croaks Modified Paths: -------------- trunk/spambayes/spambayes/ImageStripper.py Modified: trunk/spambayes/spambayes/ImageStripper.py =================================================================== --- trunk/spambayes/spambayes/ImageStripper.py 2008-11-25 02:00:09 UTC (rev 3195) +++ trunk/spambayes/spambayes/ImageStripper.py 2008-11-25 02:02:34 UTC (rev 3196) @@ -8,13 +8,11 @@ import os import tempfile import math -import time -import md5 -import atexit try: - import cPickle as pickle + from hashlib import md5 except ImportError: - import pickle + from md5 
import new as md5 +import atexit try: import cStringIO as StringIO except ImportError: @@ -25,6 +23,8 @@ except ImportError: Image = None +from spambayes.safepickle import pickle_read, pickle_write + # The email mime object carrying the image data can have a special attribute # which indicates that a message had an image, but it was large (ie, larger # than the 'max_image_size' option.) This allows the provider of the email @@ -72,7 +72,7 @@ # C:/Program Files/SpamBayes/bin # so add that directory to the path and make sure we # look for a file ending in ".exe". - if sys.frozen=="dll": + if sys.frozen == "dll": import win32api sentinal = win32api.GetModuleFileName(sys.frozendllhandle) else: @@ -81,7 +81,7 @@ # So just use the short version. # For the sake of safety, in a binary build we *only* look in # our bin dir. - path=[win32api.GetShortPathName(os.path.dirname(sentinal))] + path = [win32api.GetShortPathName(os.path.dirname(sentinal))] else: # a source build - for testing, allow it in SB package dir. 
import spambayes @@ -255,8 +255,8 @@ ret = ocr.read() exit_code = ocr.close() if exit_code: - print "warning:", self.engine_name, "failed with exit code", exit_code - print "command line was:", repr(cmdline) + raise SystemError, ("%s failed with exit code %s" % + (self.engine_name, exit_code)) return ret class OCREngineOCRAD(OCRExecutableEngine): @@ -269,7 +269,7 @@ (self.program, scale, charset, pnmfile, os.path.devnull) class OCREngineGOCR(OCRExecutableEngine): - engine_name="gocr" + engine_name = "gocr" def get_command_line(self, pnmfile): return '%s "%s" 2>%s' % (self.program, pnmfile, os.path.devnull) @@ -302,7 +302,7 @@ def __init__(self, cachefile=""): self.cachefile = os.path.expanduser(cachefile) if os.path.exists(self.cachefile): - self.cache = pickle.load(open(self.cachefile)) + self.cache = pickle_read(self.cachefile) else: self.cache = {} self.misses = self.hits = 0 @@ -315,14 +315,20 @@ textbits = [] tokens = Set() for pnmfile in pnmfiles: - fhash = md5.new(open(pnmfile).read()).hexdigest() + preserve = False + fhash = md5(open(pnmfile).read()).hexdigest() if fhash in self.cache: self.hits += 1 ctext, ctokens = self.cache[fhash] else: self.misses += 1 if self.engine.program: - ctext = self.engine.extract_text(pnmfile).lower() + try: + ctext = self.engine.extract_text(pnmfile).lower() + except SystemError, msg: + print >> sys.stderr, msg + preserve = True + ctext = "" else: # We should not get here if no OCR is enabled. If it # is enabled and we have no program, its OK to spew lots @@ -345,13 +351,14 @@ self.cache[fhash] = (ctext, ctokens) textbits.append(ctext) tokens |= ctokens - os.unlink(pnmfile) + if not preserve: + os.unlink(pnmfile) return "\n".join(textbits), tokens def analyze(self, engine_name, parts): # check engine hasn't changed... 
- if self.engine is not None and self.engine.engine_name!=engine_name: + if self.engine is not None and self.engine.engine_name != engine_name: self.engine = None # check engine exists and is valid if self.engine is None: @@ -385,7 +392,7 @@ print >> sys.stderr, "%.2f%% hit rate" % \ (100 * self.hits / (self.hits + self.misses)), print >> sys.stderr - pickle.dump(self.cache, open(self.cachefile, "wb")) + pickle_write(self.cachefile, self.cache) _cachefile = options["Tokenizer", "crack_image_cache"] crack_images = ImageStripper(_cachefile).analyze This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. From montanaro at users.sourceforge.net Tue Nov 25 03:03:25 2008 From: montanaro at users.sourceforge.net (montanaro at users.sourceforge.net) Date: Tue, 25 Nov 2008 02:03:25 +0000 Subject: [Spambayes-checkins] SF.net SVN: spambayes:[3197] trunk/spambayes/spambayes/oe_mailbox.py Message-ID: Revision: 3197 http://spambayes.svn.sourceforge.net/spambayes/?rev=3197&view=rev Author: montanaro Date: 2008-11-25 02:03:24 +0000 (Tue, 25 Nov 2008) Log Message: ----------- pylint nits Modified Paths: -------------- trunk/spambayes/spambayes/oe_mailbox.py Modified: trunk/spambayes/spambayes/oe_mailbox.py =================================================================== --- trunk/spambayes/spambayes/oe_mailbox.py 2008-11-25 02:02:34 UTC (rev 3196) +++ trunk/spambayes/spambayes/oe_mailbox.py 2008-11-25 02:03:24 UTC (rev 3197) @@ -26,19 +26,20 @@ # Based on C++ work by Arne Schloh +import sys import binascii import os import re import struct -import mailbox -import msgs +import random +from time import * try: import cStringIO as StringIO except ImportError: import StringIO -import sys -from time import * +from spambayes import msgs + try: import win32api import win32con @@ -49,8 +50,7 @@ # Some functions will not work, but some will. 
win32api = win32con = win32gui = shell = shellcon = None -import oe_mailbox -import mboxutils +from spambayes import oe_mailbox ########################################################################### ## DBX FILE HEADER @@ -340,7 +340,8 @@ index = self.dbxBegin[dbxIndex] end = index for c in self.dbxBuffer[index:]: - if ord(c) == 0: break + if ord(c) == 0: + break end += 1 return self.dbxBuffer[index:end] @@ -693,19 +694,18 @@ ########################################################################### def test(): - import sys import getopt try: opts, args = getopt.getopt(sys.argv[1:], 'hp') except getopt.error, msg: - print >>sys.stderr, str(msg) + '\n\n' + __doc__ + print >> sys.stderr, str(msg) + '\n\n' + __doc__ sys.exit() print_message = False for opt, arg in opts: if opt == '-h': - print >>sys.stderr, __doc__ + print >> sys.stderr, __doc__ sys.exit() elif opt == '-p': print_message = True This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. 
From montanaro at users.sourceforge.net Tue Nov 25 03:05:35 2008 From: montanaro at users.sourceforge.net (montanaro at users.sourceforge.net) Date: Tue, 25 Nov 2008 02:05:35 +0000 Subject: [Spambayes-checkins] SF.net SVN: spambayes:[3198] trunk/spambayes/spambayes/dnscache.py Message-ID: Revision: 3198 http://spambayes.svn.sourceforge.net/spambayes/?rev=3198&view=rev Author: montanaro Date: 2008-11-25 02:05:34 +0000 (Tue, 25 Nov 2008) Log Message: ----------- use safepickle functions pylint nits Modified Paths: -------------- trunk/spambayes/spambayes/dnscache.py Modified: trunk/spambayes/spambayes/dnscache.py =================================================================== --- trunk/spambayes/spambayes/dnscache.py 2008-11-25 02:03:24 UTC (rev 3197) +++ trunk/spambayes/spambayes/dnscache.py 2008-11-25 02:05:34 UTC (rev 3198) @@ -6,7 +6,8 @@ # Version 0.1 2004 06 27 # Version 0.11 2004 07 06 Fixed zero division error in __del__ -import DNS # From http://sourceforge.net/projects/pydns/ +# From http://sourceforge.net/projects/pydns/ +import DNS import sys import os @@ -14,31 +15,28 @@ import time import types import socket -try: - import cPickle as pickle -except ImportError: - import pickle from spambayes.Options import options +from spambayes.safepickle import pickle_read, pickle_write -kCheckForPruneEvery=20 -kMaxTTL=60 * 60 * 24 * 7 # One week +kCheckForPruneEvery = 20 +kMaxTTL = 60 * 60 * 24 * 7 # One week # Some servers always return a TTL of zero. We'll hold onto data a bit # longer. 
-kMinTTL=24 * 60 * 60 * 1 # one day -kPruneThreshold=5000 # May go over slightly; numbers chosen at random -kPruneDownTo=2500 +kMinTTL = 24 * 60 * 60 * 1 # one day +kPruneThreshold = 5000 # May go over slightly; numbers chosen at random +kPruneDownTo = 2500 class lookupResult(object): #__slots__=("qType","answer","question","expiresAt","lastUsed") - def __init__(self,qType,answer,question,expiresAt,now): - self.qType=qType - self.answer=answer - self.question=question - self.expiresAt=expiresAt - self.lastUsed=now + def __init__(self, qType, answer, question, expiresAt, now): + self.qType = qType + self.answer = answer + self.question = question + self.expiresAt = expiresAt + self.lastUsed = now return None @@ -68,9 +66,9 @@ class cache: - def __init__(self,dnsServer=None,cachefile=""): + def __init__(self, dnsServer=None, cachefile=""): # These attributes intended for user setting - self.printStatsAtEnd=False + self.printStatsAtEnd = False # As far as I can tell from the standards, # it's legal to have more than one PTR record @@ -83,7 +81,7 @@ # lookups always return a list. Reverse # ("PTR") lookups return a single name unless # this attribute is set to False. 
- self.returnSinglePTR=True + self.returnSinglePTR = True # How long to cache an error as no data self.cacheErrorSecs=5*60 @@ -98,7 +96,7 @@ if self.cachefile and os.path.exists(self.cachefile): try: - self.caches = pickle.load(open(self.cachefile, "rb")) + self.caches = pickle_read(self.cachefile) except: os.unlink(self.cachefile) @@ -118,19 +116,18 @@ self.misses=0 self.pruneTicker=0 - if dnsServer==None: + if dnsServer == None: DNS.DiscoverNameServers() - self.queryObj=DNS.DnsRequest() + self.queryObj = DNS.DnsRequest() else: - self.queryObj=DNS.DnsRequest(server=dnsServer) + self.queryObj = DNS.DnsRequest(server=dnsServer) return None def close(self): if self.printStatsAtEnd: self.printStats() if self.cachefile: - from storage import safe_pickle - safe_pickle(self.cachefile, self.caches) + pickle_write(self.cachefile, self.caches) def printStats(self): for key,val in self.caches.items(): @@ -139,30 +136,30 @@ totAnswers+=len(item) print >> sys.stderr, "cache", key, "has", len(self.caches[key]), print >> sys.stderr, "question(s) and", totAnswers, "answer(s)" - if self.hits+self.misses==0: + if self.hits+self.misses == 0: print >> sys.stderr, "No queries" else: print >> sys.stderr, self.hits, "hits,", self.misses, "misses", print >> sys.stderr, "(%.1f%% hits)" % \ (self.hits/float(self.hits+self.misses)*100) - def prune(self,now): + def prune(self, now): # I want this to be as fast as reasonably possible. # If I didn't, I'd probably do various things differently # Is there a faster way to do this? 
- allAnswers=[] + allAnswers = [] for cache in self.caches.values(): for val in cache.values(): allAnswers += val - allAnswers=sort_by_attr(allAnswers,"expiresAt") + allAnswers = sort_by_attr(allAnswers,"expiresAt") allAnswers.reverse() while True: - if allAnswers[-1].expiresAt>now: + if allAnswers[-1].expiresAt > now: break - answer=allAnswers.pop() - c=self.caches[answer.qType] + answer = allAnswers.pop() + c = self.caches[answer.qType] c[answer.question].remove(answer) if not c[answer.question]: del c[answer.question] @@ -177,12 +174,12 @@ # some entries least-recently-used-wise. I'm not by any means # sure that this is the best strategy, but as yet I don't have # data to test different strategies. - allAnswers=sort_by_attr(allAnswers,"lastUsed") + allAnswers = sort_by_attr(allAnswers, "lastUsed") allAnswers.reverse() - numToDelete=len(allAnswers)-kPruneDownTo - for count in range(numToDelete): - answer=allAnswers.pop() - c=self.caches[answer.qType] + numToDelete = len(allAnswers)-kPruneDownTo + for _count in xrange(numToDelete): + answer = allAnswers.pop() + c = self.caches[answer.qType] c[answer.question].remove(answer) if not c[answer.question]: del c[answer.question] @@ -190,86 +187,88 @@ return None - def formatForReturn(self,listOfObjs): - if len(listOfObjs)==1 and listOfObjs[0].answer==None: + def formatForReturn(self, listOfObjs): + if len(listOfObjs) == 1 and listOfObjs[0].answer == None: return [] - if listOfObjs[0].qType=="PTR" and self.returnSinglePTR: + if listOfObjs[0].qType == "PTR" and self.returnSinglePTR: return listOfObjs[0].answer return [ obj.answer for obj in listOfObjs ] def lookup(self,question,qType="A"): - qType=qType.upper() + qType = qType.upper() if qType not in ("A","PTR"): raise ValueError,"Query type must be one of A, PTR" - now=int(time.time()) + now = int(time.time()) # Finding the len() of a dictionary isn't an expensive operation # but doing it twice for every lookup isn't necessary. 
- self.pruneTicker+=1 - if self.pruneTicker==kCheckForPruneEvery: - self.pruneTicker=0 + self.pruneTicker += 1 + if self.pruneTicker == kCheckForPruneEvery: + self.pruneTicker = 0 if len(self.caches["A"])+len(self.caches["PTR"])>kPruneThreshold: self.prune(now) - cacheToLookIn=self.caches[qType] + cacheToLookIn = self.caches[qType] try: - answers=cacheToLookIn[question] + answers = cacheToLookIn[question] except KeyError: pass else: if answers: - ind=0 + ind = 0 # No guarantee that expire has already been done while ind> sys.stderr, "lookup failure:", question if not answers: del cacheToLookIn[question] else: - self.hits+=1 + self.hits += 1 return self.formatForReturn(answers) # Not in cache or we just expired it - self.misses+=1 + self.misses += 1 - if qType=="PTR": - qList=question.split(".") + if qType == "PTR": + qList = question.split(".") qList.reverse() - queryQuestion=".".join(qList)+".in-addr.arpa" + queryQuestion = ".".join(qList)+".in-addr.arpa" else: - queryQuestion=question + queryQuestion = question # where do we get NXDOMAIN? try: - reply=self.queryObj.req(queryQuestion,qtype=qType,timeout=self.dnsTimeout) + reply = self.queryObj.req(queryQuestion, qtype=qType, + timeout=self.dnsTimeout) except DNS.Base.DNSError,detail: - if detail.args[0]<>"Timeout": + if detail.args[0] != "Timeout": print >> sys.stderr, "Error, fixme", detail print >> sys.stderr, "Question was", queryQuestion print >> sys.stderr, "Original question was", question print >> sys.stderr, "Type was", qType - objs=[ lookupResult(qType,None,question,self.cacheErrorSecs+now,now) ] - cacheToLookIn[question]=objs # Add to format for return? + objs = [lookupResult(qType, None, question, + self.cacheErrorSecs+now, now)] + cacheToLookIn[question] = objs # Add to format for return? 
return self.formatForReturn(objs) except socket.gaierror,detail: print >> sys.stderr, "DNS connection failure:", self.queryObj.ns, detail print >> sys.stderr, "Defaults:", DNS.defaults - objs=[] + objs = [] for answer in reply.answers: - if answer["typename"]==qType: + if answer["typename"] == qType: # PyDNS returns TTLs as longs but RFC 1035 says that the TTL # value is a signed 32-bit value and must be positive, so it # should be safe to coerce it to a Python integer. And @@ -277,22 +276,24 @@ # (68 years and change) is drunk. Arguably, I ought to # impose a maximum rather than continuing with longs # (int(long) returns long in recent versions of Python). - ttl=max(min(int(answer["ttl"]),kMaxTTL),kMinTTL) + ttl = max(min(int(answer["ttl"]), kMaxTTL), kMinTTL) # RFC 2308 says that you should cache an NXDOMAIN for the # minimum of the minimum field of the SOA record and the TTL # of the SOA. - if ttl>0: - item=lookupResult(qType,answer["data"],question,ttl+now,now) + if ttl > 0: + item = lookupResult(qType, answer["data"], question, + ttl+now, now) objs.append(item) if objs: - cacheToLookIn[question]=objs + cacheToLookIn[question] = objs return self.formatForReturn(objs) # Probably SERVFAIL or the like if not reply.authority: - objs=[ lookupResult(qType,None,question,self.cacheErrorSecs+now,now) ] - cacheToLookIn[question]=objs + objs = [lookupResult(qType, None, question, + self.cacheErrorSecs+now, now)] + cacheToLookIn[question] = objs return self.formatForReturn(objs) @@ -303,44 +304,44 @@ # # RFC 2308 specifies that this how to decide how long to cache an # NXDOMAIN. 
- auth=reply.authority[0] - auTTL=int(auth["ttl"]) + auth = reply.authority[0] + auTTL = int(auth["ttl"]) for item in auth["data"]: - if type(item)==types.TupleType and item[0]=="minimum": - auMin=int(item[1]) - cacheNeg=min(auMin,auTTL) + if type(item) == types.TupleType and item[0] == "minimum": + auMin = int(item[1]) + cacheNeg = min(auMin,auTTL) break else: - cacheNeg=auTTL - objs=[ lookupResult(qType,None,question,cacheNeg+now,now) ] + cacheNeg = auTTL + objs = [lookupResult(qType, None, question, cacheNeg+now, now)] - cacheToLookIn[question]=objs + cacheToLookIn[question] = objs return self.formatForReturn(objs) def main(): import transaction - c=cache(cachefile=os.path.expanduser("~/.dnscache")) - c.printStatsAtEnd=True + c = cache(cachefile=os.path.expanduser("~/.dnscache")) + c.printStatsAtEnd = True for host in ["www.python.org", "www.timsbloggers.com", "www.seeputofor.com", "www.completegarbage.tv", "www.tradelinkllc.com"]: print >> sys.stderr, "checking", host - now=time.time() - ips=c.lookup(host) - print >> sys.stderr, ips,time.time()-now - now=time.time() - ips=c.lookup(host) - print >> sys.stderr, ips,time.time()-now + now = time.time() + ips = c.lookup(host) + print >> sys.stderr, ips, time.time()-now + now = time.time() + ips = c.lookup(host) + print >> sys.stderr, ips, time.time()-now if ips: - ip=ips[0] - now=time.time() - name=c.lookup(ip,qType="PTR") - print >> sys.stderr, name,time.time()-now - now=time.time() - name=c.lookup(ip,qType="PTR") - print >> sys.stderr, name,time.time()-now + ip = ips[0] + now = time.time() + name = c.lookup(ip, qType="PTR") + print >> sys.stderr, name, time.time()-now + now = time.time() + name = c.lookup(ip, qType="PTR") + print >> sys.stderr, name, time.time()-now else: print >> sys.stderr, "unknown" @@ -348,5 +349,5 @@ return None -if __name__=="__main__": +if __name__ == "__main__": main() This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. 
From montanaro at users.sourceforge.net Tue Nov 25 03:06:00 2008 From: montanaro at users.sourceforge.net (montanaro at users.sourceforge.net) Date: Tue, 25 Nov 2008 02:06:00 +0000 Subject: [Spambayes-checkins] SF.net SVN: spambayes:[3199] trunk/spambayes/spambayes/MoinSecurityPolicy.py Message-ID: Revision: 3199 http://spambayes.svn.sourceforge.net/spambayes/?rev=3199&view=rev Author: montanaro Date: 2008-11-25 02:06:00 +0000 (Tue, 25 Nov 2008) Log Message: ----------- pylint nits Modified Paths: -------------- trunk/spambayes/spambayes/MoinSecurityPolicy.py Modified: trunk/spambayes/spambayes/MoinSecurityPolicy.py =================================================================== --- trunk/spambayes/spambayes/MoinSecurityPolicy.py 2008-11-25 02:05:34 UTC (rev 3198) +++ trunk/spambayes/spambayes/MoinSecurityPolicy.py 2008-11-25 02:06:00 UTC (rev 3199) @@ -48,7 +48,7 @@ from MoinMoin.PageEditor import PageEditor from spambayes import hammie, storage -from spambayes.tokenizer import Tokenizer, numeric_entity_re, \ +from spambayes.tokenizer import Tokenizer as _Tokenizer, numeric_entity_re, \ numeric_entity_replacer, crack_urls, breaking_entity_re, html_re, \ tokenize_word @@ -191,7 +191,7 @@ """ % locals() sendmail(request, emails, subject, text) -class Tokenizer(Tokenizer): +class Tokenizer(_Tokenizer): def tokenize(self, text): """Tokenize a chunk of text. This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. 
From montanaro at users.sourceforge.net Tue Nov 25 03:06:37 2008 From: montanaro at users.sourceforge.net (montanaro at users.sourceforge.net) Date: Tue, 25 Nov 2008 02:06:37 +0000 Subject: [Spambayes-checkins] SF.net SVN: spambayes:[3200] trunk/spambayes/spambayes/OptionsClass.py Message-ID: Revision: 3200 http://spambayes.svn.sourceforge.net/spambayes/?rev=3200&view=rev Author: montanaro Date: 2008-11-25 02:06:37 +0000 (Tue, 25 Nov 2008) Log Message: ----------- pylint nits Modified Paths: -------------- trunk/spambayes/spambayes/OptionsClass.py Modified: trunk/spambayes/spambayes/OptionsClass.py =================================================================== --- trunk/spambayes/spambayes/OptionsClass.py 2008-11-25 02:06:00 UTC (rev 3199) +++ trunk/spambayes/spambayes/OptionsClass.py 2008-11-25 02:06:37 UTC (rev 3200) @@ -683,11 +683,6 @@ # For the moment, this will do. Use a real mail client, for # goodness sake! if sect == "Headers" and opt in ("notate_to", "notate_subject"): - header_strings = (self.get("Headers", "header_ham_string"), - self.get("Headers", - "header_spam_string"), - self.get("Headers", - "header_unsure_string")) self._options[sect, opt.lower()].set(val) return if self.is_valid(sect, opt, val): @@ -860,9 +855,10 @@ # where number represents the number of CHAR8 octets # but this is too complex for us at the moment. IMAP_ASTRING = [] -for i in range(1, 128): - if not chr(i) in ['"', '\\', '\n', '\r']: +for _i in xrange(1, 128): + if chr(i) not in ['"', '\\', '\n', '\r']: IMAP_ASTRING.append(chr(i)) +del _i IMAP_ASTRING = r"\"?[" + re.escape(''.join(IMAP_ASTRING)) + r"]+\"?" # Similarly, each option must specify whether it should be reset to This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. 
From montanaro at users.sourceforge.net Tue Nov 25 03:07:24 2008 From: montanaro at users.sourceforge.net (montanaro at users.sourceforge.net) Date: Tue, 25 Nov 2008 02:07:24 +0000 Subject: [Spambayes-checkins] SF.net SVN: spambayes:[3201] trunk/spambayes/spambayes/message.py Message-ID: Revision: 3201 http://spambayes.svn.sourceforge.net/spambayes/?rev=3201&view=rev Author: montanaro Date: 2008-11-25 02:07:24 +0000 (Tue, 25 Nov 2008) Log Message: ----------- use safepickle functions pylint nits Modified Paths: -------------- trunk/spambayes/spambayes/message.py Modified: trunk/spambayes/spambayes/message.py =================================================================== --- trunk/spambayes/spambayes/message.py 2008-11-25 02:06:37 UTC (rev 3200) +++ trunk/spambayes/spambayes/message.py 2008-11-25 02:07:24 UTC (rev 3201) @@ -81,7 +81,6 @@ def bool(val): return not not val -import os import sys import types import time @@ -90,13 +89,9 @@ import errno import shelve import warnings -try: - import cPickle as pickle -except ImportError: - import pickle +import cPickle as pickle import traceback -import email import email.Message import email.Parser import email.Header @@ -104,8 +99,9 @@ from spambayes import storage from spambayes import dbmstorage -from spambayes.Options import options, get_pathname_option from spambayes.tokenizer import tokenize +from spambayes.Options import options +from spambayes.safepickle import pickle_read, pickle_write try: import cStringIO as StringIO @@ -220,25 +216,20 @@ def load(self): try: - fp = open(self.db_name, 'rb') + self.db = pickle_read(self.db_name) except IOError, e: if e.errno == errno.ENOENT: # New pickle self.db = {} else: raise - else: - self.db = pickle.load(fp) - fp.close() def close(self): # we keep no resources open - nothing to do pass def store(self): - fp = open(self.db_name, 'wb') - pickle.dump(self.db, fp, self.mode) - fp.close() + pickle_write(self.db_name, self.db, self.mode) class 
MessageInfoDB(MessageInfoBase): def __init__(self, db_name, mode='c'): @@ -264,7 +255,8 @@ def close(self): # Close our underlying database. Better not assume all databases # have close functions! - def noop(): pass + def noop(): + pass getattr(self.db, "close", noop)() getattr(self.dbm, "close", noop)() @@ -403,7 +395,8 @@ def setId(self, id): if self.id and self.id != id: - raise ValueError, "MsgId has already been set, cannot be changed" + `self.id` + `id` + raise ValueError, ("MsgId has already been set," + " cannot be changed %r %r") % (self.id, id) if id is None: raise ValueError, "MsgId must not be None" @@ -453,22 +446,22 @@ def GetClassification(self): if self.c == PERSISTENT_SPAM_STRING: - return options['Headers','header_spam_string'] + return options['Headers', 'header_spam_string'] elif self.c == PERSISTENT_HAM_STRING: - return options['Headers','header_ham_string'] + return options['Headers', 'header_ham_string'] elif self.c == PERSISTENT_UNSURE_STRING: - return options['Headers','header_unsure_string'] + return options['Headers', 'header_unsure_string'] return None def RememberClassification(self, cls): # this must store state independent of options settings, as they # may change, which would really screw this database up - if cls == options['Headers','header_spam_string']: + if cls == options['Headers', 'header_spam_string']: self.c = PERSISTENT_SPAM_STRING - elif cls == options['Headers','header_ham_string']: + elif cls == options['Headers', 'header_ham_string']: self.c = PERSISTENT_HAM_STRING - elif cls == options['Headers','header_unsure_string']: + elif cls == options['Headers', 'header_unsure_string']: self.c = PERSISTENT_UNSURE_STRING else: raise ValueError, \ @@ -508,19 +501,19 @@ def setIdFromPayload(self): try: - self.setId(self[options['Headers','mailid_header_name']]) + self.setId(self[options['Headers', 'mailid_header_name']]) except ValueError: return None return self.id def setDisposition(self, prob): - if prob < 
options['Categorization','ham_cutoff']: - disposition = options['Headers','header_ham_string'] - elif prob > options['Categorization','spam_cutoff']: - disposition = options['Headers','header_spam_string'] + if prob < options['Categorization', 'ham_cutoff']: + disposition = options['Headers', 'header_ham_string'] + elif prob > options['Categorization', 'spam_cutoff']: + disposition = options['Headers', 'header_spam_string'] else: - disposition = options['Headers','header_unsure_string'] + disposition = options['Headers', 'header_unsure_string'] self.RememberClassification(disposition) def addSBHeaders(self, prob, clues): @@ -528,26 +521,26 @@ add optional headers if needed.""" self.setDisposition(prob) disposition = self.GetClassification() - self[options['Headers','classification_header_name']] = disposition + self[options['Headers', 'classification_header_name']] = disposition - if options['Headers','include_score']: + if options['Headers', 'include_score']: disp = "%.*f" % (options["Headers", "header_score_digits"], prob) if options["Headers", "header_score_logarithm"]: - if prob<=0.005 and prob>0.0: - x=-math.log10(prob) - disp += " (%d)"%x - if prob>=0.995 and prob<1.0: - x=-math.log10(1.0-prob) - disp += " (%d)"%x - self[options['Headers','score_header_name']] = disp + if prob <= 0.005 and prob > 0.0: + x = -math.log10(prob) + disp += " (%d)" % x + if prob >= 0.995 and prob < 1.0: + x = -math.log10(1.0-prob) + disp += " (%d)" % x + self[options['Headers', 'score_header_name']] = disp - if options['Headers','include_thermostat']: + if options['Headers', 'include_thermostat']: thermostat = '**********' - self[options['Headers','thermostat_header_name']] = \ + self[options['Headers', 'thermostat_header_name']] = \ thermostat[:int(prob*10)] - if options['Headers','include_evidence']: - hco = options['Headers','clue_mailheader_cutoff'] + if options['Headers', 'include_evidence']: + hco = options['Headers', 'clue_mailheader_cutoff'] sco = 1 - hco evd = [] for word, 
score in clues: @@ -565,7 +558,7 @@ # use email.Header.Header because that can explode with unencoded # non-ASCII characters. We can't use textwrap because that's 2.3. wrappedEvd = [] - headerName = options['Headers','evidence_header_name'] + headerName = options['Headers', 'evidence_header_name'] lineLength = len(headerName) + len(': ') for component, index in zip(evd, range(len(evd))): wrappedEvd.append(component) @@ -578,8 +571,8 @@ lineLength = 8 self[headerName] = "".join(wrappedEvd) - if options['Headers','add_unique_id']: - self[options['Headers','mailid_header_name']] = self.id + if options['Headers', 'add_unique_id']: + self[options['Headers', 'mailid_header_name']] = self.id self.addNotations() @@ -669,13 +662,14 @@ SpamBayes headers. This can be used to restore the values after using the delSBHeaders() function.""" headers = {} - for header_name in [options['Headers','classification_header_name'], - options['Headers','mailid_header_name'], - options['Headers','classification_header_name'] + "-ID", - options['Headers','thermostat_header_name'], - options['Headers','evidence_header_name'], - options['Headers','score_header_name'], - options['Headers','trained_header_name'], + for header_name in [options['Headers', 'classification_header_name'], + options['Headers', 'mailid_header_name'], + (options['Headers', 'classification_header_name'] + + "-ID"), + options['Headers', 'thermostat_header_name'], + options['Headers', 'evidence_header_name'], + options['Headers', 'score_header_name'], + options['Headers', 'trained_header_name'], ]: value = self[header_name] if value is not None: @@ -683,13 +677,13 @@ return headers def delSBHeaders(self): - del self[options['Headers','classification_header_name']] - del self[options['Headers','mailid_header_name']] - del self[options['Headers','classification_header_name'] + "-ID"] # test mode header - del self[options['Headers','thermostat_header_name']] - del self[options['Headers','evidence_header_name']] - del 
self[options['Headers','score_header_name']] - del self[options['Headers','trained_header_name']] + del self[options['Headers', 'classification_header_name']] + del self[options['Headers', 'mailid_header_name']] + del self[options['Headers', 'classification_header_name'] + "-ID"] # test mode header + del self[options['Headers', 'thermostat_header_name']] + del self[options['Headers', 'evidence_header_name']] + del self[options['Headers', 'score_header_name']] + del self[options['Headers', 'trained_header_name']] # Also delete notations - typically this is called just before # training, and we don't want them there for that. self.delNotations() This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. From montanaro at users.sourceforge.net Tue Nov 25 03:07:43 2008 From: montanaro at users.sourceforge.net (montanaro at users.sourceforge.net) Date: Tue, 25 Nov 2008 02:07:43 +0000 Subject: [Spambayes-checkins] SF.net SVN: spambayes:[3202] trunk/spambayes/spambayes/Corpus.py Message-ID: Revision: 3202 http://spambayes.svn.sourceforge.net/spambayes/?rev=3202&view=rev Author: montanaro Date: 2008-11-25 02:07:43 +0000 (Tue, 25 Nov 2008) Log Message: ----------- pylint Modified Paths: -------------- trunk/spambayes/spambayes/Corpus.py Modified: trunk/spambayes/spambayes/Corpus.py =================================================================== --- trunk/spambayes/spambayes/Corpus.py 2008-11-25 02:07:24 UTC (rev 3201) +++ trunk/spambayes/spambayes/Corpus.py 2008-11-25 02:07:43 UTC (rev 3202) @@ -276,4 +276,4 @@ if __name__ == '__main__': - print >>sys.stderr, __doc__ + print >> sys.stderr, __doc__ This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. 
From montanaro at users.sourceforge.net Tue Nov 25 03:08:00 2008 From: montanaro at users.sourceforge.net (montanaro at users.sourceforge.net) Date: Tue, 25 Nov 2008 02:08:00 +0000 Subject: [Spambayes-checkins] SF.net SVN: spambayes:[3203] trunk/spambayes/spambayes/FileCorpus.py Message-ID: Revision: 3203 http://spambayes.svn.sourceforge.net/spambayes/?rev=3203&view=rev Author: montanaro Date: 2008-11-25 02:08:00 +0000 (Tue, 25 Nov 2008) Log Message: ----------- pylint nits Modified Paths: -------------- trunk/spambayes/spambayes/FileCorpus.py Modified: trunk/spambayes/spambayes/FileCorpus.py =================================================================== --- trunk/spambayes/spambayes/FileCorpus.py 2008-11-25 02:07:43 UTC (rev 3202) +++ trunk/spambayes/spambayes/FileCorpus.py 2008-11-25 02:08:00 UTC (rev 3203) @@ -92,7 +92,7 @@ raise ValueError if options["globals", "verbose"]: - print 'adding',message.key(),'to corpus' + print 'adding', message.key(), 'to corpus' message.directory = self.directory message.store() @@ -103,7 +103,7 @@ def removeMessage(self, message, observer_flags=0): '''Remove a Message from this corpus''' if options["globals", "verbose"]: - print 'removing',message.key(),'from corpus' + print 'removing', message.key(), 'from corpus' message.remove() @@ -243,7 +243,7 @@ def remove(self): '''Message hara-kiri''' if options["globals", "verbose"]: - print 'physically deleting file',self.pathname() + print 'physically deleting file', self.pathname() try: os.unlink(self.pathname()) except OSError: This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. 
From montanaro at users.sourceforge.net Tue Nov 25 03:12:00 2008 From: montanaro at users.sourceforge.net (montanaro at users.sourceforge.net) Date: Tue, 25 Nov 2008 02:12:00 +0000 Subject: [Spambayes-checkins] SF.net SVN: spambayes:[3205] trunk/spambayes/spambayes/OptionsClass.py Message-ID: Revision: 3205 http://spambayes.svn.sourceforge.net/spambayes/?rev=3205&view=rev Author: montanaro Date: 2008-11-25 02:11:59 +0000 (Tue, 25 Nov 2008) Log Message: ----------- too fast on the trigger finger Modified Paths: -------------- trunk/spambayes/spambayes/OptionsClass.py Modified: trunk/spambayes/spambayes/OptionsClass.py =================================================================== --- trunk/spambayes/spambayes/OptionsClass.py 2008-11-25 02:10:28 UTC (rev 3204) +++ trunk/spambayes/spambayes/OptionsClass.py 2008-11-25 02:11:59 UTC (rev 3205) @@ -856,8 +856,8 @@ # but this is too complex for us at the moment. IMAP_ASTRING = [] for _i in xrange(1, 128): - if chr(i) not in ['"', '\\', '\n', '\r']: - IMAP_ASTRING.append(chr(i)) + if chr(_i) not in ['"', '\\', '\n', '\r']: + IMAP_ASTRING.append(chr(_i)) del _i IMAP_ASTRING = r"\"?[" + re.escape(''.join(IMAP_ASTRING)) + r"]+\"?" This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. 
From montanaro at users.sourceforge.net Tue Nov 25 03:10:29 2008 From: montanaro at users.sourceforge.net (montanaro at users.sourceforge.net) Date: Tue, 25 Nov 2008 02:10:29 +0000 Subject: [Spambayes-checkins] SF.net SVN: spambayes:[3204] trunk/spambayes Message-ID: Revision: 3204 http://spambayes.svn.sourceforge.net/spambayes/?rev=3204&view=rev Author: montanaro Date: 2008-11-25 02:10:28 +0000 (Tue, 25 Nov 2008) Log Message: ----------- use safepickle functions, pylint nits Modified Paths: -------------- trunk/spambayes/CHANGELOG.txt trunk/spambayes/contrib/SmarterHTTPServer.py trunk/spambayes/contrib/bulkgraph.py trunk/spambayes/contrib/findbest.py trunk/spambayes/contrib/mod_spambayes.py trunk/spambayes/contrib/nway.py trunk/spambayes/contrib/pycksum.py trunk/spambayes/contrib/sb_culler.py trunk/spambayes/contrib/spamcounts.py trunk/spambayes/contrib/tte.py trunk/spambayes/pspam/pop.py trunk/spambayes/pspam/scoremsg.py trunk/spambayes/pspam/update.py trunk/spambayes/scripts/core_server.py trunk/spambayes/scripts/sb_bnfilter.py trunk/spambayes/scripts/sb_bnserver.py trunk/spambayes/scripts/sb_client.py trunk/spambayes/scripts/sb_dbexpimp.py trunk/spambayes/scripts/sb_imapfilter.py trunk/spambayes/scripts/sb_mailsort.py trunk/spambayes/scripts/sb_mboxtrain.py trunk/spambayes/scripts/sb_notesfilter.py trunk/spambayes/scripts/sb_pop3dnd.py trunk/spambayes/scripts/sb_server.py trunk/spambayes/scripts/sb_upload.py trunk/spambayes/scripts/sb_xmlrpcserver.py trunk/spambayes/setup.py trunk/spambayes/spambayes/ImapUI.py trunk/spambayes/spambayes/Options.py trunk/spambayes/spambayes/Version.py trunk/spambayes/spambayes/__init__.py trunk/spambayes/spambayes/chi2.py trunk/spambayes/spambayes/optimize.py trunk/spambayes/spambayes/storage.py trunk/spambayes/spambayes/tokenizer.py trunk/spambayes/testtools/es2hs.py trunk/spambayes/testtools/incremental.py trunk/spambayes/utilities/HistToGNU.py trunk/spambayes/utilities/convert_config_file.py 
trunk/spambayes/utilities/convert_db.py trunk/spambayes/utilities/extractmessages.py trunk/spambayes/utilities/hammer.py trunk/spambayes/utilities/loosecksum.py trunk/spambayes/utilities/mboxcount.py trunk/spambayes/utilities/mkreversemap.py trunk/spambayes/utilities/split.py trunk/spambayes/utilities/splitn.py trunk/spambayes/utilities/splitndirs.py trunk/spambayes/windows/autoconfigure.py trunk/spambayes/windows/pop3proxy_service.py Modified: trunk/spambayes/CHANGELOG.txt =================================================================== --- trunk/spambayes/CHANGELOG.txt 2008-11-25 02:08:00 UTC (rev 3203) +++ trunk/spambayes/CHANGELOG.txt 2008-11-25 02:10:28 UTC (rev 3204) @@ -1,5 +1,11 @@ [Note that all dates are in ISO 8601 format, e.g. YYYY-MM-DD to ease sorting] +Release 1.1b1 +============= + +Skip Montanaro 2008-11-23 Route all pickle reads and writes through safepickle module. +Skip Montanaro 2008-11-23 Pick off a bunch of pylint nit (still tons to do). + Release 1.1a5 ============= Modified: trunk/spambayes/contrib/SmarterHTTPServer.py =================================================================== --- trunk/spambayes/contrib/SmarterHTTPServer.py 2008-11-25 02:08:00 UTC (rev 3203) +++ trunk/spambayes/contrib/SmarterHTTPServer.py 2008-11-25 02:10:28 UTC (rev 3204) @@ -20,7 +20,6 @@ import SimpleHTTPServer import urllib import cgi -import shutil import mimetypes import re try: Modified: trunk/spambayes/contrib/bulkgraph.py =================================================================== --- trunk/spambayes/contrib/bulkgraph.py 2008-11-25 02:08:00 UTC (rev 3203) +++ trunk/spambayes/contrib/bulkgraph.py 2008-11-25 02:10:28 UTC (rev 3204) @@ -32,15 +32,15 @@ quiet mode; no output """ -import mboxutils import getopt -import hammie import sys import os import re import time import filecmp +from spambayes import mboxutils, hammie + program = sys.argv[0] loud = True day = 24 * 60 * 60 Modified: trunk/spambayes/contrib/findbest.py 
=================================================================== --- trunk/spambayes/contrib/findbest.py 2008-11-25 02:08:00 UTC (rev 3203) +++ trunk/spambayes/contrib/findbest.py 2008-11-25 02:10:28 UTC (rev 3204) @@ -66,7 +66,6 @@ import sys import os -import cPickle as pickle import getopt import math @@ -75,6 +74,8 @@ from spambayes.hammie import Hammie from spambayes.tokenizer import tokenize from spambayes.Options import options +from spambayes import storage +from spambayes.safepickle import pickle_read, pickle_write cls = Classifier() h = Hammie(cls) @@ -98,7 +99,6 @@ def score(unsure, h, cls, scores, msgids=None, skipspam=False): """See what effect on others each msg in unsure has""" - ham_cutoff = options["Categorization", "ham_cutoff"] spam_cutoff = options["Categorization", "spam_cutoff"] # compute a base - number of messages in unsure already in the @@ -223,7 +223,7 @@ print "scoring" if best: - last_scores = pickle.load(file(bestfile)) + last_scores = pickle_read(bestfile) last_scores = last_scores.items() last_scores.sort() msgids = set() @@ -240,7 +240,7 @@ pass if not best: - pickle.dump(scores, file(bestfile, 'w')) + pickle_write(bestfile, scores) return 0 Modified: trunk/spambayes/contrib/mod_spambayes.py =================================================================== --- trunk/spambayes/contrib/mod_spambayes.py 2008-11-25 02:08:00 UTC (rev 3203) +++ trunk/spambayes/contrib/mod_spambayes.py 2008-11-25 02:10:28 UTC (rev 3204) @@ -5,26 +5,24 @@ ## Author: Skip Montanaro ## -import os - from proxy3_filter import * import proxy3_options -from spambayes import hammie, Options, mboxutils +from spambayes import hammie, Options dbf = Options.get_pathname_option("Storage", "persistent_storage_file") class SpambayesFilter(BufferAllFilter): - hammie = hammie.open(dbf, 1, 'r') + checker = hammie.open(dbf, 1, 'r') def filter(self, s): if self.reply.split()[1] == '200': - prob = self.hammie.score("%s\r\n%s" % (self.serverheaders, s)) + prob = 
self.checker.score("%s\r\n%s" % (self.serverheaders, s)) print "| prob: %.5f" % prob if prob >= Options.options["Categorization", "spam_cutoff"]: print self.serverheaders print "text:", s[0:40], "...", s[-40:] return "not authorized" - return s + return s from proxy3_util import * Modified: trunk/spambayes/contrib/nway.py =================================================================== --- trunk/spambayes/contrib/nway.py 2008-11-25 02:08:00 UTC (rev 3203) +++ trunk/spambayes/contrib/nway.py 2008-11-25 02:10:28 UTC (rev 3204) @@ -70,7 +70,7 @@ prog = os.path.basename(sys.argv[0]) -def help(): +def usage(): print >> sys.stderr, __doc__ % globals() def main(args): @@ -78,10 +78,9 @@ for opt, arg in opts: if opt == '-h': - help() + usage() return 0 - tagdb_list = [] msg = mboxutils.get_message(sys.stdin) try: del msg["X-Spambayes-Classification"] Modified: trunk/spambayes/contrib/pycksum.py =================================================================== --- trunk/spambayes/contrib/pycksum.py 2008-11-25 02:08:00 UTC (rev 3203) +++ trunk/spambayes/contrib/pycksum.py 2008-11-25 02:10:28 UTC (rev 3204) @@ -97,12 +97,12 @@ body = text.split("\n\n", 1)[1] lines = clean(body).split("\n") chunksize = len(lines)//4+1 - sum = [] + digest = [] for i in range(4): chunk = "\n".join(lines[i*chunksize:(i+1)*chunksize]) - sum.append(md5.new(chunk).hexdigest()) + digest.append(md5.new(chunk).hexdigest()) - return ".".join(sum) + return ".".join(digest) def save_checksum(cksum, f): pieces = cksum.split('.') @@ -118,12 +118,12 @@ if not db.has_key(subsum): db[subsum] = str(time.time()) if len(db) > maxdblen: - items = [(float(db[k]),k) for k in db.keys()] + items = [(float(db[k]), k) for k in db.keys()] items.sort() # the -20 brings us down a bit below the max so we aren't # constantly running this chunk of code items = items[:-(maxdblen-20)] - for v,k in items: + for v, k in items: del db[k] else: result = 0 Modified: trunk/spambayes/contrib/sb_culler.py 
=================================================================== --- trunk/spambayes/contrib/sb_culler.py 2008-11-25 02:08:00 UTC (rev 3203) +++ trunk/spambayes/contrib/sb_culler.py 2008-11-25 02:10:28 UTC (rev 3204) @@ -30,20 +30,23 @@ This program requires Python 2.3 or newer. """ -import sets, traceback, md5, os +import socket +socket.setdefaulttimeout(10) + +import traceback, md5, os import poplib import posixpath + +import sets from email import Header, Utils from spambayes import mboxutils, hammie +from spambayes.Options import options -import socket -socket.setdefaulttimeout(10) - DO_ACTIONS = 1 VERBOSE_LEVEL = 1 APPEND_TO_FILE = "append_to_file" -DELETE = "delete" +DELETE_FROM_MAILBOX = "delete" KEEP_IN_MAILBOX = "keep in mailbox" SPAM = "spam" VIRUS = "virus" @@ -108,7 +111,7 @@ def DELETE(mi, log): """Action: delete message from mailbox""" - log.do_action(DELETE) + log.do_action(DELETE_FROM_MAILBOX) if not DO_ACTIONS: return mi.mailbox.dele(mi.i) Modified: trunk/spambayes/contrib/spamcounts.py =================================================================== --- trunk/spambayes/contrib/spamcounts.py 2008-11-25 02:08:00 UTC (rev 3203) +++ trunk/spambayes/contrib/spamcounts.py 2008-11-25 02:10:28 UTC (rev 3204) @@ -19,13 +19,11 @@ import getopt import re import sets -import os -import shelve import csv -from spambayes.Options import options, get_pathname_option +from spambayes.Options import options from spambayes.tokenizer import tokenize -from spambayes.storage import STATE_KEY, database_type, open_storage +from spambayes.storage import database_type, open_storage prog = sys.argv[0] Modified: trunk/spambayes/contrib/tte.py =================================================================== --- trunk/spambayes/contrib/tte.py 2008-11-25 02:08:00 UTC (rev 3203) +++ trunk/spambayes/contrib/tte.py 2008-11-25 02:10:28 UTC (rev 3204) @@ -100,7 +100,7 @@ def train(store, hambox, spambox, maxmsgs, maxrounds, tdict, reverse, verbose, ratio): - smisses = 
hmisses = round = 0 + round = 0 ham_cutoff = Options.options["Categorization", "ham_cutoff"] spam_cutoff = Options.options["Categorization", "spam_cutoff"] @@ -114,19 +114,19 @@ hambone_ = list(reversed(hambone_)) spamcan_ = list(reversed(spamcan_)) - nspam,nham = len(spamcan_),len(hambone_) + nspam, nham = len(spamcan_), len(hambone_) if ratio: - rspam,rham = ratio + rspam, rham = ratio # If the actual ratio of spam to ham in the database is better than # what was asked for, use that better ratio. if (rspam > rham) == (rspam * nham > rham * nspam): - rspam,rham = nspam,nham + rspam, rham = nspam, nham # define some indexing constants ham = 0 spam = 1 name = ('ham','spam') - misses = [0,0] + misses = [0, 0] misclassified = lambda is_spam, score: ( is_spam and score < spam_cutoff or not is_spam and score > ham_cutoff) @@ -140,9 +140,9 @@ hambone = iter(hambone_) spamcan = iter(spamcan_) - i = [0,0] + i = [0, 0] msgs_processed = 0 - misses = [0,0] + misses = [0, 0] training_sets = [hambone, spamcan] while not maxmsgs or msgs_processed < maxmsgs: @@ -153,7 +153,7 @@ try: train_msg = training_sets[train_spam].next() except StopIteration: - break; + break i[train_spam] += 1 msgs_processed += 1 @@ -164,7 +164,7 @@ score = store.spamprob(tokens) selector = train_msg["message-id"] or train_msg["subject"] - if misclassified(train_spam,score) and selector is not None: + if misclassified(train_spam, score) and selector is not None: if verbose: print >> sys.stderr, "\tmiss %s: %.6f %s" % ( name[train_spam], score, selector) @@ -179,24 +179,25 @@ print "\rround: %2d, msgs: %4d, ham misses: %3d, spam misses: %3d, %.1fs" % \ (round, msgs_processed, misses[0], misses[1], seconds) - training_sets = [hambone,spamcan] + training_sets = [hambone, spamcan] # We count all untrained messages so the user knows what was skipped. 
# We also tag them for saving so we don't lose messages which might have # value in a future run - for is_spam in ham,spam: + for is_spam in ham, spam: nleft = 0 try: while True: msg = training_sets[is_spam].next() score = store.spamprob(tokenize(msg)) - if misclassified(is_spam,score): + if misclassified(is_spam, score): tdict[msg["message-id"]] = True nleft += 1 except StopIteration: - if nleft: print nleft, "untrained %ss" % name[is_spam] + if nleft: + print nleft, "untrained %ss" % name[is_spam] def cull(mbox_name, cullext, designation, tdict): print "writing new %s mbox..." % designation Modified: trunk/spambayes/pspam/pop.py =================================================================== --- trunk/spambayes/pspam/pop.py 2008-11-25 02:08:00 UTC (rev 3203) +++ trunk/spambayes/pspam/pop.py 2008-11-25 02:10:28 UTC (rev 3204) @@ -27,7 +27,6 @@ """ import SocketServer -import asyncore try: import cStringIO as StringIO except ImportError: @@ -37,11 +36,8 @@ import re import socket import sys -import threading import time -import ZODB -from ZEO.ClientStorage import ClientStorage import zLOG from spambayes.tokenizer import tokenize Modified: trunk/spambayes/pspam/scoremsg.py =================================================================== --- trunk/spambayes/pspam/scoremsg.py 2008-11-25 02:08:00 UTC (rev 3203) +++ trunk/spambayes/pspam/scoremsg.py 2008-11-25 02:10:28 UTC (rev 3204) @@ -6,11 +6,7 @@ import locale from types import UnicodeType -import ZODB -from ZEO.ClientStorage import ClientStorage - import pspam.database -from spambayes.Options import options from spambayes.tokenizer import tokenize try: Modified: trunk/spambayes/pspam/update.py =================================================================== --- trunk/spambayes/pspam/update.py 2008-11-25 02:08:00 UTC (rev 3203) +++ trunk/spambayes/pspam/update.py 2008-11-25 02:10:28 UTC (rev 3204) @@ -2,9 +2,6 @@ import os import sys -import ZODB -from ZEO.ClientStorage import ClientStorage - import 
pspam.database from pspam.profile import Profile Modified: trunk/spambayes/scripts/core_server.py =================================================================== --- trunk/spambayes/scripts/core_server.py 2008-11-25 02:08:00 UTC (rev 3203) +++ trunk/spambayes/scripts/core_server.py 2008-11-25 02:10:28 UTC (rev 3204) @@ -94,11 +94,9 @@ o Graphs. Of something. Who cares what? """ -import sys, getopt, time -from email.Header import Header +import sys, getopt from spambayes import Dibbler -from spambayes import storage from spambayes.Options import options, _ from spambayes.UserInterface import UserInterfaceServer from spambayes.Version import get_current_version Modified: trunk/spambayes/scripts/sb_bnfilter.py =================================================================== --- trunk/spambayes/scripts/sb_bnfilter.py 2008-11-25 02:08:00 UTC (rev 3203) +++ trunk/spambayes/scripts/sb_bnfilter.py 2008-11-25 02:10:28 UTC (rev 3204) @@ -157,40 +157,44 @@ if error: sys.exit(error) -def make_socket(server_options, file): +def make_socket(server_options, filename): refused_count = 0 no_server_count = 0 while 1: try: - s = socket.socket(socket.AF_UNIX,socket.SOCK_STREAM) - s.connect(file) + s = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) + s.connect(filename) except socket.error,e: if e[0] == errno.EAGAIN: # baaah pass - elif e[0] == errno.ENOENT or not os.path.exists(file): - # We need to check os.path.exists for use on operating systems that - # never return ENOENT; linux 2.2. + elif e[0] == errno.ENOENT or not os.path.exists(filename): + # We need to check os.path.exists for use on operating + # systems that never return ENOENT; linux 2.2. # # no such file.... no such server. create one. no_server_count += 1 - if no_server_count>4: + if no_server_count > 4: raise + # Reset refused count to start the sleep process over. + # Otherwise we run the risk of waiting a *really* long time + # and/or hitting the refused_count limit. 
+ refused_count = 0 fork_server(server_options) elif e[0] == errno.ECONNREFUSED: # socket file exists but noone listening. refused_count += 1 - if refused_count == 6: + if refused_count == 4: # We have been waiting ages and still havent been able # to connect. Maybe that socket file has got # orphaned. remove it, wait, and try again. We need to # allow enough time for sb_bnserver to initialise the # rest of spambayes try: - os.unlink(file) + os.unlink(filename) except EnvironmentError: pass - elif refused_count>6: + elif refused_count > 6: raise else: raise # some other problem @@ -212,9 +216,9 @@ os.setsid() # Use exec rather than import here because eventually it may be nice to # reimplement this one file in C - os.execv(sys.executable,[sys.executable, - os.path.join(os.path.split(sys.argv[0])[0], - 'sb_bnserver.py') ]+options) + os.execv(sys.executable, [sys.executable, + os.path.join(os.path.split(sys.argv[0])[0], + 'sb_bnserver.py') ]+options) # should never get here sys._exit(1) Modified: trunk/spambayes/scripts/sb_bnserver.py =================================================================== --- trunk/spambayes/scripts/sb_bnserver.py 2008-11-25 02:08:00 UTC (rev 3203) +++ trunk/spambayes/scripts/sb_bnserver.py 2008-11-25 02:10:28 UTC (rev 3204) @@ -27,7 +27,7 @@ unix domain socket used on which we listen """ -import os, getopt, sys, SocketServer, time, traceback, select, socket, errno +import os, getopt, sys, SocketServer, traceback, select, socket, errno try: True, False @@ -63,7 +63,7 @@ try: server = BNServer(args[0], BNRequest) except socket.error,e: - if e[0]==errno.EADDRINUSE: + if e[0] == errno.EADDRINUSE: pass # in use, no need else: raise # a real error @@ -108,7 +108,7 @@ pass def get_request(self): - r,w,e = select.select([self.socket], [], [], self.timeout) + r, w, e = select.select([self.socket], [], [], self.timeout) if r: return self.socket.accept() else: @@ -119,15 +119,16 @@ switches = self.rfile.readline() body = self.rfile.read() try: - 
response = self._calc_response(switches,body) + response = self._calc_response(switches, body) self.wfile.write('0\n%d\n'%(len(response),)) self.wfile.write(response) except: - response = traceback.format_exception_only(sys.exc_info()[0],sys.exc_info()[1])[0] + response = traceback.format_exception_only(sys.exc_info()[0], + sys.exc_info()[1])[0] self.wfile.write('1\n%d\n'%(len(response),)) self.wfile.write(response) - def _calc_response(self,switches,body): + def _calc_response(self, switches, body): switches = switches.split() actions = [] opts, args = getopt.getopt(switches, 'fgstGS') Modified: trunk/spambayes/scripts/sb_client.py =================================================================== --- trunk/spambayes/scripts/sb_client.py 2008-11-25 02:08:00 UTC (rev 3203) +++ trunk/spambayes/scripts/sb_client.py 2008-11-25 02:10:28 UTC (rev 3204) @@ -10,7 +10,7 @@ import xmlrpclib import sys -RPCBASE="http://localhost:65000" +RPCBASE = "http://localhost:65000" def main(): msg = sys.stdin.read() Modified: trunk/spambayes/scripts/sb_dbexpimp.py =================================================================== --- trunk/spambayes/scripts/sb_dbexpimp.py 2008-11-25 02:08:00 UTC (rev 3203) +++ trunk/spambayes/scripts/sb_dbexpimp.py 2008-11-25 02:10:28 UTC (rev 3204) @@ -101,8 +101,7 @@ import spambayes.storage from spambayes.Options import options -import sys, os, getopt, errno, re -import urllib +import sys, os, getopt, errno from types import UnicodeType def uquote(s): @@ -137,8 +136,8 @@ writer = csv.writer(fp) - nham = bayes.nham; - nspam = bayes.nspam; + nham = bayes.nham + nspam = bayes.nspam print "Exporting database %s to file %s" % (dbFN, outFN) print "Database has %s ham, %s spam, and %s words" \ @@ -215,7 +214,7 @@ try: opts, args = getopt.getopt(sys.argv[1:], 'iehmvd:p:f:o:') except getopt.error, msg: - print >>sys.stderr, str(msg) + '\n\n' + __doc__ + print >> sys.stderr, str(msg) + '\n\n' + __doc__ sys.exit() useDBM = "pickle" @@ -227,7 +226,7 @@ for 
opt, arg in opts: if opt == '-h': - print >>sys.stderr, __doc__ + print >> sys.stderr, __doc__ sys.exit() elif opt == '-f': flatFN = arg @@ -247,4 +246,4 @@ if imp: runImport(dbFN, useDBM, newDBM, flatFN) else: - print >>sys.stderr, __doc__ + print >> sys.stderr, __doc__ Modified: trunk/spambayes/scripts/sb_imapfilter.py =================================================================== --- trunk/spambayes/scripts/sb_imapfilter.py 2008-11-25 02:08:00 UTC (rev 3203) +++ trunk/spambayes/scripts/sb_imapfilter.py 2008-11-25 02:10:28 UTC (rev 3204) @@ -110,7 +110,7 @@ else: temp_dir = win32api.GetTempPath() status = "Log file opened in " + temp_dir - for i in range(3,0,-1): + for i in range(3, 0, -1): try: os.unlink(os.path.join(temp_dir, "SpamBayesIMAP%d.log" % (i+1))) except os.error: @@ -132,20 +132,15 @@ import getopt import types import thread -import traceback import email import email.Parser from getpass import getpass from email.Utils import parsedate -try: - import cStringIO as StringIO -except ImportError: - import StringIO from spambayes import Stats from spambayes import message -from spambayes.Options import options, get_pathname_option, optionsPathname -from spambayes import tokenizer, storage, Dibbler +from spambayes.Options import options, optionsPathname +from spambayes import storage, Dibbler from spambayes.UserInterface import UserInterfaceServer from spambayes.ImapUI import IMAPUserInterface, LoginFailure @@ -177,7 +172,6 @@ timeout = 60 # seconds def __init__(self, server, debug=0, do_expunge = options["imap", "expunge"] ): - if server.find(':') > -1: server, port = server.split(':', 1) port = int(port) @@ -494,7 +488,7 @@ class IMAPMessage(message.SBHeaderMessage): def __init__(self): - message.Message.__init__(self) + message.SBHeaderMessage.__init__(self) self.folder = None self.previous_folder = None self.rfc822_command = "(BODY.PEEK[])" @@ -548,7 +542,7 @@ # Can't select the folder, so getting the substance will not # work. 
self.could_not_retrieve = True - print >>sys.stderr, "Could not select folder %s for message " \ + print >> sys.stderr, "Could not select folder %s for message " \ "%s (uid %s)" % (self.folder.name, self.id, self.uid) return self @@ -571,7 +565,7 @@ # characters for classification. For now, we just carry on, # warning the user and ignoring the message. self.could_not_retrieve = True - print >>sys.stderr, "MemoryError with message %s (uid %s)" % \ + print >> sys.stderr, "MemoryError with message %s (uid %s)" % \ (self.id, self.uid) return self @@ -614,7 +608,7 @@ self.got_substance = True # Print the exception and a traceback. - print >>sys.stderr, details + print >> sys.stderr, details return self @@ -660,7 +654,7 @@ We can't actually update the message with IMAP, so what we do is create a new message and delete the old one.""" - assert self.folder is not None,\ + assert self.folder is not None, \ "Can't save a message that doesn't have a folder." assert self.id, "Can't save a message that doesn't have an id." assert self.imap_server, "Can't do anything without IMAP connection." 
@@ -733,7 +727,8 @@ data = self.imap_server.check_response("recent", response) if data[0] is not None: if options["globals", "verbose"]: - print "[imapfilter] found saved message %s in iteration" % self.uid, i + print "[imapfilter] found saved message", self.uid, + print "in iteration", i break else: if options["globals", "verbose"]: @@ -963,7 +958,7 @@ cls = msg.GetClassification() if cls is None or hamfolder is not None: if options["globals", "verbose"]: - print "[imapfilter] classified as %s:"%cls, msg.uid + print "[imapfilter] classified as %s:" % cls, msg.uid msg = msg.get_full_message() if msg.could_not_retrieve: @@ -1140,13 +1135,13 @@ for u in usernames: pwds.append(getpass("Enter password for %s:" % (u,))) - return zip(servers,usernames,pwds) + return zip(servers, usernames, pwds) def run(force_UI=False): try: opts, args = getopt.getopt(sys.argv[1:], 'hbPtcvl:e:i:d:p:o:') except getopt.error, msg: - print >>sys.stderr, str(msg) + '\n\n' + __doc__ + print >> sys.stderr, str(msg) + '\n\n' + __doc__ sys.exit() doTrain = False @@ -1159,7 +1154,7 @@ for opt, arg in opts: if opt == '-h': - print >>sys.stderr, __doc__ + print >> sys.stderr, __doc__ sys.exit() elif opt == "-b": launchUI = True @@ -1248,7 +1243,7 @@ IMAPSession, stats=stats, close_db=close_db, change_db=change_db)) - launchBrowser=launchUI or options["html_ui", "launch_browser"] + launchBrowser = launchUI or options["html_ui", "launch_browser"] if sleepTime: # Run in a separate thread, as we have more work to do. 
thread.start_new_thread(Dibbler.run, (), Modified: trunk/spambayes/scripts/sb_mailsort.py =================================================================== --- trunk/spambayes/scripts/sb_mailsort.py 2008-11-25 02:08:00 UTC (rev 3203) +++ trunk/spambayes/scripts/sb_mailsort.py 2008-11-25 02:10:28 UTC (rev 3204) @@ -25,7 +25,7 @@ import time import signal import socket -import email +import errno DB_FILE = os.path.expanduser(DB_FILE) @@ -65,7 +65,7 @@ try: fd = os.open(pathname, os.O_WRONLY|os.O_CREAT|os.O_EXCL, 0600) except IOError, exc: - if exc[i] not in (errno.EINT, errno.EEXIST): + if exc[0] not in (errno.EINT, errno.EEXIST): raise else: break @@ -142,7 +142,7 @@ prob, evidence = bayes.spamprob(tokenize(msg), evidence=True) print msg_name, prob for word, prob in evidence: - print ' ', `word`, prob + print ' ', repr(word), prob def main(): global DB_FILE, CONFIG_FILE Modified: trunk/spambayes/scripts/sb_mboxtrain.py =================================================================== --- trunk/spambayes/scripts/sb_mboxtrain.py 2008-11-25 02:08:00 UTC (rev 3203) +++ trunk/spambayes/scripts/sb_mboxtrain.py 2008-11-25 02:10:28 UTC (rev 3204) @@ -111,7 +111,8 @@ def maildir_train(h, path, is_spam, force, removetrained): """Train bayes with all messages from a maildir.""" - if loud: print " Reading %s as Maildir" % (path,) + if loud: + print " Reading %s as Maildir" % (path,) import time import socket @@ -162,7 +163,8 @@ def mbox_train(h, path, is_spam, force): """Train bayes with a Unix mbox""" - if loud: print " Reading as Unix mbox" + if loud: + print " Reading as Unix mbox" import mailbox import fcntl @@ -219,7 +221,8 @@ def mhdir_train(h, path, is_spam, force): """Train bayes with an mh directory""" - if loud: print " Reading as MH mailbox" + if loud: + print " Reading as MH mailbox" import glob @@ -331,13 +334,15 @@ h = hammie.open(pck, usedb, "c") for g in good: - if loud: print "Training ham (%s):" % g + if loud: + print "Training ham (%s):" % g train(h, g, 
False, force, trainnew, removetrained) sys.stdout.flush() save = True for s in spam: - if loud: print "Training spam (%s):" % s + if loud: + print "Training spam (%s):" % s train(h, s, True, force, trainnew, removetrained) sys.stdout.flush() save = True Modified: trunk/spambayes/scripts/sb_notesfilter.py =================================================================== --- trunk/spambayes/scripts/sb_notesfilter.py 2008-11-25 02:08:00 UTC (rev 3203) +++ trunk/spambayes/scripts/sb_notesfilter.py 2008-11-25 02:10:28 UTC (rev 3204) @@ -144,14 +144,15 @@ return not not val import sys -from spambayes import tokenizer, storage -from spambayes.Options import options -import cPickle as pickle import errno +import getopt + import win32com.client import pywintypes -import getopt +from spambayes import tokenizer, storage +from spambayes.Options import options +from spambayes.safepickle import pickle_read, pickle_write def classifyInbox(v, vmoveto, bayes, ldbname, notesindex, log): @@ -187,20 +188,18 @@ # probably due to this unicode problem. 
options["Tokenizer", "generate_long_skips"] = False tokens = tokenizer.tokenize(message) - prob, clues = bayes.spamprob(tokens, evidence=True) + prob = bayes.spamprob(tokens) if prob < options["Categorization", "ham_cutoff"]: - disposition = options["Headers", "header_ham_string"] numham += 1 elif prob > options["Categorization", "spam_cutoff"]: - disposition = options["Headers", "header_spam_string"] docstomove += [doc] numspam += 1 else: - disposition = options["Headers", "header_unsure_string"] numuns += 1 notesindex[nid] = 'classified' + subj = message["subject"] try: print "%s spamprob is %s" % (subj[:30], prob) if log: @@ -305,16 +304,13 @@ bayes = storage.open_storage(bdbname, useDBM) try: - fp = open(idxname, 'rb') + notesindex = pickle_read(idxname) except IOError, e: if e.errno != errno.ENOENT: raise notesindex = {} print "%s file not found, this is a first time run" % (idxname,) print "No classification will be performed" - else: - notesindex = pickle.load(fp) - fp.close() need_replicate = False @@ -378,9 +374,7 @@ bayes.store() - fp = open(idxname, 'wb') - pickle.dump(notesindex, fp) - fp.close() + pickle_write(idxname, notesindex) if log: log.LogAction("Finished running spambayes") @@ -390,7 +384,7 @@ try: opts, args = getopt.getopt(sys.argv[1:], 'htcPd:p:l:r:f:o:i:W:L:') except getopt.error, msg: - print >>sys.stderr, str(msg) + '\n\n' + __doc__ + print >> sys.stderr, str(msg) + '\n\n' + __doc__ sys.exit() ldbname = None # local notes database name @@ -405,7 +399,7 @@ for opt, arg in opts: if opt == '-h': - print >>sys.stderr, __doc__ + print >> sys.stderr, __doc__ sys.exit() elif opt == '-l': ldbname = arg @@ -437,9 +431,6 @@ sbfname, doTrain, doClassify, pwd, idxname, logname) if doPrompt: - try: - key = input("Press Enter to end") - except SyntaxError: - pass + raw_input("Press Enter to end ") else: - print >>sys.stderr, __doc__ + print >> sys.stderr, __doc__ Modified: trunk/spambayes/scripts/sb_pop3dnd.py 
=================================================================== --- trunk/spambayes/scripts/sb_pop3dnd.py 2008-11-25 02:08:00 UTC (rev 3203) +++ trunk/spambayes/scripts/sb_pop3dnd.py 2008-11-25 02:10:28 UTC (rev 3204) @@ -67,13 +67,11 @@ import md5 import time import errno -import types import email import thread import getopt import socket import imaplib -import operator import email.Utils try: @@ -85,14 +83,12 @@ import twisted.application.app from twisted.internet import defer from twisted.internet import reactor -from twisted.internet.defer import maybeDeferred from twisted.internet.protocol import ServerFactory from twisted.protocols.imap4 import IMessage -from twisted.protocols.imap4 import parseNestedParens, parseIdList -from twisted.protocols.imap4 import IllegalClientResponse, IAccount -from twisted.protocols.imap4 import collapseNestedLists, MessageSet +from twisted.protocols.imap4 import IAccount +from twisted.protocols.imap4 import MessageSet from twisted.protocols.imap4 import IMAP4Server, MemoryAccount, IMailbox -from twisted.protocols.imap4 import IMailboxListener, collapseNestedLists +from twisted.protocols.imap4 import IMailboxListener from spambayes import storage from spambayes import message @@ -101,7 +97,7 @@ from spambayes.tokenizer import tokenize from spambayes import FileCorpus, Dibbler from spambayes.Version import get_current_version -from sb_server import POP3ProxyBase, State, _addressPortStr, _recreateState +from sb_server import POP3ProxyBase, State, _addressPortStr def ensureDir(dirname): """Ensure that the given directory exists - in other words, if it @@ -538,7 +534,7 @@ class SpambayesInbox(SpambayesMailbox): """A special mailbox that holds status messages from SpamBayes.""" def __init__(self, id, state): - IMAPMailbox.__init__(self, "INBOX", "spambayes", id) + SpambayesMailbox.__init__(self, "INBOX", "spambayes", id) self.mdb = state.mdb self.UID_validity = id self.nextUID = 1 @@ -826,8 +822,8 @@ msg = 
email.message_from_string(messageText, _class=message.SBHeaderMessage) # Now find the spam disposition and add the header. - (prob, clues) = state.bayes.spamprob(msg.tokenize(),\ - evidence=True) + (prob, clues) = state.bayes.spamprob(msg.tokenize(), + evidence=True) # Note that the X-SpamBayes-MailID header will be worthless # because we don't know the message id at this point. It's @@ -870,7 +866,7 @@ message.insert_exception_header(messageText) # Print the exception and a traceback. - print >>sys.stderr, details + print >> sys.stderr, details retval = ok + "\n" + messageText if terminatingDotPresent: retval += '.\r\n' @@ -1009,12 +1005,12 @@ try: opts, args = getopt.getopt(sys.argv[1:], 'ho:') except getopt.error, msg: - print >>sys.stderr, str(msg) + '\n\n' + __doc__ + print >> sys.stderr, str(msg) + '\n\n' + __doc__ sys.exit() for opt, arg in opts: if opt == '-h': - print >>sys.stderr, __doc__ + print >> sys.stderr, __doc__ sys.exit() elif opt == '-o': options.set_from_cmdline(arg, sys.stderr) Modified: trunk/spambayes/scripts/sb_server.py =================================================================== --- trunk/spambayes/scripts/sb_server.py 2008-11-25 02:08:00 UTC (rev 3203) +++ trunk/spambayes/scripts/sb_server.py 2008-11-25 02:10:28 UTC (rev 3204) @@ -102,16 +102,15 @@ o NNTP proxy. 
""" -import os, sys, re, errno, getopt, time, traceback, socket, cStringIO, email +import sys, re, getopt, time, socket, email from thread import start_new_thread -from email.Header import Header import spambayes.message from spambayes import i18n from spambayes import Stats from spambayes import Dibbler from spambayes import storage -from spambayes.FileCorpus import FileCorpus, ExpiryFileCorpus +from spambayes.FileCorpus import ExpiryFileCorpus from spambayes.FileCorpus import FileMessageFactory, GzipFileMessageFactory from spambayes.Options import options, get_pathname_option, _ from spambayes.UserInterface import UserInterfaceServer @@ -191,7 +190,7 @@ except socket.sslerror, why: if why[0] == 1: # error:140770FC:SSL routines:SSL23_GET_SERVER_HELLO:unknown protocol' # Probably not SSL after all. - print >>sys.stderr, "Can't use SSL" + print >> sys.stderr, "Can't use SSL" else: raise else: @@ -367,8 +366,7 @@ raise SystemExit elif verb == 'CRASH': # For testing - x = 0 - y = 1/x + raise ZeroDivisionError self.serverSocket.push(self.request + '\r\n') if self.request.strip() == '': @@ -568,8 +566,8 @@ _class=spambayes.message.SBHeaderMessage) msg.setId(state.getNewMessageName()) # Now find the spam disposition and add the header. - (prob, clues) = state.bayes.spamprob(msg.tokenize(),\ - evidence=True) + (prob, clues) = state.bayes.spamprob(msg.tokenize(), + evidence=True) msg.addSBHeaders(prob, clues) @@ -632,7 +630,7 @@ insert_exception_header(messageText) # Print the exception and a traceback. - print >>sys.stderr, details + print >> sys.stderr, details # Restore the +OK and the POP3 .\r\n terminator if there was one. 
retval = ok + "\n" + messageText @@ -836,7 +834,6 @@ nham = self.bayes.nham if nspam > 10 and nham > 10: db_ratio = nham/float(nspam) - big = small = None if db_ratio > 5.0: self.warning = _("Warning: you have much more ham than " \ "spam - SpamBayes works best with " \ @@ -988,8 +985,6 @@ proxyListeners.append(listener) def _recreateState(): - global state - # Close the existing listeners and create new ones. This won't # affect any running proxies - once a listener has created a proxy, # that proxy is then independent of it. @@ -1057,13 +1052,12 @@ try: opts, args = getopt.getopt(sys.argv[1:], 'hbd:p:l:u:o:') except getopt.error, msg: - print >>sys.stderr, str(msg) + '\n\n' + __doc__ + print >> sys.stderr, str(msg) + '\n\n' + __doc__ sys.exit() - runSelfTest = False for opt, arg in opts: if opt == '-h': - print >>sys.stderr, __doc__ + print >> sys.stderr, __doc__ sys.exit() elif opt == '-b': state.launchUI = True @@ -1096,14 +1090,14 @@ try: prepare() except AlreadyRunningException: - print >>sys.stderr, \ + print >> sys.stderr, \ "ERROR: The proxy is already running on this machine." 
- print >>sys.stderr, "Please stop the existing proxy and try again" + print >> sys.stderr, "Please stop the existing proxy and try again" return start() else: - print >>sys.stderr, __doc__ + print >> sys.stderr, __doc__ if __name__ == '__main__': run() Modified: trunk/spambayes/scripts/sb_upload.py =================================================================== --- trunk/spambayes/scripts/sb_upload.py 2008-11-25 02:08:00 UTC (rev 3203) +++ trunk/spambayes/scripts/sb_upload.py 2008-11-25 02:10:28 UTC (rev 3204) @@ -58,7 +58,7 @@ h.putheader('content-length', str(len(body))) h.endheaders() h.send(body) - errcode, errmsg, headers = h.getreply() + h.getreply() return h.file.read() def encode_multipart_formdata(fields, files): @@ -153,7 +153,7 @@ ("text", "")], [("file", "message.dat", data)]) else: - post_multipart("%s:%d" % (server,port), "/upload", [], + post_multipart("%s:%d" % (server, port), "/upload", [], [('file', 'message.dat', data)]) except: # not an error if the server isn't responding Modified: trunk/spambayes/scripts/sb_xmlrpcserver.py =================================================================== --- trunk/spambayes/scripts/sb_xmlrpcserver.py 2008-11-25 02:08:00 UTC (rev 3203) +++ trunk/spambayes/scripts/sb_xmlrpcserver.py 2008-11-25 02:10:28 UTC (rev 3204) @@ -22,10 +22,8 @@ Port number to listen to. """ -import os import getopt import sys -import traceback import xmlrpclib import SimpleXMLRPCServer Modified: trunk/spambayes/setup.py =================================================================== --- trunk/spambayes/setup.py 2008-11-25 02:08:00 UTC (rev 3203) +++ trunk/spambayes/setup.py 2008-11-25 02:10:28 UTC (rev 3204) @@ -1,8 +1,10 @@ #!/usr/bin/env python import os +import sys -import sys +from setuptools import setup, find_packages + if sys.version < '2.2': print "Error: Python version too old. You need at least Python 2.2 to use this package." 
print "(you're running version %s)"%sys.version @@ -121,6 +123,9 @@ author = "the spambayes project", author_email = "spambayes at python.org", url = "http://spambayes.sourceforge.net", + install_requires = ["lockfile>=0.2", + "pydns>=2.0"], +## packages=find_packages("spambayes"), cmdclass = {'install_scripts': install_scripts, 'sdist': sdist, }, Modified: trunk/spambayes/spambayes/ImapUI.py =================================================================== --- trunk/spambayes/spambayes/ImapUI.py 2008-11-25 02:08:00 UTC (rev 3203) +++ trunk/spambayes/spambayes/ImapUI.py 2008-11-25 02:10:28 UTC (rev 3204) @@ -43,7 +43,7 @@ import cgi -import UserInterface +from spambayes import UserInterface from spambayes.Options import options, optionsPathname, _ # These are the options that will be offered on the configuration page. Modified: trunk/spambayes/spambayes/Options.py =================================================================== --- trunk/spambayes/spambayes/Options.py 2008-11-25 02:08:00 UTC (rev 3203) +++ trunk/spambayes/spambayes/Options.py 2008-11-25 02:10:28 UTC (rev 3204) @@ -33,7 +33,7 @@ __all__ = ['options', '_'] # Grab the stuff from the core options class. -from OptionsClass import * +from spambayes.OptionsClass import * # A little magic. We'd like to use ZODB as the default storage, # because we've had so many problems with bsddb, and we'd like to swap @@ -199,6 +199,12 @@ reasons if your corpora are from different sources."""), BOOLEAN, RESTORE), + ("x-mine_nntp_headers", _("Mine NNTP-Posting-Host headers"), False, + _("""Usenet is host to a lot of spam. Usenet/Mailing list gateways + can let it leak across. Similar to mining received headers, we pick + apart the IP address or host name in this header for clues."""), + BOOLEAN, RESTORE), + ("address_headers", _("Address headers to mine"), ("from", "to", "cc", "sender", "reply-to"), _("""Mine the following address headers. 
If you have mixed source @@ -603,8 +609,8 @@ ("persistent_use_database", _("Database backend"), DB_TYPE[0], _("""SpamBayes can use either a ZODB or dbm database (quick to score one message) or a pickle (quick to train on huge amounts of messages). - There is also (currently experimental) the ability to use a mySQL or - PostgrepSQL database."""), + There is also (experimental) ability to use a mySQL or PostgresSQL + database."""), ("zeo", "zodb", "cdb", "mysql", "pgsql", "dbm", "pickle"), RESTORE), ("persistent_storage_file", _("Storage file name"), DB_TYPE[1], @@ -1360,7 +1366,7 @@ # in the current directory, and no win32 extensions installed # to locate the "user" directory - seeing things are so lamely # setup, it is worth printing a warning - print >>sys.stderr, "NOTE: We can not locate an INI file " \ + print >> sys.stderr, "NOTE: We can not locate an INI file " \ "for SpamBayes, and the Python for Windows extensions " \ "are not installed, meaning we can't locate your " \ "'user' directory. An empty configuration file at " \ Modified: trunk/spambayes/spambayes/Version.py =================================================================== --- trunk/spambayes/spambayes/Version.py 2008-11-25 02:08:00 UTC (rev 3203) +++ trunk/spambayes/spambayes/Version.py 2008-11-25 02:10:28 UTC (rev 3204) @@ -10,8 +10,8 @@ will generate the "ConfigParser" version for the web. """ -import string, re -from types import StringType +import sys +import re try: _ @@ -22,8 +22,8 @@ # A reason for why the spambayes.org URL fails is given in a comment there. #LATEST_VERSION_HOME="http://www.spambayes.org/download/Version.cfg" # The SF URL instead works for Tim and xenogeist. 
-LATEST_VERSION_HOME="http://spambayes.sourceforge.net/download/Version.cfg" -DEFAULT_DOWNLOAD_PAGE="http://spambayes.sourceforge.net/windows.html" +LATEST_VERSION_HOME = "http://spambayes.sourceforge.net/download/Version.cfg" +DEFAULT_DOWNLOAD_PAGE = "http://spambayes.sourceforge.net/windows.html" # This module is part of the spambayes project, which is Copyright 2002-2007 # The Python Software Foundation and is covered by the Python Software @@ -66,7 +66,7 @@ # and massage it into a string format that will compare properly # in update checks. try: - ver_num = float(version) + float(version) # Version converted successfully to a float, which means it # may be an old-format version number. Old convention was to # use 1.01 to represent "1.0.1", so check to see if there is @@ -86,7 +86,8 @@ def get_download_page(app = None, version_dict = None): - if version_dict is None: version_dict = versions + if version_dict is None: + version_dict = versions dict = version_dict # default to top level dictionary if app is not None: # attempt to get a sub-dict for the specific app @@ -185,21 +186,21 @@ releaselevel = "final" serial = 0 else: - serial = string.atoi(prerelease_num) + serial = int(prerelease_num) if prerelease == "a": releaselevel = "alpha" elif prerelease == "b": releaselevel = "beta" elif prerelease == "rc": releaselevel = "candidate" - self.version_info = tuple(map(string.atoi, [major, minor, patch]) + \ + self.version_info = tuple(map(int, [major, minor, patch]) + \ [releaselevel, serial]) def __str__(self): if self.version_info[2] == 0: - vstring = string.join(map(str, self.version_info[0:2]), '.') + vstring = '.'.join(map(str, self.version_info[0:2])) else: - vstring = string.join(map(str, self.version_info[0:3]), '.') + vstring = '.'.join(map(str, self.version_info[0:3])) releaselevel = self.version_info[3][0] if releaselevel != 'f': @@ -214,13 +215,14 @@ return vstring def __cmp__(self, other): - if isinstance(other, StringType): + if isinstance(other, str): 
other = SBVersion(other) return cmp(self.version_info, other.version_info) def get_long_version(self, app_name = None): - if app_name is None: app_name = "SpamBayes" + if app_name is None: + app_name = "SpamBayes" return _("%s Version %s (%s)") % (app_name, str(self), self.date) #============================================================================ @@ -268,7 +270,7 @@ ret_dict = {} apps_dict = ret_dict["Apps"] = {} for sect in cfg.sections(): - if sect=="SpamBayes": + if sect == "SpamBayes": target_dict = ret_dict else: target_dict = apps_dict.setdefault(sect, {}) @@ -348,7 +350,6 @@ _make_compatible_cfg_section(stream, appname, ver, versions["Apps"][appname]) def main(args): - import sys if '-g' in args: make_cfg(sys.stdout) sys.exit(0) @@ -370,6 +371,5 @@ print print "Latest version:", v_latest.get_long_version() -if __name__=='__main__': - import sys +if __name__ == '__main__': main(sys.argv) Modified: trunk/spambayes/spambayes/__init__.py =================================================================== --- trunk/spambayes/spambayes/__init__.py 2008-11-25 02:08:00 UTC (rev 3203) +++ trunk/spambayes/spambayes/__init__.py 2008-11-25 02:10:28 UTC (rev 3204) @@ -1,4 +1,4 @@ # package marker. 
-__version__ = "1.1a4" -__date__ = "June 25, 2007" +__version__ = "1.1b1" +__date__ = "November 23, 2008" Modified: trunk/spambayes/spambayes/chi2.py =================================================================== --- trunk/spambayes/spambayes/chi2.py 2008-11-25 02:08:00 UTC (rev 3203) +++ trunk/spambayes/spambayes/chi2.py 2008-11-25 02:10:28 UTC (rev 3204) @@ -1,4 +1,5 @@ import math as _math +import random try: True, False @@ -106,7 +107,7 @@ def __init__(self, baserandom=random.random, tabsize=513): self.baserandom = baserandom self.n = tabsize - self.tab = [baserandom() for i in range(tabsize)] + self.tab = [baserandom() for _i in range(tabsize)] self.next = baserandom() def random(self): @@ -150,8 +151,8 @@ s = Hist(20, lo=0.0, hi=1.0) score = Hist(20, lo=0.0, hi=1.0) - for i in range(5000): - ps = [random() for j in range(50)] + for _i in xrange(5000): + ps = [random() for _j in xrange(50)] s1, h1, score1 = judge(ps + [bias] * warp) s.add(s1) h.add(h1) @@ -203,5 +204,4 @@ print "(S-H+1)/2", score if __name__ == '__main__': - import random main() Modified: trunk/spambayes/spambayes/optimize.py =================================================================== --- trunk/spambayes/spambayes/optimize.py 2008-11-25 02:08:00 UTC (rev 3203) +++ trunk/spambayes/spambayes/optimize.py 2008-11-25 02:10:28 UTC (rev 3204) @@ -4,6 +4,8 @@ # Optimize any parametric function. # import copy + +# XXX Numeric is obsolete. Replace with numpy. 
import Numeric def SimplexMaximize(var, err, func, convcrit = 0.001, minerr = 0.001): @@ -30,7 +32,7 @@ if abs(value[bi] - value[wi]) <= convcrit: return simplex[bi] # Calculate average of non-worst - ave=Numeric.zeros(len(var), 'd') + ave = Numeric.zeros(len(var), 'd') for i in range(len(simplex)): if i != wi: ave = ave + simplex[i] Modified: trunk/spambayes/spambayes/storage.py =================================================================== --- trunk/spambayes/spambayes/storage.py 2008-11-25 02:08:00 UTC (rev 3203) +++ trunk/spambayes/spambayes/storage.py 2008-11-25 02:10:28 UTC (rev 3204) @@ -50,8 +50,8 @@ ### situations prints to sys.stdout will garble the message (e.g., in ### hammiefilter). -__author__ = "Neale Pickett , \ -Tim Stone " +__author__ = ("Neale Pickett ," + "Tim Stone ") __credits__ = "All the spambayes contributors." try: @@ -69,11 +69,11 @@ import tempfile from spambayes import classifier from spambayes.Options import options, get_pathname_option -import cPickle as pickle import errno import shelve from spambayes import cdb from spambayes import dbmstorage +from spambayes.safepickle import pickle_write # Make shelve use binary pickles by default. oldShelvePickler = shelve.Pickler @@ -85,36 +85,6 @@ NO_UPDATEPROBS = False # Probabilities will not be autoupdated with training UPDATEPROBS = True # Probabilities will be autoupdated with training -def safe_pickle(filename, value, protocol=0): - '''Store value as a pickle without creating corruption''' - - # Be as defensive as possible. Always keep a safe copy. - tmp = filename + '.tmp' - fp = None - try: - fp = open(tmp, 'wb') - pickle.dump(value, fp, protocol) - fp.close() - except IOError, e: - if options["globals", "verbose"]: - print >> sys.stderr, 'Failed update: ' + str(e) - if fp is not None: - os.remove(tmp) - raise - try: - # With *nix we can just rename, and (as long as permissions - # are correct) the old file will vanish. 
With win32, this - # won't work - the Python help says that there may not be - # a way to do an atomic replace, so we rename the old one, - # put the new one there, and then delete the old one. If - # something goes wrong, there is at least a copy of the old - # one. - os.rename(tmp, filename) - except OSError: - os.rename(filename, filename + '.bak') - os.rename(tmp, filename) - os.remove(filename + '.bak') - class PickledClassifier(classifier.Classifier): '''Classifier object persisted in a pickle''' @@ -136,16 +106,12 @@ # tempbayes object is reclaimed when load() returns. if options["globals", "verbose"]: - print >> sys.stderr, 'Loading state from',self.db_name,'pickle' + print >> sys.stderr, 'Loading state from', self.db_name, 'pickle' - tempbayes = None try: - fp = open(self.db_name, 'rb') - except IOError, e: - if e.errno != errno.ENOENT: raise - else: - tempbayes = pickle.load(fp) - fp.close() + tempbayes = pickle_read(self.db_name) + except: + tempbayes = None if tempbayes: # Copy state from tempbayes. The use of our base-class @@ -169,9 +135,9 @@ '''Store self as a pickle''' if options["globals", "verbose"]: - print >> sys.stderr, 'Persisting',self.db_name,'as a pickle' + print >> sys.stderr, 'Persisting', self.db_name, 'as a pickle' - safe_pickle(self.db_name, self, PICKLE_TYPE) + pickle_write(self.db_name, self, PICKLE_TYPE) def close(self): # we keep no resources open - nothing to do @@ -198,7 +164,8 @@ def close(self): # Close our underlying database. Better not assume all databases # have close functions! - def noop(): pass + def noop(): + pass getattr(self.db, "close", noop)() getattr(self.dbm, "close", noop)() # should not be a need to drop the 'dbm' or 'db' attributes. 
@@ -210,13 +177,13 @@ if hasattr(self, "dbm"): del self.dbm if options["globals", "verbose"]: - print >> sys.stderr, 'Closed',self.db_name,'database' + print >> sys.stderr, 'Closed', self.db_name, 'database' def load(self): '''Load state from database''' if options["globals", "verbose"]: - print >> sys.stderr, 'Loading state from',self.db_name,'database' + print >> sys.stderr, 'Loading state from', self.db_name, 'database' self.dbm = dbmstorage.open(self.db_name, self.mode) self.db = shelve.Shelf(self.dbm) @@ -244,7 +211,8 @@ '''Place state into persistent store''' if options["globals", "verbose"]: - print >> sys.stderr, 'Persisting',self.db_name,'state in database' + print >> sys.stderr, 'Persisting', self.db_name, + print >> sys.stderr, 'state in database' # Iterate over our changed word list. # This is *not* thread-safe - another thread changing our @@ -471,7 +439,7 @@ def fetchall(self, c): return c.dictfetchall() - def commit(self, c): + def commit(self, _c): self.db.commit() def load(self): @@ -480,7 +448,7 @@ import psycopg if options["globals", "verbose"]: - print >> sys.stderr, 'Loading state from',self.db_name,'database' + print >> sys.stderr, 'Loading state from', self.db_name, 'database' self.db = psycopg.connect('dbname=' + self.db_name) @@ -545,7 +513,7 @@ def fetchall(self, c): return c.fetchall() - def commit(self, c): + def commit(self, _c): self.db.commit() def load(self): @@ -554,7 +522,7 @@ import MySQLdb if options["globals", "verbose"]: - print >> sys.stderr, 'Loading state from',self.db_name,'database' + print >> sys.stderr, 'Loading state from', self.db_name, 'database' params = { 'host': self.host, 'db': self.db_name, @@ -724,12 +692,11 @@ object.__setattr__(self, att, value) def create_storage(self): - import ZODB from ZODB.FileStorage import FileStorage try: self.storage = FileStorage(self.db_filename, read_only=self.mode=='r') - except IOError, msg: + except IOError: print >> sys.stderr, ("Could not create FileStorage from", 
self.db_filename) raise @@ -768,7 +735,6 @@ def store(self): '''Place state into persistent store''' try: - import ZODB import ZODB.Transaction except ImportError: import transaction @@ -971,7 +937,7 @@ '''Untrain the database with the message''' if options["globals", "verbose"]: - print >> sys.stderr, 'untraining with',message.key() + print >> sys.stderr, 'untraining with', message.key() self.bayes.unlearn(message.tokenize(), self.is_spam) # self.updateprobs) @@ -1005,6 +971,7 @@ class NoSuchClassifierError(Exception): def __init__(self, invalid_name): + Exception.__init__(self, invalid_name) self.invalid_name = invalid_name def __str__(self): return repr(self.invalid_name) @@ -1088,7 +1055,7 @@ try: unused, unused, is_path = _storage_types[typ] except KeyError: - raise NoSuchClassifierError(db_type) + raise NoSuchClassifierError(typ) if is_path: nm = get_pathname_option(*default_name) else: @@ -1142,7 +1109,7 @@ try: os.mkdir(dirname) if options["globals", "verbose"]: - print >>sys.stderr, "Creating directory", dirname + print >> sys.stderr, "Creating directory", dirname except OSError, e: if e.errno != errno.EEXIST: raise Modified: trunk/spambayes/spambayes/tokenizer.py =================================================================== --- trunk/spambayes/spambayes/tokenizer.py 2008-11-25 02:08:00 UTC (rev 3203) +++ trunk/spambayes/spambayes/tokenizer.py 2008-11-25 02:10:28 UTC (rev 3204) @@ -14,6 +14,8 @@ import binascii import urlparse import urllib +import socket + try: # We have three possibilities for Set: # (a) With Python 2.2 and earlier, we use our compatsets class @@ -39,7 +41,7 @@ try: - import dnscache + from spambayes import dnscache cache = dnscache.cache(cachefile=options["Tokenizer", "lookup_ip_cache"]) cache.printStatsAtEnd = False except (IOError, ImportError): @@ -681,6 +683,8 @@ # by m19.grp.scd.yahoo.com with QMQP; 19 Dec 2003 04:06:53 -0000 received_ip_re = re.compile(r'[[(]((\d{1,3}\.?){4})[])]') +received_nntp_ip_re = 
re.compile(r'((\d{1,3}\.?){4})') + message_id_re = re.compile(r'\s*<[^@]+@([^>]+)>\s*') # I'm usually just splitting on whitespace, but for subject lines I want to @@ -1084,19 +1088,12 @@ scheme, netloc, path, params, query, frag = urlparse.urlparse(url) if cache is not None and options["Tokenizer", "x-lookup_ip"]: - ips=cache.lookup(netloc) + ips = cache.lookup(netloc) if not ips: pushclue("url-ip:lookup error") else: - for ip in ips: # Should we limit to one A record? - pushclue("url-ip:%s/32" % ip) - dottedQuadList=ip.split(".") - pushclue("url-ip:%s/8" % dottedQuadList[0]) - pushclue("url-ip:%s.%s/16" % (dottedQuadList[0], - dottedQuadList[1])) - pushclue("url-ip:%s.%s.%s/24" % (dottedQuadList[0], - dottedQuadList[1], - dottedQuadList[2])) + for clue in gen_dotted_quad_clues("url-ip", ips): + pushclue(clue) # one common technique in bogus "please (re-)authorize yourself" # scams is to make it appear as if you're visiting a valid @@ -1526,6 +1523,13 @@ for tok in breakdown(m.group(1)): yield 'received:' + tok + # Lots of spam gets posted on Usenet. If it is then gatewayed to a + # mailing list perhaps the NNTP-Posting-Host info will yield some + # useful clues. + if options["Tokenizer", "x-mine_nntp_headers"]: + for clue in mine_nntp(msg): + yield clue + # Message-Id: This seems to be a small win and should not # adversely affect a mixed source corpus so it's always enabled. 
msgid = msg.get("message-id", "") @@ -1698,5 +1702,56 @@ for t in self.tokenize_text(text): yield t +def mine_nntp(msg): + nntp_headers = msg.get_all("nntp-posting-host", ()) + yield "has-nntp:%s" % not not nntp_headers + for header in nntp_headers: + try: + address = header.split()[1] + except IndexError: + continue + if received_nntp_ip_re.match(address): + for clue in gen_dotted_quad_clues("nntp-host", [address]): + yield clue + try: + h = socket.gethostbyaddr(address) + except socket.herror: + yield 'nntp-host-ip:has-no-reverse' + else: + yield 'nntp-host-ip:has-reverse' + yield 'nntp-host-name:%s' % h[0] + yield ('nntp-host-domain:%s' % + '.'.join(h[0].split('.')[-2:])) + else: + # assume it's a hostname + name = address + yield 'nntp-host-name:%s' % name + yield ('nntp-host-domain:%s' % + '.'.join(name.split('.')[-2:])) + try: + address = socket.gethostbyname(name) + except socket.gaierror: + yield 'nntp-host-name:invalid' + else: + for clue in gen_dotted_quad_clues("nntp-host-ip", [address]): + yield clue + try: + h = socket.gethostbyaddr(address) + except socket.herror: + yield 'nntp-host-ip:has-no-reverse' + else: + yield 'nntp-host-ip:has-reverse' + +def gen_dotted_quad_clues(pfx, ips): + for ip in ips: + yield "%s:%s/32" % (pfx, ip) + dottedQuadList = ip.split(".") + yield "%s:%s/8" % (pfx, dottedQuadList[0]) + yield "%s:%s.%s/16" % (pfx, dottedQuadList[0], + dottedQuadList[1]) + yield "%s:%s.%s.%s/24" % (pfx, dottedQuadList[0], + dottedQuadList[1], + dottedQuadList[2]) + global_tokenizer = Tokenizer() tokenize = global_tokenizer.tokenize Modified: trunk/spambayes/testtools/es2hs.py =================================================================== --- trunk/spambayes/testtools/es2hs.py 2008-11-25 02:08:00 UTC (rev 3203) +++ trunk/spambayes/testtools/es2hs.py 2008-11-25 02:10:28 UTC (rev 3204) @@ -43,8 +43,6 @@ def main(): """Main program; parse options and go.""" - global loud - everything = None spam = [] @@ -71,7 +69,8 @@ spamsizes = {} for s in 
spam: - if loud: print "Scanning spamdir (%s):" % s + if loud: + print "Scanning spamdir (%s):" % s files = os.listdir(s) for f in files: if f[0] in ('1', '2', '3', '4', '5', '6', '7', '8', '9'): @@ -85,7 +84,8 @@ os.makedirs(spamdir) os.makedirs(hamdir) - if loud: print "Scanning everything" + if loud: + print "Scanning everything" for f in os.listdir(everything): if f[0] in ('1', '2', '3', '4', '5', '6', '7', '8', '9'): name = os.path.join(everything, f) Modified: trunk/spambayes/testtools/incremental.py =================================================================== --- trunk/spambayes/testtools/incremental.py 2008-11-25 02:08:00 UTC (rev 3203) +++ trunk/spambayes/testtools/incremental.py 2008-11-25 02:10:28 UTC (rev 3204) @@ -30,7 +30,7 @@ from spambayes import msgs import email from email import Message -import regimes +from testtools import regimes try: True, False @@ -76,10 +76,10 @@ # The number of test instances correctly and incorrectly classified. self.nham_right = 0 self.nham_wrong = 0 - self.nham_unsure = 0; + self.nham_unsure = 0 self.nspam_right = 0 self.nspam_wrong = 0 - self.nspam_unsure = 0; + self.nspam_unsure = 0 # Lists of bad predictions. self.ham_wrong_examples = [] # False positives: ham called spam. 
Modified: trunk/spambayes/utilities/HistToGNU.py =================================================================== --- trunk/spambayes/utilities/HistToGNU.py 2008-11-25 02:08:00 UTC (rev 3203) +++ trunk/spambayes/utilities/HistToGNU.py 2008-11-25 02:10:28 UTC (rev 3204) @@ -18,14 +18,13 @@ set xrange [0.0:100.0] """ -dataSetOptions="smooth unique" +dataSetOptions = "smooth unique" -from spambayes.Options import options -from spambayes.TestDriver import Hist - import sys -import cPickle as pickle +from spambayes.Options import options +from spambayes.safepickle import pickle_read, pickle_write + program = sys.argv[0] def usage(code, msg=''): @@ -38,7 +37,7 @@ def loadHist(path): """Load the histogram pickle object""" - return pickle.load(file(path)) + return pickle_read(path) def outputHist(hist, f=sys.stdout): """Output the Hist object to file f""" @@ -49,7 +48,7 @@ def plot(files): """given a list of files, create gnu-plot file""" - import cStringIO, os + import cStringIO cmd = cStringIO.StringIO() cmd.write(globalOptions) args = [] Modified: trunk/spambayes/utilities/convert_config_file.py =================================================================== --- trunk/spambayes/utilities/convert_config_file.py 2008-11-25 02:08:00 UTC (rev 3203) +++ trunk/spambayes/utilities/convert_config_file.py 2008-11-25 02:10:28 UTC (rev 3204) @@ -45,7 +45,7 @@ try: opts, args = getopt.getopt(sys.argv[1:], 'vhf:') except getopt.error, msg: - print >>sys.stderr, str(msg) + '\n\n' + __doc__ + print >> sys.stderr, str(msg) + '\n\n' + __doc__ sys.exit() filename = "bayescustomize.ini" @@ -53,7 +53,7 @@ for opt, arg in opts: if opt == '-h': - print >>sys.stderr, __doc__ + print >> sys.stderr, __doc__ sys.exit() elif opt == '-f': filename = arg @@ -64,7 +64,6 @@ if verbose: print "Loading defaults" o.load_defaults() - alts = [] if verbose: print "Updating file:", filename if os.path.exists(filename): Modified: trunk/spambayes/utilities/convert_db.py 
=================================================================== --- trunk/spambayes/utilities/convert_db.py 2008-11-25 02:08:00 UTC (rev 3203) +++ trunk/spambayes/utilities/convert_db.py 2008-11-25 02:10:28 UTC (rev 3204) @@ -41,13 +41,13 @@ try: opts, args = getopt.getopt(sys.argv[1:], 'ht:T:n:N:') except getopt.error, msg: - print >>sys.stderr, str(msg) + '\n\n' + __doc__ + print >> sys.stderr, str(msg) + '\n\n' + __doc__ sys.exit() old_name = old_type = new_name = new_type = None for opt, arg in opts: if opt == '-h': - print >>sys.stderr, __doc__ + print >> sys.stderr, __doc__ sys.exit() elif opt == '-t': old_type = arg Modified: trunk/spambayes/utilities/extractmessages.py =================================================================== --- trunk/spambayes/utilities/extractmessages.py 2008-11-25 02:08:00 UTC (rev 3203) +++ trunk/spambayes/utilities/extractmessages.py 2008-11-25 02:10:28 UTC (rev 3204) @@ -23,11 +23,11 @@ import sys import getopt import re -import cPickle as pickle import locale from email.Header import make_header, decode_header from spambayes.mboxutils import getmbox +from spambayes.safepickle import pickle_read, pickle_write prog = sys.argv[0] @@ -115,7 +115,7 @@ return 1 try: - mapd = pickle.load(file(mapfile)) + mapd = pickle_read(mapfile) except IOError: usage("Mapfile %s does not exist" % mapfile) return 1 Modified: trunk/spambayes/utilities/hammer.py =================================================================== --- trunk/spambayes/utilities/hammer.py 2008-11-25 02:08:00 UTC (rev 3203) +++ trunk/spambayes/utilities/hammer.py 2008-11-25 02:10:28 UTC (rev 3204) @@ -3,7 +3,7 @@ # Part of the SpamBayes project. 
Released under the Python Software # Foundation license; see http://www.python.org/ -import os, sys, re, random, textwrap +import os, re, random, textwrap from spambayes import storage from spambayes import tokenizer Modified: trunk/spambayes/utilities/loosecksum.py =================================================================== --- trunk/spambayes/utilities/loosecksum.py 2008-11-25 02:08:00 UTC (rev 3203) +++ trunk/spambayes/utilities/loosecksum.py 2008-11-25 02:10:28 UTC (rev 3204) @@ -26,10 +26,8 @@ import getopt import sys -import email.Parser import md5 import re -import time import binascii from spambayes.mboxutils import getmbox Modified: trunk/spambayes/utilities/mboxcount.py =================================================================== --- trunk/spambayes/utilities/mboxcount.py 2008-11-25 02:08:00 UTC (rev 3203) +++ trunk/spambayes/utilities/mboxcount.py 2008-11-25 02:10:28 UTC (rev 3204) @@ -29,7 +29,6 @@ import sys import mailbox -import email import getopt import glob Modified: trunk/spambayes/utilities/mkreversemap.py =================================================================== --- trunk/spambayes/utilities/mkreversemap.py 2008-11-25 02:08:00 UTC (rev 3203) +++ trunk/spambayes/utilities/mkreversemap.py 2008-11-25 02:10:28 UTC (rev 3204) @@ -17,13 +17,12 @@ import sys import getopt -import anydbm -import cPickle as pickle from spambayes.mboxutils import getmbox from spambayes.tokenizer import tokenize from spambayes.Options import options from spambayes.classifier import Classifier +from spambayes.safepickle import pickle_read, pickle_write prog = sys.argv[0] @@ -99,13 +98,13 @@ return 1 try: - mapd = pickle.load(file(mapfile)) + mapd = pickle_read(mapfile) except IOError: mapd = {} for f in args: mapmessages(f, mboxtype, mapd) - pickle.dump(mapd, file(mapfile, "w")) + pickle_write(mapfile, mapd) if __name__ == "__main__": sys.exit(main(sys.argv[1:])) Modified: trunk/spambayes/utilities/split.py 
=================================================================== --- trunk/spambayes/utilities/split.py 2008-11-25 02:08:00 UTC (rev 3203) +++ trunk/spambayes/utilities/split.py 2008-11-25 02:10:28 UTC (rev 3204) @@ -28,7 +28,6 @@ import sys import random import mailbox -import email import getopt from spambayes import mboxutils Modified: trunk/spambayes/utilities/splitn.py =================================================================== --- trunk/spambayes/utilities/splitn.py 2008-11-25 02:08:00 UTC (rev 3203) +++ trunk/spambayes/utilities/splitn.py 2008-11-25 02:10:28 UTC (rev 3204) @@ -42,7 +42,6 @@ import sys import random import mailbox -import email import getopt from spambayes import mboxutils Modified: trunk/spambayes/utilities/splitndirs.py =================================================================== --- trunk/spambayes/utilities/splitndirs.py 2008-11-25 02:08:00 UTC (rev 3203) +++ trunk/spambayes/utilities/splitndirs.py 2008-11-25 02:10:28 UTC (rev 3204) @@ -47,8 +47,6 @@ import sys import os import random -import mailbox -import email import getopt import glob Modified: trunk/spambayes/windows/autoconfigure.py =================================================================== --- trunk/spambayes/windows/autoconfigure.py 2008-11-25 02:08:00 UTC (rev 3203) +++ trunk/spambayes/windows/autoconfigure.py 2008-11-25 02:10:28 UTC (rev 3204) @@ -496,7 +496,6 @@ results = [] for filename in os.listdir(config_location): if filename.lower().startswith("pop") or filename.lower().startswith("smt"): - full_filename = os.path.join(config_location, filename) working_filename = "%s.tmp" % (filename, ) shutil.copyfile(filename, working_filename) c = OptionsClass.OptionsClass() Modified: trunk/spambayes/windows/pop3proxy_service.py =================================================================== --- trunk/spambayes/windows/pop3proxy_service.py 2008-11-25 02:08:00 UTC (rev 3203) +++ trunk/spambayes/windows/pop3proxy_service.py 2008-11-25 02:10:28 UTC (rev 
3204) @@ -138,7 +138,7 @@ try: # module imported by service manager, or 2.3 (in which __main__ # exists, *and* sys.argv[0] is always already absolute) - this_filename=__file__ + this_filename = __file__ except NameError: this_filename = sys.argv[0] if not os.path.isabs(sys.argv[0]): This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. From montanaro at users.sourceforge.net Tue Nov 25 04:37:57 2008 From: montanaro at users.sourceforge.net (montanaro at users.sourceforge.net) Date: Tue, 25 Nov 2008 03:37:57 +0000 Subject: [Spambayes-checkins] SF.net SVN: spambayes:[3207] trunk/spambayes/utilities Message-ID: Revision: 3207 http://spambayes.svn.sourceforge.net/spambayes/?rev=3207&view=rev Author: montanaro Date: 2008-11-25 03:37:57 +0000 (Tue, 25 Nov 2008) Log Message: ----------- use safepickle functions, pylint nits Modified Paths: -------------- trunk/spambayes/utilities/HistToGNU.py trunk/spambayes/utilities/convert_config_file.py trunk/spambayes/utilities/convert_db.py trunk/spambayes/utilities/extractmessages.py trunk/spambayes/utilities/hammer.py trunk/spambayes/utilities/loosecksum.py trunk/spambayes/utilities/mboxcount.py trunk/spambayes/utilities/mkreversemap.py trunk/spambayes/utilities/split.py trunk/spambayes/utilities/splitn.py trunk/spambayes/utilities/splitndirs.py Modified: trunk/spambayes/utilities/HistToGNU.py =================================================================== --- trunk/spambayes/utilities/HistToGNU.py 2008-11-25 03:29:21 UTC (rev 3206) +++ trunk/spambayes/utilities/HistToGNU.py 2008-11-25 03:37:57 UTC (rev 3207) @@ -18,14 +18,13 @@ set xrange [0.0:100.0] """ -dataSetOptions="smooth unique" +dataSetOptions = "smooth unique" -from spambayes.Options import options -from spambayes.TestDriver import Hist - import sys -import cPickle as pickle +from spambayes.Options import options +from spambayes.safepickle import pickle_read, pickle_write + program = sys.argv[0] def 
usage(code, msg=''): @@ -38,7 +37,7 @@ def loadHist(path): """Load the histogram pickle object""" - return pickle.load(file(path)) + return pickle_read(path) def outputHist(hist, f=sys.stdout): """Output the Hist object to file f""" @@ -49,7 +48,7 @@ def plot(files): """given a list of files, create gnu-plot file""" - import cStringIO, os + import cStringIO cmd = cStringIO.StringIO() cmd.write(globalOptions) args = [] Modified: trunk/spambayes/utilities/convert_config_file.py =================================================================== --- trunk/spambayes/utilities/convert_config_file.py 2008-11-25 03:29:21 UTC (rev 3206) +++ trunk/spambayes/utilities/convert_config_file.py 2008-11-25 03:37:57 UTC (rev 3207) @@ -45,7 +45,7 @@ try: opts, args = getopt.getopt(sys.argv[1:], 'vhf:') except getopt.error, msg: - print >>sys.stderr, str(msg) + '\n\n' + __doc__ + print >> sys.stderr, str(msg) + '\n\n' + __doc__ sys.exit() filename = "bayescustomize.ini" @@ -53,7 +53,7 @@ for opt, arg in opts: if opt == '-h': - print >>sys.stderr, __doc__ + print >> sys.stderr, __doc__ sys.exit() elif opt == '-f': filename = arg @@ -64,7 +64,6 @@ if verbose: print "Loading defaults" o.load_defaults() - alts = [] if verbose: print "Updating file:", filename if os.path.exists(filename): Modified: trunk/spambayes/utilities/convert_db.py =================================================================== --- trunk/spambayes/utilities/convert_db.py 2008-11-25 03:29:21 UTC (rev 3206) +++ trunk/spambayes/utilities/convert_db.py 2008-11-25 03:37:57 UTC (rev 3207) @@ -41,13 +41,13 @@ try: opts, args = getopt.getopt(sys.argv[1:], 'ht:T:n:N:') except getopt.error, msg: - print >>sys.stderr, str(msg) + '\n\n' + __doc__ + print >> sys.stderr, str(msg) + '\n\n' + __doc__ sys.exit() old_name = old_type = new_name = new_type = None for opt, arg in opts: if opt == '-h': - print >>sys.stderr, __doc__ + print >> sys.stderr, __doc__ sys.exit() elif opt == '-t': old_type = arg Modified: 
trunk/spambayes/utilities/extractmessages.py =================================================================== --- trunk/spambayes/utilities/extractmessages.py 2008-11-25 03:29:21 UTC (rev 3206) +++ trunk/spambayes/utilities/extractmessages.py 2008-11-25 03:37:57 UTC (rev 3207) @@ -23,11 +23,11 @@ import sys import getopt import re -import cPickle as pickle import locale from email.Header import make_header, decode_header from spambayes.mboxutils import getmbox +from spambayes.safepickle import pickle_read, pickle_write prog = sys.argv[0] @@ -115,7 +115,7 @@ return 1 try: - mapd = pickle.load(file(mapfile)) + mapd = pickle_read(mapfile) except IOError: usage("Mapfile %s does not exist" % mapfile) return 1 Modified: trunk/spambayes/utilities/hammer.py =================================================================== --- trunk/spambayes/utilities/hammer.py 2008-11-25 03:29:21 UTC (rev 3206) +++ trunk/spambayes/utilities/hammer.py 2008-11-25 03:37:57 UTC (rev 3207) @@ -3,7 +3,7 @@ # Part of the SpamBayes project. 
Released under the Python Software # Foundation license; see http://www.python.org/ -import os, sys, re, random, textwrap +import os, re, random, textwrap from spambayes import storage from spambayes import tokenizer Modified: trunk/spambayes/utilities/loosecksum.py =================================================================== --- trunk/spambayes/utilities/loosecksum.py 2008-11-25 03:29:21 UTC (rev 3206) +++ trunk/spambayes/utilities/loosecksum.py 2008-11-25 03:37:57 UTC (rev 3207) @@ -26,10 +26,8 @@ import getopt import sys -import email.Parser import md5 import re -import time import binascii from spambayes.mboxutils import getmbox Modified: trunk/spambayes/utilities/mboxcount.py =================================================================== --- trunk/spambayes/utilities/mboxcount.py 2008-11-25 03:29:21 UTC (rev 3206) +++ trunk/spambayes/utilities/mboxcount.py 2008-11-25 03:37:57 UTC (rev 3207) @@ -29,7 +29,6 @@ import sys import mailbox -import email import getopt import glob Modified: trunk/spambayes/utilities/mkreversemap.py =================================================================== --- trunk/spambayes/utilities/mkreversemap.py 2008-11-25 03:29:21 UTC (rev 3206) +++ trunk/spambayes/utilities/mkreversemap.py 2008-11-25 03:37:57 UTC (rev 3207) @@ -17,13 +17,12 @@ import sys import getopt -import anydbm -import cPickle as pickle from spambayes.mboxutils import getmbox from spambayes.tokenizer import tokenize from spambayes.Options import options from spambayes.classifier import Classifier +from spambayes.safepickle import pickle_read, pickle_write prog = sys.argv[0] @@ -99,13 +98,13 @@ return 1 try: - mapd = pickle.load(file(mapfile)) + mapd = pickle_read(mapfile) except IOError: mapd = {} for f in args: mapmessages(f, mboxtype, mapd) - pickle.dump(mapd, file(mapfile, "w")) + pickle_write(mapfile, mapd) if __name__ == "__main__": sys.exit(main(sys.argv[1:])) Modified: trunk/spambayes/utilities/split.py 
=================================================================== --- trunk/spambayes/utilities/split.py 2008-11-25 03:29:21 UTC (rev 3206) +++ trunk/spambayes/utilities/split.py 2008-11-25 03:37:57 UTC (rev 3207) @@ -28,7 +28,6 @@ import sys import random import mailbox -import email import getopt from spambayes import mboxutils Modified: trunk/spambayes/utilities/splitn.py =================================================================== --- trunk/spambayes/utilities/splitn.py 2008-11-25 03:29:21 UTC (rev 3206) +++ trunk/spambayes/utilities/splitn.py 2008-11-25 03:37:57 UTC (rev 3207) @@ -42,7 +42,6 @@ import sys import random import mailbox -import email import getopt from spambayes import mboxutils Modified: trunk/spambayes/utilities/splitndirs.py =================================================================== --- trunk/spambayes/utilities/splitndirs.py 2008-11-25 03:29:21 UTC (rev 3206) +++ trunk/spambayes/utilities/splitndirs.py 2008-11-25 03:37:57 UTC (rev 3207) @@ -47,8 +47,6 @@ import sys import os import random -import mailbox -import email import getopt import glob This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. 
From montanaro at users.sourceforge.net Tue Nov 25 04:39:02 2008 From: montanaro at users.sourceforge.net (montanaro at users.sourceforge.net) Date: Tue, 25 Nov 2008 03:39:02 +0000 Subject: [Spambayes-checkins] SF.net SVN: spambayes:[3208] trunk/spambayes/testtools Message-ID: Revision: 3208 http://spambayes.svn.sourceforge.net/spambayes/?rev=3208&view=rev Author: montanaro Date: 2008-11-25 03:39:02 +0000 (Tue, 25 Nov 2008) Log Message: ----------- pylint nits Modified Paths: -------------- trunk/spambayes/testtools/es2hs.py trunk/spambayes/testtools/incremental.py Modified: trunk/spambayes/testtools/es2hs.py =================================================================== --- trunk/spambayes/testtools/es2hs.py 2008-11-25 03:37:57 UTC (rev 3207) +++ trunk/spambayes/testtools/es2hs.py 2008-11-25 03:39:02 UTC (rev 3208) @@ -43,8 +43,6 @@ def main(): """Main program; parse options and go.""" - global loud - everything = None spam = [] @@ -71,7 +69,8 @@ spamsizes = {} for s in spam: - if loud: print "Scanning spamdir (%s):" % s + if loud: + print "Scanning spamdir (%s):" % s files = os.listdir(s) for f in files: if f[0] in ('1', '2', '3', '4', '5', '6', '7', '8', '9'): @@ -85,7 +84,8 @@ os.makedirs(spamdir) os.makedirs(hamdir) - if loud: print "Scanning everything" + if loud: + print "Scanning everything" for f in os.listdir(everything): if f[0] in ('1', '2', '3', '4', '5', '6', '7', '8', '9'): name = os.path.join(everything, f) Modified: trunk/spambayes/testtools/incremental.py =================================================================== --- trunk/spambayes/testtools/incremental.py 2008-11-25 03:37:57 UTC (rev 3207) +++ trunk/spambayes/testtools/incremental.py 2008-11-25 03:39:02 UTC (rev 3208) @@ -30,7 +30,7 @@ from spambayes import msgs import email from email import Message -import regimes +from testtools import regimes try: True, False @@ -76,10 +76,10 @@ # The number of test instances correctly and incorrectly classified. 
self.nham_right = 0 self.nham_wrong = 0 - self.nham_unsure = 0; + self.nham_unsure = 0 self.nspam_right = 0 self.nspam_wrong = 0 - self.nspam_unsure = 0; + self.nspam_unsure = 0 # Lists of bad predictions. self.ham_wrong_examples = [] # False positives: ham called spam. This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. From montanaro at users.sourceforge.net Tue Nov 25 04:39:36 2008 From: montanaro at users.sourceforge.net (montanaro at users.sourceforge.net) Date: Tue, 25 Nov 2008 03:39:36 +0000 Subject: [Spambayes-checkins] SF.net SVN: spambayes:[3209] trunk/spambayes/windows Message-ID: Revision: 3209 http://spambayes.svn.sourceforge.net/spambayes/?rev=3209&view=rev Author: montanaro Date: 2008-11-25 03:39:36 +0000 (Tue, 25 Nov 2008) Log Message: ----------- pylint nits Modified Paths: -------------- trunk/spambayes/windows/autoconfigure.py trunk/spambayes/windows/pop3proxy_service.py Modified: trunk/spambayes/windows/autoconfigure.py =================================================================== --- trunk/spambayes/windows/autoconfigure.py 2008-11-25 03:39:02 UTC (rev 3208) +++ trunk/spambayes/windows/autoconfigure.py 2008-11-25 03:39:36 UTC (rev 3209) @@ -496,7 +496,6 @@ results = [] for filename in os.listdir(config_location): if filename.lower().startswith("pop") or filename.lower().startswith("smt"): - full_filename = os.path.join(config_location, filename) working_filename = "%s.tmp" % (filename, ) shutil.copyfile(filename, working_filename) c = OptionsClass.OptionsClass() Modified: trunk/spambayes/windows/pop3proxy_service.py =================================================================== --- trunk/spambayes/windows/pop3proxy_service.py 2008-11-25 03:39:02 UTC (rev 3208) +++ trunk/spambayes/windows/pop3proxy_service.py 2008-11-25 03:39:36 UTC (rev 3209) @@ -138,7 +138,7 @@ try: # module imported by service manager, or 2.3 (in which __main__ # exists, *and* sys.argv[0] is 
always already absolute) - this_filename=__file__ + this_filename = __file__ except NameError: this_filename = sys.argv[0] if not os.path.isabs(sys.argv[0]): This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. From montanaro at users.sourceforge.net Tue Nov 25 04:40:34 2008 From: montanaro at users.sourceforge.net (montanaro at users.sourceforge.net) Date: Tue, 25 Nov 2008 03:40:34 +0000 Subject: [Spambayes-checkins] SF.net SVN: spambayes:[3210] trunk/spambayes/pspam Message-ID: Revision: 3210 http://spambayes.svn.sourceforge.net/spambayes/?rev=3210&view=rev Author: montanaro Date: 2008-11-25 03:40:34 +0000 (Tue, 25 Nov 2008) Log Message: ----------- pylint nits Modified Paths: -------------- trunk/spambayes/pspam/pop.py trunk/spambayes/pspam/scoremsg.py trunk/spambayes/pspam/update.py Modified: trunk/spambayes/pspam/pop.py =================================================================== --- trunk/spambayes/pspam/pop.py 2008-11-25 03:39:36 UTC (rev 3209) +++ trunk/spambayes/pspam/pop.py 2008-11-25 03:40:34 UTC (rev 3210) @@ -27,7 +27,6 @@ """ import SocketServer -import asyncore try: import cStringIO as StringIO except ImportError: @@ -37,11 +36,8 @@ import re import socket import sys -import threading import time -import ZODB -from ZEO.ClientStorage import ClientStorage import zLOG from spambayes.tokenizer import tokenize Modified: trunk/spambayes/pspam/scoremsg.py =================================================================== --- trunk/spambayes/pspam/scoremsg.py 2008-11-25 03:39:36 UTC (rev 3209) +++ trunk/spambayes/pspam/scoremsg.py 2008-11-25 03:40:34 UTC (rev 3210) @@ -6,11 +6,7 @@ import locale from types import UnicodeType -import ZODB -from ZEO.ClientStorage import ClientStorage - import pspam.database -from spambayes.Options import options from spambayes.tokenizer import tokenize try: Modified: trunk/spambayes/pspam/update.py 
=================================================================== --- trunk/spambayes/pspam/update.py 2008-11-25 03:39:36 UTC (rev 3209) +++ trunk/spambayes/pspam/update.py 2008-11-25 03:40:34 UTC (rev 3210) @@ -2,9 +2,6 @@ import os import sys -import ZODB -from ZEO.ClientStorage import ClientStorage - import pspam.database from pspam.profile import Profile This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. From montanaro at users.sourceforge.net Tue Nov 25 04:45:40 2008 From: montanaro at users.sourceforge.net (montanaro at users.sourceforge.net) Date: Tue, 25 Nov 2008 03:45:40 +0000 Subject: [Spambayes-checkins] SF.net SVN: spambayes:[3211] trunk/spambayes/utilities/loosecksum.py Message-ID: Revision: 3211 http://spambayes.svn.sourceforge.net/spambayes/?rev=3211&view=rev Author: montanaro Date: 2008-11-25 03:45:39 +0000 (Tue, 25 Nov 2008) Log Message: ----------- missed hashlib change Modified Paths: -------------- trunk/spambayes/utilities/loosecksum.py Modified: trunk/spambayes/utilities/loosecksum.py =================================================================== --- trunk/spambayes/utilities/loosecksum.py 2008-11-25 03:40:34 UTC (rev 3210) +++ trunk/spambayes/utilities/loosecksum.py 2008-11-25 03:45:39 UTC (rev 3211) @@ -26,7 +26,10 @@ import getopt import sys -import md5 +try: + from hashlib import md5 +except ImportError: + from md5 import new as md5 import re import binascii @@ -77,7 +80,7 @@ sum = [] for i in range(4): chunk = "\n".join(lines[i*chunksize:(i+1)*chunksize]) - sum.append(binascii.b2a_hex(md5.new(chunk).digest())) + sum.append(binascii.b2a_hex(md5(chunk).digest())) return ".".join(sum) This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. 
From montanaro at users.sourceforge.net Tue Nov 25 04:50:08 2008 From: montanaro at users.sourceforge.net (montanaro at users.sourceforge.net) Date: Tue, 25 Nov 2008 03:50:08 +0000 Subject: [Spambayes-checkins] SF.net SVN: spambayes:[3212] trunk/spambayes/contrib Message-ID: Revision: 3212 http://spambayes.svn.sourceforge.net/spambayes/?rev=3212&view=rev Author: montanaro Date: 2008-11-25 03:50:08 +0000 (Tue, 25 Nov 2008) Log Message: ----------- pylint nits, use safepickle functions Modified Paths: -------------- trunk/spambayes/contrib/SmarterHTTPServer.py trunk/spambayes/contrib/bulkgraph.py trunk/spambayes/contrib/findbest.py trunk/spambayes/contrib/mod_spambayes.py trunk/spambayes/contrib/nway.py trunk/spambayes/contrib/pycksum.py trunk/spambayes/contrib/sb_culler.py trunk/spambayes/contrib/spamcounts.py trunk/spambayes/contrib/tte.py Modified: trunk/spambayes/contrib/SmarterHTTPServer.py =================================================================== --- trunk/spambayes/contrib/SmarterHTTPServer.py 2008-11-25 03:45:39 UTC (rev 3211) +++ trunk/spambayes/contrib/SmarterHTTPServer.py 2008-11-25 03:50:08 UTC (rev 3212) @@ -20,7 +20,6 @@ import SimpleHTTPServer import urllib import cgi -import shutil import mimetypes import re try: Modified: trunk/spambayes/contrib/bulkgraph.py =================================================================== --- trunk/spambayes/contrib/bulkgraph.py 2008-11-25 03:45:39 UTC (rev 3211) +++ trunk/spambayes/contrib/bulkgraph.py 2008-11-25 03:50:08 UTC (rev 3212) @@ -32,15 +32,15 @@ quiet mode; no output """ -import mboxutils import getopt -import hammie import sys import os import re import time import filecmp +from spambayes import mboxutils, hammie + program = sys.argv[0] loud = True day = 24 * 60 * 60 Modified: trunk/spambayes/contrib/findbest.py =================================================================== --- trunk/spambayes/contrib/findbest.py 2008-11-25 03:45:39 UTC (rev 3211) +++ trunk/spambayes/contrib/findbest.py 
2008-11-25 03:50:08 UTC (rev 3212) @@ -66,7 +66,6 @@ import sys import os -import cPickle as pickle import getopt import math @@ -75,6 +74,8 @@ from spambayes.hammie import Hammie from spambayes.tokenizer import tokenize from spambayes.Options import options +from spambayes import storage +from spambayes.safepickle import pickle_read, pickle_write cls = Classifier() h = Hammie(cls) @@ -98,7 +99,6 @@ def score(unsure, h, cls, scores, msgids=None, skipspam=False): """See what effect on others each msg in unsure has""" - ham_cutoff = options["Categorization", "ham_cutoff"] spam_cutoff = options["Categorization", "spam_cutoff"] # compute a base - number of messages in unsure already in the @@ -223,7 +223,7 @@ print "scoring" if best: - last_scores = pickle.load(file(bestfile)) + last_scores = pickle_read(bestfile) last_scores = last_scores.items() last_scores.sort() msgids = set() @@ -240,7 +240,7 @@ pass if not best: - pickle.dump(scores, file(bestfile, 'w')) + pickle_write(bestfile, scores) return 0 Modified: trunk/spambayes/contrib/mod_spambayes.py =================================================================== --- trunk/spambayes/contrib/mod_spambayes.py 2008-11-25 03:45:39 UTC (rev 3211) +++ trunk/spambayes/contrib/mod_spambayes.py 2008-11-25 03:50:08 UTC (rev 3212) @@ -5,26 +5,24 @@ ## Author: Skip Montanaro ## -import os - from proxy3_filter import * import proxy3_options -from spambayes import hammie, Options, mboxutils +from spambayes import hammie, Options dbf = Options.get_pathname_option("Storage", "persistent_storage_file") class SpambayesFilter(BufferAllFilter): - hammie = hammie.open(dbf, 1, 'r') + checker = hammie.open(dbf, 1, 'r') def filter(self, s): if self.reply.split()[1] == '200': - prob = self.hammie.score("%s\r\n%s" % (self.serverheaders, s)) + prob = self.checker.score("%s\r\n%s" % (self.serverheaders, s)) print "| prob: %.5f" % prob if prob >= Options.options["Categorization", "spam_cutoff"]: print self.serverheaders print "text:", 
s[0:40], "...", s[-40:] return "not authorized" - return s + return s from proxy3_util import * Modified: trunk/spambayes/contrib/nway.py =================================================================== --- trunk/spambayes/contrib/nway.py 2008-11-25 03:45:39 UTC (rev 3211) +++ trunk/spambayes/contrib/nway.py 2008-11-25 03:50:08 UTC (rev 3212) @@ -70,7 +70,7 @@ prog = os.path.basename(sys.argv[0]) -def help(): +def usage(): print >> sys.stderr, __doc__ % globals() def main(args): @@ -78,10 +78,9 @@ for opt, arg in opts: if opt == '-h': - help() + usage() return 0 - tagdb_list = [] msg = mboxutils.get_message(sys.stdin) try: del msg["X-Spambayes-Classification"] Modified: trunk/spambayes/contrib/pycksum.py =================================================================== --- trunk/spambayes/contrib/pycksum.py 2008-11-25 03:45:39 UTC (rev 3211) +++ trunk/spambayes/contrib/pycksum.py 2008-11-25 03:50:08 UTC (rev 3212) @@ -39,7 +39,10 @@ import sys import email.Parser import email.generator -import md5 +try: + from hashlib import md5 +except ImportError: + from md5 import new as md5 import anydbm import re import time @@ -97,12 +100,12 @@ body = text.split("\n\n", 1)[1] lines = clean(body).split("\n") chunksize = len(lines)//4+1 - sum = [] + digest = [] for i in range(4): chunk = "\n".join(lines[i*chunksize:(i+1)*chunksize]) - sum.append(md5.new(chunk).hexdigest()) + digest.append(md5(chunk).hexdigest()) - return ".".join(sum) + return ".".join(digest) def save_checksum(cksum, f): pieces = cksum.split('.') @@ -118,12 +121,12 @@ if not db.has_key(subsum): db[subsum] = str(time.time()) if len(db) > maxdblen: - items = [(float(db[k]),k) for k in db.keys()] + items = [(float(db[k]), k) for k in db.keys()] items.sort() # the -20 brings us down a bit below the max so we aren't # constantly running this chunk of code items = items[:-(maxdblen-20)] - for v,k in items: + for v, k in items: del db[k] else: result = 0 Modified: trunk/spambayes/contrib/sb_culler.py 
=================================================================== --- trunk/spambayes/contrib/sb_culler.py 2008-11-25 03:45:39 UTC (rev 3211) +++ trunk/spambayes/contrib/sb_culler.py 2008-11-25 03:50:08 UTC (rev 3212) @@ -30,20 +30,23 @@ This program requires Python 2.3 or newer. """ -import sets, traceback, md5, os +import socket +socket.setdefaulttimeout(10) + +import traceback, md5, os import poplib import posixpath + +import sets from email import Header, Utils from spambayes import mboxutils, hammie +from spambayes.Options import options -import socket -socket.setdefaulttimeout(10) - DO_ACTIONS = 1 VERBOSE_LEVEL = 1 APPEND_TO_FILE = "append_to_file" -DELETE = "delete" +DELETE_FROM_MAILBOX = "delete" KEEP_IN_MAILBOX = "keep in mailbox" SPAM = "spam" VIRUS = "virus" @@ -108,7 +111,7 @@ def DELETE(mi, log): """Action: delete message from mailbox""" - log.do_action(DELETE) + log.do_action(DELETE_FROM_MAILBOX) if not DO_ACTIONS: return mi.mailbox.dele(mi.i) Modified: trunk/spambayes/contrib/spamcounts.py =================================================================== --- trunk/spambayes/contrib/spamcounts.py 2008-11-25 03:45:39 UTC (rev 3211) +++ trunk/spambayes/contrib/spamcounts.py 2008-11-25 03:50:08 UTC (rev 3212) @@ -19,13 +19,11 @@ import getopt import re import sets -import os -import shelve import csv -from spambayes.Options import options, get_pathname_option +from spambayes.Options import options from spambayes.tokenizer import tokenize -from spambayes.storage import STATE_KEY, database_type, open_storage +from spambayes.storage import database_type, open_storage prog = sys.argv[0] Modified: trunk/spambayes/contrib/tte.py =================================================================== --- trunk/spambayes/contrib/tte.py 2008-11-25 03:45:39 UTC (rev 3211) +++ trunk/spambayes/contrib/tte.py 2008-11-25 03:50:08 UTC (rev 3212) @@ -100,7 +100,7 @@ def train(store, hambox, spambox, maxmsgs, maxrounds, tdict, reverse, verbose, ratio): - smisses = 
hmisses = round = 0 + round = 0 ham_cutoff = Options.options["Categorization", "ham_cutoff"] spam_cutoff = Options.options["Categorization", "spam_cutoff"] @@ -114,19 +114,19 @@ hambone_ = list(reversed(hambone_)) spamcan_ = list(reversed(spamcan_)) - nspam,nham = len(spamcan_),len(hambone_) + nspam, nham = len(spamcan_), len(hambone_) if ratio: - rspam,rham = ratio + rspam, rham = ratio # If the actual ratio of spam to ham in the database is better than # what was asked for, use that better ratio. if (rspam > rham) == (rspam * nham > rham * nspam): - rspam,rham = nspam,nham + rspam, rham = nspam, nham # define some indexing constants ham = 0 spam = 1 name = ('ham','spam') - misses = [0,0] + misses = [0, 0] misclassified = lambda is_spam, score: ( is_spam and score < spam_cutoff or not is_spam and score > ham_cutoff) @@ -140,9 +140,9 @@ hambone = iter(hambone_) spamcan = iter(spamcan_) - i = [0,0] + i = [0, 0] msgs_processed = 0 - misses = [0,0] + misses = [0, 0] training_sets = [hambone, spamcan] while not maxmsgs or msgs_processed < maxmsgs: @@ -153,7 +153,7 @@ try: train_msg = training_sets[train_spam].next() except StopIteration: - break; + break i[train_spam] += 1 msgs_processed += 1 @@ -164,7 +164,7 @@ score = store.spamprob(tokens) selector = train_msg["message-id"] or train_msg["subject"] - if misclassified(train_spam,score) and selector is not None: + if misclassified(train_spam, score) and selector is not None: if verbose: print >> sys.stderr, "\tmiss %s: %.6f %s" % ( name[train_spam], score, selector) @@ -179,24 +179,25 @@ print "\rround: %2d, msgs: %4d, ham misses: %3d, spam misses: %3d, %.1fs" % \ (round, msgs_processed, misses[0], misses[1], seconds) - training_sets = [hambone,spamcan] + training_sets = [hambone, spamcan] # We count all untrained messages so the user knows what was skipped. 
# We also tag them for saving so we don't lose messages which might have # value in a future run - for is_spam in ham,spam: + for is_spam in ham, spam: nleft = 0 try: while True: msg = training_sets[is_spam].next() score = store.spamprob(tokenize(msg)) - if misclassified(is_spam,score): + if misclassified(is_spam, score): tdict[msg["message-id"]] = True nleft += 1 except StopIteration: - if nleft: print nleft, "untrained %ss" % name[is_spam] + if nleft: + print nleft, "untrained %ss" % name[is_spam] def cull(mbox_name, cullext, designation, tdict): print "writing new %s mbox..." % designation This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. From montanaro at users.sourceforge.net Tue Nov 25 04:56:07 2008 From: montanaro at users.sourceforge.net (montanaro at users.sourceforge.net) Date: Tue, 25 Nov 2008 03:56:07 +0000 Subject: [Spambayes-checkins] SF.net SVN: spambayes:[3213] trunk/spambayes/scripts/sb_bnfilter.py Message-ID: Revision: 3213 http://spambayes.svn.sourceforge.net/spambayes/?rev=3213&view=rev Author: montanaro Date: 2008-11-25 03:56:06 +0000 (Tue, 25 Nov 2008) Log Message: ----------- reset refused_count when forking server. pylint nits. 
Modified Paths: -------------- trunk/spambayes/scripts/sb_bnfilter.py Modified: trunk/spambayes/scripts/sb_bnfilter.py =================================================================== --- trunk/spambayes/scripts/sb_bnfilter.py 2008-11-25 03:50:08 UTC (rev 3212) +++ trunk/spambayes/scripts/sb_bnfilter.py 2008-11-25 03:56:06 UTC (rev 3213) @@ -157,40 +157,44 @@ if error: sys.exit(error) -def make_socket(server_options, file): +def make_socket(server_options, filename): refused_count = 0 no_server_count = 0 while 1: try: - s = socket.socket(socket.AF_UNIX,socket.SOCK_STREAM) - s.connect(file) + s = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) + s.connect(filename) except socket.error,e: if e[0] == errno.EAGAIN: # baaah pass - elif e[0] == errno.ENOENT or not os.path.exists(file): - # We need to check os.path.exists for use on operating systems that - # never return ENOENT; linux 2.2. + elif e[0] == errno.ENOENT or not os.path.exists(filename): + # We need to check os.path.exists for use on operating + # systems that never return ENOENT; linux 2.2. # # no such file.... no such server. create one. no_server_count += 1 - if no_server_count>4: + if no_server_count > 4: raise + # Reset refused count to start the sleep process over. + # Otherwise we run the risk of waiting a *really* long time + # and/or hitting the refused_count limit. + refused_count = 0 fork_server(server_options) elif e[0] == errno.ECONNREFUSED: # socket file exists but noone listening. refused_count += 1 - if refused_count == 6: + if refused_count == 4: # We have been waiting ages and still havent been able # to connect. Maybe that socket file has got # orphaned. remove it, wait, and try again. 
We need to # allow enough time for sb_bnserver to initialise the # rest of spambayes try: - os.unlink(file) + os.unlink(filename) except EnvironmentError: pass - elif refused_count>6: + elif refused_count > 6: raise else: raise # some other problem @@ -212,9 +216,9 @@ os.setsid() # Use exec rather than import here because eventually it may be nice to # reimplement this one file in C - os.execv(sys.executable,[sys.executable, - os.path.join(os.path.split(sys.argv[0])[0], - 'sb_bnserver.py') ]+options) + os.execv(sys.executable, [sys.executable, + os.path.join(os.path.split(sys.argv[0])[0], + 'sb_bnserver.py') ]+options) # should never get here sys._exit(1) This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. From montanaro at users.sourceforge.net Tue Nov 25 05:01:34 2008 From: montanaro at users.sourceforge.net (montanaro at users.sourceforge.net) Date: Tue, 25 Nov 2008 04:01:34 +0000 Subject: [Spambayes-checkins] SF.net SVN: spambayes:[3214] trunk/spambayes/scripts Message-ID: Revision: 3214 http://spambayes.svn.sourceforge.net/spambayes/?rev=3214&view=rev Author: montanaro Date: 2008-11-25 04:01:34 +0000 (Tue, 25 Nov 2008) Log Message: ----------- pylint nits, use hashlib, use safepickle functions Modified Paths: -------------- trunk/spambayes/scripts/core_server.py trunk/spambayes/scripts/sb_bnserver.py trunk/spambayes/scripts/sb_client.py trunk/spambayes/scripts/sb_dbexpimp.py trunk/spambayes/scripts/sb_imapfilter.py trunk/spambayes/scripts/sb_mailsort.py trunk/spambayes/scripts/sb_mboxtrain.py trunk/spambayes/scripts/sb_notesfilter.py trunk/spambayes/scripts/sb_pop3dnd.py trunk/spambayes/scripts/sb_server.py trunk/spambayes/scripts/sb_upload.py trunk/spambayes/scripts/sb_xmlrpcserver.py Modified: trunk/spambayes/scripts/core_server.py =================================================================== --- trunk/spambayes/scripts/core_server.py 2008-11-25 03:56:06 UTC (rev 3213) +++ 
trunk/spambayes/scripts/core_server.py 2008-11-25 04:01:34 UTC (rev 3214) @@ -94,11 +94,9 @@ o Graphs. Of something. Who cares what? """ -import sys, getopt, time -from email.Header import Header +import sys, getopt from spambayes import Dibbler -from spambayes import storage from spambayes.Options import options, _ from spambayes.UserInterface import UserInterfaceServer from spambayes.Version import get_current_version Modified: trunk/spambayes/scripts/sb_bnserver.py =================================================================== --- trunk/spambayes/scripts/sb_bnserver.py 2008-11-25 03:56:06 UTC (rev 3213) +++ trunk/spambayes/scripts/sb_bnserver.py 2008-11-25 04:01:34 UTC (rev 3214) @@ -27,7 +27,7 @@ unix domain socket used on which we listen """ -import os, getopt, sys, SocketServer, time, traceback, select, socket, errno +import os, getopt, sys, SocketServer, traceback, select, socket, errno try: True, False @@ -63,7 +63,7 @@ try: server = BNServer(args[0], BNRequest) except socket.error,e: - if e[0]==errno.EADDRINUSE: + if e[0] == errno.EADDRINUSE: pass # in use, no need else: raise # a real error @@ -108,7 +108,7 @@ pass def get_request(self): - r,w,e = select.select([self.socket], [], [], self.timeout) + r, w, e = select.select([self.socket], [], [], self.timeout) if r: return self.socket.accept() else: @@ -119,15 +119,16 @@ switches = self.rfile.readline() body = self.rfile.read() try: - response = self._calc_response(switches,body) + response = self._calc_response(switches, body) self.wfile.write('0\n%d\n'%(len(response),)) self.wfile.write(response) except: - response = traceback.format_exception_only(sys.exc_info()[0],sys.exc_info()[1])[0] + response = traceback.format_exception_only(sys.exc_info()[0], + sys.exc_info()[1])[0] self.wfile.write('1\n%d\n'%(len(response),)) self.wfile.write(response) - def _calc_response(self,switches,body): + def _calc_response(self, switches, body): switches = switches.split() actions = [] opts, args = 
getopt.getopt(switches, 'fgstGS') Modified: trunk/spambayes/scripts/sb_client.py =================================================================== --- trunk/spambayes/scripts/sb_client.py 2008-11-25 03:56:06 UTC (rev 3213) +++ trunk/spambayes/scripts/sb_client.py 2008-11-25 04:01:34 UTC (rev 3214) @@ -10,7 +10,7 @@ import xmlrpclib import sys -RPCBASE="http://localhost:65000" +RPCBASE = "http://localhost:65000" def main(): msg = sys.stdin.read() Modified: trunk/spambayes/scripts/sb_dbexpimp.py =================================================================== --- trunk/spambayes/scripts/sb_dbexpimp.py 2008-11-25 03:56:06 UTC (rev 3213) +++ trunk/spambayes/scripts/sb_dbexpimp.py 2008-11-25 04:01:34 UTC (rev 3214) @@ -101,8 +101,7 @@ import spambayes.storage from spambayes.Options import options -import sys, os, getopt, errno, re -import urllib +import sys, os, getopt, errno from types import UnicodeType def uquote(s): @@ -137,8 +136,8 @@ writer = csv.writer(fp) - nham = bayes.nham; - nspam = bayes.nspam; + nham = bayes.nham + nspam = bayes.nspam print "Exporting database %s to file %s" % (dbFN, outFN) print "Database has %s ham, %s spam, and %s words" \ @@ -215,7 +214,7 @@ try: opts, args = getopt.getopt(sys.argv[1:], 'iehmvd:p:f:o:') except getopt.error, msg: - print >>sys.stderr, str(msg) + '\n\n' + __doc__ + print >> sys.stderr, str(msg) + '\n\n' + __doc__ sys.exit() useDBM = "pickle" @@ -227,7 +226,7 @@ for opt, arg in opts: if opt == '-h': - print >>sys.stderr, __doc__ + print >> sys.stderr, __doc__ sys.exit() elif opt == '-f': flatFN = arg @@ -247,4 +246,4 @@ if imp: runImport(dbFN, useDBM, newDBM, flatFN) else: - print >>sys.stderr, __doc__ + print >> sys.stderr, __doc__ Modified: trunk/spambayes/scripts/sb_imapfilter.py =================================================================== --- trunk/spambayes/scripts/sb_imapfilter.py 2008-11-25 03:56:06 UTC (rev 3213) +++ trunk/spambayes/scripts/sb_imapfilter.py 2008-11-25 04:01:34 UTC (rev 3214) @@ -110,7 
+110,7 @@ else: temp_dir = win32api.GetTempPath() status = "Log file opened in " + temp_dir - for i in range(3,0,-1): + for i in range(3, 0, -1): try: os.unlink(os.path.join(temp_dir, "SpamBayesIMAP%d.log" % (i+1))) except os.error: @@ -132,20 +132,15 @@ import getopt import types import thread -import traceback import email import email.Parser from getpass import getpass from email.Utils import parsedate -try: - import cStringIO as StringIO -except ImportError: - import StringIO from spambayes import Stats from spambayes import message -from spambayes.Options import options, get_pathname_option, optionsPathname -from spambayes import tokenizer, storage, Dibbler +from spambayes.Options import options, optionsPathname +from spambayes import storage, Dibbler from spambayes.UserInterface import UserInterfaceServer from spambayes.ImapUI import IMAPUserInterface, LoginFailure @@ -177,7 +172,6 @@ timeout = 60 # seconds def __init__(self, server, debug=0, do_expunge = options["imap", "expunge"] ): - if server.find(':') > -1: server, port = server.split(':', 1) port = int(port) @@ -494,7 +488,7 @@ class IMAPMessage(message.SBHeaderMessage): def __init__(self): - message.Message.__init__(self) + message.SBHeaderMessage.__init__(self) self.folder = None self.previous_folder = None self.rfc822_command = "(BODY.PEEK[])" @@ -548,7 +542,7 @@ # Can't select the folder, so getting the substance will not # work. self.could_not_retrieve = True - print >>sys.stderr, "Could not select folder %s for message " \ + print >> sys.stderr, "Could not select folder %s for message " \ "%s (uid %s)" % (self.folder.name, self.id, self.uid) return self @@ -571,7 +565,7 @@ # characters for classification. For now, we just carry on, # warning the user and ignoring the message. 
self.could_not_retrieve = True - print >>sys.stderr, "MemoryError with message %s (uid %s)" % \ + print >> sys.stderr, "MemoryError with message %s (uid %s)" % \ (self.id, self.uid) return self @@ -614,7 +608,7 @@ self.got_substance = True # Print the exception and a traceback. - print >>sys.stderr, details + print >> sys.stderr, details return self @@ -660,7 +654,7 @@ We can't actually update the message with IMAP, so what we do is create a new message and delete the old one.""" - assert self.folder is not None,\ + assert self.folder is not None, \ "Can't save a message that doesn't have a folder." assert self.id, "Can't save a message that doesn't have an id." assert self.imap_server, "Can't do anything without IMAP connection." @@ -733,7 +727,8 @@ data = self.imap_server.check_response("recent", response) if data[0] is not None: if options["globals", "verbose"]: - print "[imapfilter] found saved message %s in iteration" % self.uid, i + print "[imapfilter] found saved message", self.uid, + print "in iteration", i break else: if options["globals", "verbose"]: @@ -963,7 +958,7 @@ cls = msg.GetClassification() if cls is None or hamfolder is not None: if options["globals", "verbose"]: - print "[imapfilter] classified as %s:"%cls, msg.uid + print "[imapfilter] classified as %s:" % cls, msg.uid msg = msg.get_full_message() if msg.could_not_retrieve: @@ -1140,13 +1135,13 @@ for u in usernames: pwds.append(getpass("Enter password for %s:" % (u,))) - return zip(servers,usernames,pwds) + return zip(servers, usernames, pwds) def run(force_UI=False): try: opts, args = getopt.getopt(sys.argv[1:], 'hbPtcvl:e:i:d:p:o:') except getopt.error, msg: - print >>sys.stderr, str(msg) + '\n\n' + __doc__ + print >> sys.stderr, str(msg) + '\n\n' + __doc__ sys.exit() doTrain = False @@ -1159,7 +1154,7 @@ for opt, arg in opts: if opt == '-h': - print >>sys.stderr, __doc__ + print >> sys.stderr, __doc__ sys.exit() elif opt == "-b": launchUI = True @@ -1248,7 +1243,7 @@ IMAPSession, 
stats=stats, close_db=close_db, change_db=change_db)) - launchBrowser=launchUI or options["html_ui", "launch_browser"] + launchBrowser = launchUI or options["html_ui", "launch_browser"] if sleepTime: # Run in a separate thread, as we have more work to do. thread.start_new_thread(Dibbler.run, (), Modified: trunk/spambayes/scripts/sb_mailsort.py =================================================================== --- trunk/spambayes/scripts/sb_mailsort.py 2008-11-25 03:56:06 UTC (rev 3213) +++ trunk/spambayes/scripts/sb_mailsort.py 2008-11-25 04:01:34 UTC (rev 3214) @@ -25,7 +25,7 @@ import time import signal import socket -import email +import errno DB_FILE = os.path.expanduser(DB_FILE) @@ -65,7 +65,7 @@ try: fd = os.open(pathname, os.O_WRONLY|os.O_CREAT|os.O_EXCL, 0600) except IOError, exc: - if exc[i] not in (errno.EINT, errno.EEXIST): + if exc[0] not in (errno.EINT, errno.EEXIST): raise else: break @@ -142,7 +142,7 @@ prob, evidence = bayes.spamprob(tokenize(msg), evidence=True) print msg_name, prob for word, prob in evidence: - print ' ', `word`, prob + print ' ', repr(word), prob def main(): global DB_FILE, CONFIG_FILE Modified: trunk/spambayes/scripts/sb_mboxtrain.py =================================================================== --- trunk/spambayes/scripts/sb_mboxtrain.py 2008-11-25 03:56:06 UTC (rev 3213) +++ trunk/spambayes/scripts/sb_mboxtrain.py 2008-11-25 04:01:34 UTC (rev 3214) @@ -111,7 +111,8 @@ def maildir_train(h, path, is_spam, force, removetrained): """Train bayes with all messages from a maildir.""" - if loud: print " Reading %s as Maildir" % (path,) + if loud: + print " Reading %s as Maildir" % (path,) import time import socket @@ -162,7 +163,8 @@ def mbox_train(h, path, is_spam, force): """Train bayes with a Unix mbox""" - if loud: print " Reading as Unix mbox" + if loud: + print " Reading as Unix mbox" import mailbox import fcntl @@ -219,7 +221,8 @@ def mhdir_train(h, path, is_spam, force): """Train bayes with an mh directory""" - if loud: 
print " Reading as MH mailbox" + if loud: + print " Reading as MH mailbox" import glob @@ -331,13 +334,15 @@ h = hammie.open(pck, usedb, "c") for g in good: - if loud: print "Training ham (%s):" % g + if loud: + print "Training ham (%s):" % g train(h, g, False, force, trainnew, removetrained) sys.stdout.flush() save = True for s in spam: - if loud: print "Training spam (%s):" % s + if loud: + print "Training spam (%s):" % s train(h, s, True, force, trainnew, removetrained) sys.stdout.flush() save = True Modified: trunk/spambayes/scripts/sb_notesfilter.py =================================================================== --- trunk/spambayes/scripts/sb_notesfilter.py 2008-11-25 03:56:06 UTC (rev 3213) +++ trunk/spambayes/scripts/sb_notesfilter.py 2008-11-25 04:01:34 UTC (rev 3214) @@ -144,14 +144,15 @@ return not not val import sys -from spambayes import tokenizer, storage -from spambayes.Options import options -import cPickle as pickle import errno +import getopt + import win32com.client import pywintypes -import getopt +from spambayes import tokenizer, storage +from spambayes.Options import options +from spambayes.safepickle import pickle_read, pickle_write def classifyInbox(v, vmoveto, bayes, ldbname, notesindex, log): @@ -187,20 +188,18 @@ # probably due to this unicode problem. 
options["Tokenizer", "generate_long_skips"] = False tokens = tokenizer.tokenize(message) - prob, clues = bayes.spamprob(tokens, evidence=True) + prob = bayes.spamprob(tokens) if prob < options["Categorization", "ham_cutoff"]: - disposition = options["Headers", "header_ham_string"] numham += 1 elif prob > options["Categorization", "spam_cutoff"]: - disposition = options["Headers", "header_spam_string"] docstomove += [doc] numspam += 1 else: - disposition = options["Headers", "header_unsure_string"] numuns += 1 notesindex[nid] = 'classified' + subj = message["subject"] try: print "%s spamprob is %s" % (subj[:30], prob) if log: @@ -305,16 +304,13 @@ bayes = storage.open_storage(bdbname, useDBM) try: - fp = open(idxname, 'rb') + notesindex = pickle_read(idxname) except IOError, e: if e.errno != errno.ENOENT: raise notesindex = {} print "%s file not found, this is a first time run" % (idxname,) print "No classification will be performed" - else: - notesindex = pickle.load(fp) - fp.close() need_replicate = False @@ -378,9 +374,7 @@ bayes.store() - fp = open(idxname, 'wb') - pickle.dump(notesindex, fp) - fp.close() + pickle_write(idxname, notesindex) if log: log.LogAction("Finished running spambayes") @@ -390,7 +384,7 @@ try: opts, args = getopt.getopt(sys.argv[1:], 'htcPd:p:l:r:f:o:i:W:L:') except getopt.error, msg: - print >>sys.stderr, str(msg) + '\n\n' + __doc__ + print >> sys.stderr, str(msg) + '\n\n' + __doc__ sys.exit() ldbname = None # local notes database name @@ -405,7 +399,7 @@ for opt, arg in opts: if opt == '-h': - print >>sys.stderr, __doc__ + print >> sys.stderr, __doc__ sys.exit() elif opt == '-l': ldbname = arg @@ -437,9 +431,6 @@ sbfname, doTrain, doClassify, pwd, idxname, logname) if doPrompt: - try: - key = input("Press Enter to end") - except SyntaxError: - pass + raw_input("Press Enter to end ") else: - print >>sys.stderr, __doc__ + print >> sys.stderr, __doc__ Modified: trunk/spambayes/scripts/sb_pop3dnd.py 
=================================================================== --- trunk/spambayes/scripts/sb_pop3dnd.py 2008-11-25 03:56:06 UTC (rev 3213) +++ trunk/spambayes/scripts/sb_pop3dnd.py 2008-11-25 04:01:34 UTC (rev 3214) @@ -64,16 +64,17 @@ import os import re import sys -import md5 +try: + from hashlib import md5 +except ImportError: + from md5 import new as md5 import time import errno -import types import email import thread import getopt import socket import imaplib -import operator import email.Utils try: @@ -85,14 +86,12 @@ import twisted.application.app from twisted.internet import defer from twisted.internet import reactor -from twisted.internet.defer import maybeDeferred from twisted.internet.protocol import ServerFactory from twisted.protocols.imap4 import IMessage -from twisted.protocols.imap4 import parseNestedParens, parseIdList -from twisted.protocols.imap4 import IllegalClientResponse, IAccount -from twisted.protocols.imap4 import collapseNestedLists, MessageSet +from twisted.protocols.imap4 import IAccount +from twisted.protocols.imap4 import MessageSet from twisted.protocols.imap4 import IMAP4Server, MemoryAccount, IMailbox -from twisted.protocols.imap4 import IMailboxListener, collapseNestedLists +from twisted.protocols.imap4 import IMailboxListener from spambayes import storage from spambayes import message @@ -101,7 +100,7 @@ from spambayes.tokenizer import tokenize from spambayes import FileCorpus, Dibbler from spambayes.Version import get_current_version -from sb_server import POP3ProxyBase, State, _addressPortStr, _recreateState +from sb_server import POP3ProxyBase, State, _addressPortStr def ensureDir(dirname): """Ensure that the given directory exists - in other words, if it @@ -263,7 +262,7 @@ if part.get_main_type() == "text": part_s.append(str(part.as_string().count("\n"))) if ext: - part_s.extend([md5.new(part.as_string()).digest(), + part_s.extend([md5(part.as_string()).digest(), part.get('Content-Disposition'), 
part.get('Content-Language')]) s.append(part_s) @@ -538,7 +537,7 @@ class SpambayesInbox(SpambayesMailbox): """A special mailbox that holds status messages from SpamBayes.""" def __init__(self, id, state): - IMAPMailbox.__init__(self, "INBOX", "spambayes", id) + SpambayesMailbox.__init__(self, "INBOX", "spambayes", id) self.mdb = state.mdb self.UID_validity = id self.nextUID = 1 @@ -826,8 +825,8 @@ msg = email.message_from_string(messageText, _class=message.SBHeaderMessage) # Now find the spam disposition and add the header. - (prob, clues) = state.bayes.spamprob(msg.tokenize(),\ - evidence=True) + (prob, clues) = state.bayes.spamprob(msg.tokenize(), + evidence=True) # Note that the X-SpamBayes-MailID header will be worthless # because we don't know the message id at this point. It's @@ -870,7 +869,7 @@ message.insert_exception_header(messageText) # Print the exception and a traceback. - print >>sys.stderr, details + print >> sys.stderr, details retval = ok + "\n" + messageText if terminatingDotPresent: retval += '.\r\n' @@ -1009,12 +1008,12 @@ try: opts, args = getopt.getopt(sys.argv[1:], 'ho:') except getopt.error, msg: - print >>sys.stderr, str(msg) + '\n\n' + __doc__ + print >> sys.stderr, str(msg) + '\n\n' + __doc__ sys.exit() for opt, arg in opts: if opt == '-h': - print >>sys.stderr, __doc__ + print >> sys.stderr, __doc__ sys.exit() elif opt == '-o': options.set_from_cmdline(arg, sys.stderr) Modified: trunk/spambayes/scripts/sb_server.py =================================================================== --- trunk/spambayes/scripts/sb_server.py 2008-11-25 03:56:06 UTC (rev 3213) +++ trunk/spambayes/scripts/sb_server.py 2008-11-25 04:01:34 UTC (rev 3214) @@ -102,16 +102,15 @@ o NNTP proxy. 
""" -import os, sys, re, errno, getopt, time, traceback, socket, cStringIO, email +import sys, re, getopt, time, socket, email from thread import start_new_thread -from email.Header import Header import spambayes.message from spambayes import i18n from spambayes import Stats from spambayes import Dibbler from spambayes import storage -from spambayes.FileCorpus import FileCorpus, ExpiryFileCorpus +from spambayes.FileCorpus import ExpiryFileCorpus from spambayes.FileCorpus import FileMessageFactory, GzipFileMessageFactory from spambayes.Options import options, get_pathname_option, _ from spambayes.UserInterface import UserInterfaceServer @@ -191,7 +190,7 @@ except socket.sslerror, why: if why[0] == 1: # error:140770FC:SSL routines:SSL23_GET_SERVER_HELLO:unknown protocol' # Probably not SSL after all. - print >>sys.stderr, "Can't use SSL" + print >> sys.stderr, "Can't use SSL" else: raise else: @@ -367,8 +366,7 @@ raise SystemExit elif verb == 'CRASH': # For testing - x = 0 - y = 1/x + raise ZeroDivisionError self.serverSocket.push(self.request + '\r\n') if self.request.strip() == '': @@ -568,8 +566,8 @@ _class=spambayes.message.SBHeaderMessage) msg.setId(state.getNewMessageName()) # Now find the spam disposition and add the header. - (prob, clues) = state.bayes.spamprob(msg.tokenize(),\ - evidence=True) + (prob, clues) = state.bayes.spamprob(msg.tokenize(), + evidence=True) msg.addSBHeaders(prob, clues) @@ -632,7 +630,7 @@ insert_exception_header(messageText) # Print the exception and a traceback. - print >>sys.stderr, details + print >> sys.stderr, details # Restore the +OK and the POP3 .\r\n terminator if there was one. 
retval = ok + "\n" + messageText @@ -836,7 +834,6 @@ nham = self.bayes.nham if nspam > 10 and nham > 10: db_ratio = nham/float(nspam) - big = small = None if db_ratio > 5.0: self.warning = _("Warning: you have much more ham than " \ "spam - SpamBayes works best with " \ @@ -988,8 +985,6 @@ proxyListeners.append(listener) def _recreateState(): - global state - # Close the existing listeners and create new ones. This won't # affect any running proxies - once a listener has created a proxy, # that proxy is then independent of it. @@ -1057,13 +1052,12 @@ try: opts, args = getopt.getopt(sys.argv[1:], 'hbd:p:l:u:o:') except getopt.error, msg: - print >>sys.stderr, str(msg) + '\n\n' + __doc__ + print >> sys.stderr, str(msg) + '\n\n' + __doc__ sys.exit() - runSelfTest = False for opt, arg in opts: if opt == '-h': - print >>sys.stderr, __doc__ + print >> sys.stderr, __doc__ sys.exit() elif opt == '-b': state.launchUI = True @@ -1096,14 +1090,14 @@ try: prepare() except AlreadyRunningException: - print >>sys.stderr, \ + print >> sys.stderr, \ "ERROR: The proxy is already running on this machine." 
- print >>sys.stderr, "Please stop the existing proxy and try again" + print >> sys.stderr, "Please stop the existing proxy and try again" return start() else: - print >>sys.stderr, __doc__ + print >> sys.stderr, __doc__ if __name__ == '__main__': run() Modified: trunk/spambayes/scripts/sb_upload.py =================================================================== --- trunk/spambayes/scripts/sb_upload.py 2008-11-25 03:56:06 UTC (rev 3213) +++ trunk/spambayes/scripts/sb_upload.py 2008-11-25 04:01:34 UTC (rev 3214) @@ -58,7 +58,7 @@ h.putheader('content-length', str(len(body))) h.endheaders() h.send(body) - errcode, errmsg, headers = h.getreply() + h.getreply() return h.file.read() def encode_multipart_formdata(fields, files): @@ -153,7 +153,7 @@ ("text", "")], [("file", "message.dat", data)]) else: - post_multipart("%s:%d" % (server,port), "/upload", [], + post_multipart("%s:%d" % (server, port), "/upload", [], [('file', 'message.dat', data)]) except: # not an error if the server isn't responding Modified: trunk/spambayes/scripts/sb_xmlrpcserver.py =================================================================== --- trunk/spambayes/scripts/sb_xmlrpcserver.py 2008-11-25 03:56:06 UTC (rev 3213) +++ trunk/spambayes/scripts/sb_xmlrpcserver.py 2008-11-25 04:01:34 UTC (rev 3214) @@ -22,10 +22,8 @@ Port number to listen to. """ -import os import getopt import sys -import traceback import xmlrpclib import SimpleXMLRPCServer This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. 
From montanaro at users.sourceforge.net Tue Nov 25 04:29:21 2008 From: montanaro at users.sourceforge.net (montanaro at users.sourceforge.net) Date: Tue, 25 Nov 2008 03:29:21 +0000 Subject: [Spambayes-checkins] SF.net SVN: spambayes:[3206] trunk/spambayes Message-ID: Revision: 3206 http://spambayes.svn.sourceforge.net/spambayes/?rev=3206&view=rev Author: montanaro Date: 2008-11-25 03:29:21 +0000 (Tue, 25 Nov 2008) Log Message: ----------- accidentally checked in the whole shootin' match Modified Paths: -------------- trunk/spambayes/CHANGELOG.txt trunk/spambayes/contrib/SmarterHTTPServer.py trunk/spambayes/contrib/bulkgraph.py trunk/spambayes/contrib/findbest.py trunk/spambayes/contrib/mod_spambayes.py trunk/spambayes/contrib/nway.py trunk/spambayes/contrib/pycksum.py trunk/spambayes/contrib/sb_culler.py trunk/spambayes/contrib/spamcounts.py trunk/spambayes/contrib/tte.py trunk/spambayes/pspam/pop.py trunk/spambayes/pspam/scoremsg.py trunk/spambayes/pspam/update.py trunk/spambayes/scripts/core_server.py trunk/spambayes/scripts/sb_bnfilter.py trunk/spambayes/scripts/sb_bnserver.py trunk/spambayes/scripts/sb_client.py trunk/spambayes/scripts/sb_dbexpimp.py trunk/spambayes/scripts/sb_imapfilter.py trunk/spambayes/scripts/sb_mailsort.py trunk/spambayes/scripts/sb_mboxtrain.py trunk/spambayes/scripts/sb_notesfilter.py trunk/spambayes/scripts/sb_pop3dnd.py trunk/spambayes/scripts/sb_server.py trunk/spambayes/scripts/sb_upload.py trunk/spambayes/scripts/sb_xmlrpcserver.py trunk/spambayes/setup.py trunk/spambayes/spambayes/ImapUI.py trunk/spambayes/spambayes/Options.py trunk/spambayes/spambayes/Version.py trunk/spambayes/spambayes/__init__.py trunk/spambayes/spambayes/chi2.py trunk/spambayes/spambayes/optimize.py trunk/spambayes/spambayes/storage.py trunk/spambayes/spambayes/tokenizer.py trunk/spambayes/testtools/es2hs.py trunk/spambayes/testtools/incremental.py trunk/spambayes/utilities/HistToGNU.py trunk/spambayes/utilities/convert_config_file.py 
trunk/spambayes/utilities/convert_db.py trunk/spambayes/utilities/extractmessages.py trunk/spambayes/utilities/hammer.py trunk/spambayes/utilities/loosecksum.py trunk/spambayes/utilities/mboxcount.py trunk/spambayes/utilities/mkreversemap.py trunk/spambayes/utilities/split.py trunk/spambayes/utilities/splitn.py trunk/spambayes/utilities/splitndirs.py trunk/spambayes/windows/autoconfigure.py trunk/spambayes/windows/pop3proxy_service.py Modified: trunk/spambayes/CHANGELOG.txt =================================================================== --- trunk/spambayes/CHANGELOG.txt 2008-11-25 02:11:59 UTC (rev 3205) +++ trunk/spambayes/CHANGELOG.txt 2008-11-25 03:29:21 UTC (rev 3206) @@ -1,11 +1,5 @@ [Note that all dates are in ISO 8601 format, e.g. YYYY-MM-DD to ease sorting] -Release 1.1b1 -============= - -Skip Montanaro 2008-11-23 Route all pickle reads and writes through safepickle module. -Skip Montanaro 2008-11-23 Pick off a bunch of pylint nit (still tons to do). - Release 1.1a5 ============= Modified: trunk/spambayes/contrib/SmarterHTTPServer.py =================================================================== --- trunk/spambayes/contrib/SmarterHTTPServer.py 2008-11-25 02:11:59 UTC (rev 3205) +++ trunk/spambayes/contrib/SmarterHTTPServer.py 2008-11-25 03:29:21 UTC (rev 3206) @@ -20,6 +20,7 @@ import SimpleHTTPServer import urllib import cgi +import shutil import mimetypes import re try: Modified: trunk/spambayes/contrib/bulkgraph.py =================================================================== --- trunk/spambayes/contrib/bulkgraph.py 2008-11-25 02:11:59 UTC (rev 3205) +++ trunk/spambayes/contrib/bulkgraph.py 2008-11-25 03:29:21 UTC (rev 3206) @@ -32,15 +32,15 @@ quiet mode; no output """ +import mboxutils import getopt +import hammie import sys import os import re import time import filecmp -from spambayes import mboxutils, hammie - program = sys.argv[0] loud = True day = 24 * 60 * 60 Modified: trunk/spambayes/contrib/findbest.py 
=================================================================== --- trunk/spambayes/contrib/findbest.py 2008-11-25 02:11:59 UTC (rev 3205) +++ trunk/spambayes/contrib/findbest.py 2008-11-25 03:29:21 UTC (rev 3206) @@ -66,6 +66,7 @@ import sys import os +import cPickle as pickle import getopt import math @@ -74,8 +75,6 @@ from spambayes.hammie import Hammie from spambayes.tokenizer import tokenize from spambayes.Options import options -from spambayes import storage -from spambayes.safepickle import pickle_read, pickle_write cls = Classifier() h = Hammie(cls) @@ -99,6 +98,7 @@ def score(unsure, h, cls, scores, msgids=None, skipspam=False): """See what effect on others each msg in unsure has""" + ham_cutoff = options["Categorization", "ham_cutoff"] spam_cutoff = options["Categorization", "spam_cutoff"] # compute a base - number of messages in unsure already in the @@ -223,7 +223,7 @@ print "scoring" if best: - last_scores = pickle_read(bestfile) + last_scores = pickle.load(file(bestfile)) last_scores = last_scores.items() last_scores.sort() msgids = set() @@ -240,7 +240,7 @@ pass if not best: - pickle_write(bestfile, scores) + pickle.dump(scores, file(bestfile, 'w')) return 0 Modified: trunk/spambayes/contrib/mod_spambayes.py =================================================================== --- trunk/spambayes/contrib/mod_spambayes.py 2008-11-25 02:11:59 UTC (rev 3205) +++ trunk/spambayes/contrib/mod_spambayes.py 2008-11-25 03:29:21 UTC (rev 3206) @@ -5,24 +5,26 @@ ## Author: Skip Montanaro ## +import os + from proxy3_filter import * import proxy3_options -from spambayes import hammie, Options +from spambayes import hammie, Options, mboxutils dbf = Options.get_pathname_option("Storage", "persistent_storage_file") class SpambayesFilter(BufferAllFilter): - checker = hammie.open(dbf, 1, 'r') + hammie = hammie.open(dbf, 1, 'r') def filter(self, s): if self.reply.split()[1] == '200': - prob = self.checker.score("%s\r\n%s" % (self.serverheaders, s)) + prob = 
self.hammie.score("%s\r\n%s" % (self.serverheaders, s)) print "| prob: %.5f" % prob if prob >= Options.options["Categorization", "spam_cutoff"]: print self.serverheaders print "text:", s[0:40], "...", s[-40:] return "not authorized" - return s + return s from proxy3_util import * Modified: trunk/spambayes/contrib/nway.py =================================================================== --- trunk/spambayes/contrib/nway.py 2008-11-25 02:11:59 UTC (rev 3205) +++ trunk/spambayes/contrib/nway.py 2008-11-25 03:29:21 UTC (rev 3206) @@ -70,7 +70,7 @@ prog = os.path.basename(sys.argv[0]) -def usage(): +def help(): print >> sys.stderr, __doc__ % globals() def main(args): @@ -78,9 +78,10 @@ for opt, arg in opts: if opt == '-h': - usage() + help() return 0 + tagdb_list = [] msg = mboxutils.get_message(sys.stdin) try: del msg["X-Spambayes-Classification"] Modified: trunk/spambayes/contrib/pycksum.py =================================================================== --- trunk/spambayes/contrib/pycksum.py 2008-11-25 02:11:59 UTC (rev 3205) +++ trunk/spambayes/contrib/pycksum.py 2008-11-25 03:29:21 UTC (rev 3206) @@ -97,12 +97,12 @@ body = text.split("\n\n", 1)[1] lines = clean(body).split("\n") chunksize = len(lines)//4+1 - digest = [] + sum = [] for i in range(4): chunk = "\n".join(lines[i*chunksize:(i+1)*chunksize]) - digest.append(md5.new(chunk).hexdigest()) + sum.append(md5.new(chunk).hexdigest()) - return ".".join(digest) + return ".".join(sum) def save_checksum(cksum, f): pieces = cksum.split('.') @@ -118,12 +118,12 @@ if not db.has_key(subsum): db[subsum] = str(time.time()) if len(db) > maxdblen: - items = [(float(db[k]), k) for k in db.keys()] + items = [(float(db[k]),k) for k in db.keys()] items.sort() # the -20 brings us down a bit below the max so we aren't # constantly running this chunk of code items = items[:-(maxdblen-20)] - for v, k in items: + for v,k in items: del db[k] else: result = 0 Modified: trunk/spambayes/contrib/sb_culler.py 
=================================================================== --- trunk/spambayes/contrib/sb_culler.py 2008-11-25 02:11:59 UTC (rev 3205) +++ trunk/spambayes/contrib/sb_culler.py 2008-11-25 03:29:21 UTC (rev 3206) @@ -30,23 +30,20 @@ This program requires Python 2.3 or newer. """ -import socket -socket.setdefaulttimeout(10) - -import traceback, md5, os +import sets, traceback, md5, os import poplib import posixpath - -import sets from email import Header, Utils from spambayes import mboxutils, hammie -from spambayes.Options import options +import socket +socket.setdefaulttimeout(10) + DO_ACTIONS = 1 VERBOSE_LEVEL = 1 APPEND_TO_FILE = "append_to_file" -DELETE_FROM_MAILBOX = "delete" +DELETE = "delete" KEEP_IN_MAILBOX = "keep in mailbox" SPAM = "spam" VIRUS = "virus" @@ -111,7 +108,7 @@ def DELETE(mi, log): """Action: delete message from mailbox""" - log.do_action(DELETE_FROM_MAILBOX) + log.do_action(DELETE) if not DO_ACTIONS: return mi.mailbox.dele(mi.i) Modified: trunk/spambayes/contrib/spamcounts.py =================================================================== --- trunk/spambayes/contrib/spamcounts.py 2008-11-25 02:11:59 UTC (rev 3205) +++ trunk/spambayes/contrib/spamcounts.py 2008-11-25 03:29:21 UTC (rev 3206) @@ -19,11 +19,13 @@ import getopt import re import sets +import os +import shelve import csv -from spambayes.Options import options +from spambayes.Options import options, get_pathname_option from spambayes.tokenizer import tokenize -from spambayes.storage import database_type, open_storage +from spambayes.storage import STATE_KEY, database_type, open_storage prog = sys.argv[0] Modified: trunk/spambayes/contrib/tte.py =================================================================== --- trunk/spambayes/contrib/tte.py 2008-11-25 02:11:59 UTC (rev 3205) +++ trunk/spambayes/contrib/tte.py 2008-11-25 03:29:21 UTC (rev 3206) @@ -100,7 +100,7 @@ def train(store, hambox, spambox, maxmsgs, maxrounds, tdict, reverse, verbose, ratio): - round = 0 + 
smisses = hmisses = round = 0 ham_cutoff = Options.options["Categorization", "ham_cutoff"] spam_cutoff = Options.options["Categorization", "spam_cutoff"] @@ -114,19 +114,19 @@ hambone_ = list(reversed(hambone_)) spamcan_ = list(reversed(spamcan_)) - nspam, nham = len(spamcan_), len(hambone_) + nspam,nham = len(spamcan_),len(hambone_) if ratio: - rspam, rham = ratio + rspam,rham = ratio # If the actual ratio of spam to ham in the database is better than # what was asked for, use that better ratio. if (rspam > rham) == (rspam * nham > rham * nspam): - rspam, rham = nspam, nham + rspam,rham = nspam,nham # define some indexing constants ham = 0 spam = 1 name = ('ham','spam') - misses = [0, 0] + misses = [0,0] misclassified = lambda is_spam, score: ( is_spam and score < spam_cutoff or not is_spam and score > ham_cutoff) @@ -140,9 +140,9 @@ hambone = iter(hambone_) spamcan = iter(spamcan_) - i = [0, 0] + i = [0,0] msgs_processed = 0 - misses = [0, 0] + misses = [0,0] training_sets = [hambone, spamcan] while not maxmsgs or msgs_processed < maxmsgs: @@ -153,7 +153,7 @@ try: train_msg = training_sets[train_spam].next() except StopIteration: - break + break; i[train_spam] += 1 msgs_processed += 1 @@ -164,7 +164,7 @@ score = store.spamprob(tokens) selector = train_msg["message-id"] or train_msg["subject"] - if misclassified(train_spam, score) and selector is not None: + if misclassified(train_spam,score) and selector is not None: if verbose: print >> sys.stderr, "\tmiss %s: %.6f %s" % ( name[train_spam], score, selector) @@ -179,25 +179,24 @@ print "\rround: %2d, msgs: %4d, ham misses: %3d, spam misses: %3d, %.1fs" % \ (round, msgs_processed, misses[0], misses[1], seconds) - training_sets = [hambone, spamcan] + training_sets = [hambone,spamcan] # We count all untrained messages so the user knows what was skipped. 
# We also tag them for saving so we don't lose messages which might have # value in a future run - for is_spam in ham, spam: + for is_spam in ham,spam: nleft = 0 try: while True: msg = training_sets[is_spam].next() score = store.spamprob(tokenize(msg)) - if misclassified(is_spam, score): + if misclassified(is_spam,score): tdict[msg["message-id"]] = True nleft += 1 except StopIteration: - if nleft: - print nleft, "untrained %ss" % name[is_spam] + if nleft: print nleft, "untrained %ss" % name[is_spam] def cull(mbox_name, cullext, designation, tdict): print "writing new %s mbox..." % designation Modified: trunk/spambayes/pspam/pop.py =================================================================== --- trunk/spambayes/pspam/pop.py 2008-11-25 02:11:59 UTC (rev 3205) +++ trunk/spambayes/pspam/pop.py 2008-11-25 03:29:21 UTC (rev 3206) @@ -27,6 +27,7 @@ """ import SocketServer +import asyncore try: import cStringIO as StringIO except ImportError: @@ -36,8 +37,11 @@ import re import socket import sys +import threading import time +import ZODB +from ZEO.ClientStorage import ClientStorage import zLOG from spambayes.tokenizer import tokenize Modified: trunk/spambayes/pspam/scoremsg.py =================================================================== --- trunk/spambayes/pspam/scoremsg.py 2008-11-25 02:11:59 UTC (rev 3205) +++ trunk/spambayes/pspam/scoremsg.py 2008-11-25 03:29:21 UTC (rev 3206) @@ -6,7 +6,11 @@ import locale from types import UnicodeType +import ZODB +from ZEO.ClientStorage import ClientStorage + import pspam.database +from spambayes.Options import options from spambayes.tokenizer import tokenize try: Modified: trunk/spambayes/pspam/update.py =================================================================== --- trunk/spambayes/pspam/update.py 2008-11-25 02:11:59 UTC (rev 3205) +++ trunk/spambayes/pspam/update.py 2008-11-25 03:29:21 UTC (rev 3206) @@ -2,6 +2,9 @@ import os import sys +import ZODB +from ZEO.ClientStorage import ClientStorage + import 
pspam.database from pspam.profile import Profile Modified: trunk/spambayes/scripts/core_server.py =================================================================== --- trunk/spambayes/scripts/core_server.py 2008-11-25 02:11:59 UTC (rev 3205) +++ trunk/spambayes/scripts/core_server.py 2008-11-25 03:29:21 UTC (rev 3206) @@ -94,9 +94,11 @@ o Graphs. Of something. Who cares what? """ -import sys, getopt +import sys, getopt, time +from email.Header import Header from spambayes import Dibbler +from spambayes import storage from spambayes.Options import options, _ from spambayes.UserInterface import UserInterfaceServer from spambayes.Version import get_current_version Modified: trunk/spambayes/scripts/sb_bnfilter.py =================================================================== --- trunk/spambayes/scripts/sb_bnfilter.py 2008-11-25 02:11:59 UTC (rev 3205) +++ trunk/spambayes/scripts/sb_bnfilter.py 2008-11-25 03:29:21 UTC (rev 3206) @@ -157,44 +157,40 @@ if error: sys.exit(error) -def make_socket(server_options, filename): +def make_socket(server_options, file): refused_count = 0 no_server_count = 0 while 1: try: - s = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) - s.connect(filename) + s = socket.socket(socket.AF_UNIX,socket.SOCK_STREAM) + s.connect(file) except socket.error,e: if e[0] == errno.EAGAIN: # baaah pass - elif e[0] == errno.ENOENT or not os.path.exists(filename): - # We need to check os.path.exists for use on operating - # systems that never return ENOENT; linux 2.2. + elif e[0] == errno.ENOENT or not os.path.exists(file): + # We need to check os.path.exists for use on operating systems that + # never return ENOENT; linux 2.2. # # no such file.... no such server. create one. no_server_count += 1 - if no_server_count > 4: + if no_server_count>4: raise - # Reset refused count to start the sleep process over. - # Otherwise we run the risk of waiting a *really* long time - # and/or hitting the refused_count limit. 
- refused_count = 0 fork_server(server_options) elif e[0] == errno.ECONNREFUSED: # socket file exists but noone listening. refused_count += 1 - if refused_count == 4: + if refused_count == 6: # We have been waiting ages and still havent been able # to connect. Maybe that socket file has got # orphaned. remove it, wait, and try again. We need to # allow enough time for sb_bnserver to initialise the # rest of spambayes try: - os.unlink(filename) + os.unlink(file) except EnvironmentError: pass - elif refused_count > 6: + elif refused_count>6: raise else: raise # some other problem @@ -216,9 +212,9 @@ os.setsid() # Use exec rather than import here because eventually it may be nice to # reimplement this one file in C - os.execv(sys.executable, [sys.executable, - os.path.join(os.path.split(sys.argv[0])[0], - 'sb_bnserver.py') ]+options) + os.execv(sys.executable,[sys.executable, + os.path.join(os.path.split(sys.argv[0])[0], + 'sb_bnserver.py') ]+options) # should never get here sys._exit(1) Modified: trunk/spambayes/scripts/sb_bnserver.py =================================================================== --- trunk/spambayes/scripts/sb_bnserver.py 2008-11-25 02:11:59 UTC (rev 3205) +++ trunk/spambayes/scripts/sb_bnserver.py 2008-11-25 03:29:21 UTC (rev 3206) @@ -27,7 +27,7 @@ unix domain socket used on which we listen """ -import os, getopt, sys, SocketServer, traceback, select, socket, errno +import os, getopt, sys, SocketServer, time, traceback, select, socket, errno try: True, False @@ -63,7 +63,7 @@ try: server = BNServer(args[0], BNRequest) except socket.error,e: - if e[0] == errno.EADDRINUSE: + if e[0]==errno.EADDRINUSE: pass # in use, no need else: raise # a real error @@ -108,7 +108,7 @@ pass def get_request(self): - r, w, e = select.select([self.socket], [], [], self.timeout) + r,w,e = select.select([self.socket], [], [], self.timeout) if r: return self.socket.accept() else: @@ -119,16 +119,15 @@ switches = self.rfile.readline() body = self.rfile.read() try: - 
response = self._calc_response(switches, body) + response = self._calc_response(switches,body) self.wfile.write('0\n%d\n'%(len(response),)) self.wfile.write(response) except: - response = traceback.format_exception_only(sys.exc_info()[0], - sys.exc_info()[1])[0] + response = traceback.format_exception_only(sys.exc_info()[0],sys.exc_info()[1])[0] self.wfile.write('1\n%d\n'%(len(response),)) self.wfile.write(response) - def _calc_response(self, switches, body): + def _calc_response(self,switches,body): switches = switches.split() actions = [] opts, args = getopt.getopt(switches, 'fgstGS') Modified: trunk/spambayes/scripts/sb_client.py =================================================================== --- trunk/spambayes/scripts/sb_client.py 2008-11-25 02:11:59 UTC (rev 3205) +++ trunk/spambayes/scripts/sb_client.py 2008-11-25 03:29:21 UTC (rev 3206) @@ -10,7 +10,7 @@ import xmlrpclib import sys -RPCBASE = "http://localhost:65000" +RPCBASE="http://localhost:65000" def main(): msg = sys.stdin.read() Modified: trunk/spambayes/scripts/sb_dbexpimp.py =================================================================== --- trunk/spambayes/scripts/sb_dbexpimp.py 2008-11-25 02:11:59 UTC (rev 3205) +++ trunk/spambayes/scripts/sb_dbexpimp.py 2008-11-25 03:29:21 UTC (rev 3206) @@ -101,7 +101,8 @@ import spambayes.storage from spambayes.Options import options -import sys, os, getopt, errno +import sys, os, getopt, errno, re +import urllib from types import UnicodeType def uquote(s): @@ -136,8 +137,8 @@ writer = csv.writer(fp) - nham = bayes.nham - nspam = bayes.nspam + nham = bayes.nham; + nspam = bayes.nspam; print "Exporting database %s to file %s" % (dbFN, outFN) print "Database has %s ham, %s spam, and %s words" \ @@ -214,7 +215,7 @@ try: opts, args = getopt.getopt(sys.argv[1:], 'iehmvd:p:f:o:') except getopt.error, msg: - print >> sys.stderr, str(msg) + '\n\n' + __doc__ + print >>sys.stderr, str(msg) + '\n\n' + __doc__ sys.exit() useDBM = "pickle" @@ -226,7 +227,7 @@ for 
opt, arg in opts: if opt == '-h': - print >> sys.stderr, __doc__ + print >>sys.stderr, __doc__ sys.exit() elif opt == '-f': flatFN = arg @@ -246,4 +247,4 @@ if imp: runImport(dbFN, useDBM, newDBM, flatFN) else: - print >> sys.stderr, __doc__ + print >>sys.stderr, __doc__ Modified: trunk/spambayes/scripts/sb_imapfilter.py =================================================================== --- trunk/spambayes/scripts/sb_imapfilter.py 2008-11-25 02:11:59 UTC (rev 3205) +++ trunk/spambayes/scripts/sb_imapfilter.py 2008-11-25 03:29:21 UTC (rev 3206) @@ -110,7 +110,7 @@ else: temp_dir = win32api.GetTempPath() status = "Log file opened in " + temp_dir - for i in range(3, 0, -1): + for i in range(3,0,-1): try: os.unlink(os.path.join(temp_dir, "SpamBayesIMAP%d.log" % (i+1))) except os.error: @@ -132,15 +132,20 @@ import getopt import types import thread +import traceback import email import email.Parser from getpass import getpass from email.Utils import parsedate +try: + import cStringIO as StringIO +except ImportError: + import StringIO from spambayes import Stats from spambayes import message -from spambayes.Options import options, optionsPathname -from spambayes import storage, Dibbler +from spambayes.Options import options, get_pathname_option, optionsPathname +from spambayes import tokenizer, storage, Dibbler from spambayes.UserInterface import UserInterfaceServer from spambayes.ImapUI import IMAPUserInterface, LoginFailure @@ -172,6 +177,7 @@ timeout = 60 # seconds def __init__(self, server, debug=0, do_expunge = options["imap", "expunge"] ): + if server.find(':') > -1: server, port = server.split(':', 1) port = int(port) @@ -488,7 +494,7 @@ class IMAPMessage(message.SBHeaderMessage): def __init__(self): - message.SBHeaderMessage.__init__(self) + message.Message.__init__(self) self.folder = None self.previous_folder = None self.rfc822_command = "(BODY.PEEK[])" @@ -542,7 +548,7 @@ # Can't select the folder, so getting the substance will not # work. 
self.could_not_retrieve = True - print >> sys.stderr, "Could not select folder %s for message " \ + print >>sys.stderr, "Could not select folder %s for message " \ "%s (uid %s)" % (self.folder.name, self.id, self.uid) return self @@ -565,7 +571,7 @@ # characters for classification. For now, we just carry on, # warning the user and ignoring the message. self.could_not_retrieve = True - print >> sys.stderr, "MemoryError with message %s (uid %s)" % \ + print >>sys.stderr, "MemoryError with message %s (uid %s)" % \ (self.id, self.uid) return self @@ -608,7 +614,7 @@ self.got_substance = True # Print the exception and a traceback. - print >> sys.stderr, details + print >>sys.stderr, details return self @@ -654,7 +660,7 @@ We can't actually update the message with IMAP, so what we do is create a new message and delete the old one.""" - assert self.folder is not None, \ + assert self.folder is not None,\ "Can't save a message that doesn't have a folder." assert self.id, "Can't save a message that doesn't have an id." assert self.imap_server, "Can't do anything without IMAP connection." 
@@ -727,8 +733,7 @@ data = self.imap_server.check_response("recent", response) if data[0] is not None: if options["globals", "verbose"]: - print "[imapfilter] found saved message", self.uid, - print "in iteration", i + print "[imapfilter] found saved message %s in iteration" % self.uid, i break else: if options["globals", "verbose"]: @@ -958,7 +963,7 @@ cls = msg.GetClassification() if cls is None or hamfolder is not None: if options["globals", "verbose"]: - print "[imapfilter] classified as %s:" % cls, msg.uid + print "[imapfilter] classified as %s:"%cls, msg.uid msg = msg.get_full_message() if msg.could_not_retrieve: @@ -1135,13 +1140,13 @@ for u in usernames: pwds.append(getpass("Enter password for %s:" % (u,))) - return zip(servers, usernames, pwds) + return zip(servers,usernames,pwds) def run(force_UI=False): try: opts, args = getopt.getopt(sys.argv[1:], 'hbPtcvl:e:i:d:p:o:') except getopt.error, msg: - print >> sys.stderr, str(msg) + '\n\n' + __doc__ + print >>sys.stderr, str(msg) + '\n\n' + __doc__ sys.exit() doTrain = False @@ -1154,7 +1159,7 @@ for opt, arg in opts: if opt == '-h': - print >> sys.stderr, __doc__ + print >>sys.stderr, __doc__ sys.exit() elif opt == "-b": launchUI = True @@ -1243,7 +1248,7 @@ IMAPSession, stats=stats, close_db=close_db, change_db=change_db)) - launchBrowser = launchUI or options["html_ui", "launch_browser"] + launchBrowser=launchUI or options["html_ui", "launch_browser"] if sleepTime: # Run in a separate thread, as we have more work to do. 
thread.start_new_thread(Dibbler.run, (), Modified: trunk/spambayes/scripts/sb_mailsort.py =================================================================== --- trunk/spambayes/scripts/sb_mailsort.py 2008-11-25 02:11:59 UTC (rev 3205) +++ trunk/spambayes/scripts/sb_mailsort.py 2008-11-25 03:29:21 UTC (rev 3206) @@ -25,7 +25,7 @@ import time import signal import socket -import errno +import email DB_FILE = os.path.expanduser(DB_FILE) @@ -65,7 +65,7 @@ try: fd = os.open(pathname, os.O_WRONLY|os.O_CREAT|os.O_EXCL, 0600) except IOError, exc: - if exc[0] not in (errno.EINT, errno.EEXIST): + if exc[i] not in (errno.EINT, errno.EEXIST): raise else: break @@ -142,7 +142,7 @@ prob, evidence = bayes.spamprob(tokenize(msg), evidence=True) print msg_name, prob for word, prob in evidence: - print ' ', repr(word), prob + print ' ', `word`, prob def main(): global DB_FILE, CONFIG_FILE Modified: trunk/spambayes/scripts/sb_mboxtrain.py =================================================================== --- trunk/spambayes/scripts/sb_mboxtrain.py 2008-11-25 02:11:59 UTC (rev 3205) +++ trunk/spambayes/scripts/sb_mboxtrain.py 2008-11-25 03:29:21 UTC (rev 3206) @@ -111,8 +111,7 @@ def maildir_train(h, path, is_spam, force, removetrained): """Train bayes with all messages from a maildir.""" - if loud: - print " Reading %s as Maildir" % (path,) + if loud: print " Reading %s as Maildir" % (path,) import time import socket @@ -163,8 +162,7 @@ def mbox_train(h, path, is_spam, force): """Train bayes with a Unix mbox""" - if loud: - print " Reading as Unix mbox" + if loud: print " Reading as Unix mbox" import mailbox import fcntl @@ -221,8 +219,7 @@ def mhdir_train(h, path, is_spam, force): """Train bayes with an mh directory""" - if loud: - print " Reading as MH mailbox" + if loud: print " Reading as MH mailbox" import glob @@ -334,15 +331,13 @@ h = hammie.open(pck, usedb, "c") for g in good: - if loud: - print "Training ham (%s):" % g + if loud: print "Training ham (%s):" % g train(h, g, 
False, force, trainnew, removetrained) sys.stdout.flush() save = True for s in spam: - if loud: - print "Training spam (%s):" % s + if loud: print "Training spam (%s):" % s train(h, s, True, force, trainnew, removetrained) sys.stdout.flush() save = True Modified: trunk/spambayes/scripts/sb_notesfilter.py =================================================================== --- trunk/spambayes/scripts/sb_notesfilter.py 2008-11-25 02:11:59 UTC (rev 3205) +++ trunk/spambayes/scripts/sb_notesfilter.py 2008-11-25 03:29:21 UTC (rev 3206) @@ -144,15 +144,14 @@ return not not val import sys +from spambayes import tokenizer, storage +from spambayes.Options import options +import cPickle as pickle import errno -import getopt - import win32com.client import pywintypes +import getopt -from spambayes import tokenizer, storage -from spambayes.Options import options -from spambayes.safepickle import pickle_read, pickle_write def classifyInbox(v, vmoveto, bayes, ldbname, notesindex, log): @@ -188,18 +187,20 @@ # probably due to this unicode problem. 
options["Tokenizer", "generate_long_skips"] = False tokens = tokenizer.tokenize(message) - prob = bayes.spamprob(tokens) + prob, clues = bayes.spamprob(tokens, evidence=True) if prob < options["Categorization", "ham_cutoff"]: + disposition = options["Headers", "header_ham_string"] numham += 1 elif prob > options["Categorization", "spam_cutoff"]: + disposition = options["Headers", "header_spam_string"] docstomove += [doc] numspam += 1 else: + disposition = options["Headers", "header_unsure_string"] numuns += 1 notesindex[nid] = 'classified' - subj = message["subject"] try: print "%s spamprob is %s" % (subj[:30], prob) if log: @@ -304,13 +305,16 @@ bayes = storage.open_storage(bdbname, useDBM) try: - notesindex = pickle_read(idxname) + fp = open(idxname, 'rb') except IOError, e: if e.errno != errno.ENOENT: raise notesindex = {} print "%s file not found, this is a first time run" % (idxname,) print "No classification will be performed" + else: + notesindex = pickle.load(fp) + fp.close() need_replicate = False @@ -374,7 +378,9 @@ bayes.store() - pickle_write(idxname, notesindex) + fp = open(idxname, 'wb') + pickle.dump(notesindex, fp) + fp.close() if log: log.LogAction("Finished running spambayes") @@ -384,7 +390,7 @@ try: opts, args = getopt.getopt(sys.argv[1:], 'htcPd:p:l:r:f:o:i:W:L:') except getopt.error, msg: - print >> sys.stderr, str(msg) + '\n\n' + __doc__ + print >>sys.stderr, str(msg) + '\n\n' + __doc__ sys.exit() ldbname = None # local notes database name @@ -399,7 +405,7 @@ for opt, arg in opts: if opt == '-h': - print >> sys.stderr, __doc__ + print >>sys.stderr, __doc__ sys.exit() elif opt == '-l': ldbname = arg @@ -431,6 +437,9 @@ sbfname, doTrain, doClassify, pwd, idxname, logname) if doPrompt: - raw_input("Press Enter to end ") + try: + key = input("Press Enter to end") + except SyntaxError: + pass else: - print >> sys.stderr, __doc__ + print >>sys.stderr, __doc__ Modified: trunk/spambayes/scripts/sb_pop3dnd.py 
=================================================================== --- trunk/spambayes/scripts/sb_pop3dnd.py 2008-11-25 02:11:59 UTC (rev 3205) +++ trunk/spambayes/scripts/sb_pop3dnd.py 2008-11-25 03:29:21 UTC (rev 3206) @@ -67,11 +67,13 @@ import md5 import time import errno +import types import email import thread import getopt import socket import imaplib +import operator import email.Utils try: @@ -83,12 +85,14 @@ import twisted.application.app from twisted.internet import defer from twisted.internet import reactor +from twisted.internet.defer import maybeDeferred from twisted.internet.protocol import ServerFactory from twisted.protocols.imap4 import IMessage -from twisted.protocols.imap4 import IAccount -from twisted.protocols.imap4 import MessageSet +from twisted.protocols.imap4 import parseNestedParens, parseIdList +from twisted.protocols.imap4 import IllegalClientResponse, IAccount +from twisted.protocols.imap4 import collapseNestedLists, MessageSet from twisted.protocols.imap4 import IMAP4Server, MemoryAccount, IMailbox -from twisted.protocols.imap4 import IMailboxListener +from twisted.protocols.imap4 import IMailboxListener, collapseNestedLists from spambayes import storage from spambayes import message @@ -97,7 +101,7 @@ from spambayes.tokenizer import tokenize from spambayes import FileCorpus, Dibbler from spambayes.Version import get_current_version -from sb_server import POP3ProxyBase, State, _addressPortStr +from sb_server import POP3ProxyBase, State, _addressPortStr, _recreateState def ensureDir(dirname): """Ensure that the given directory exists - in other words, if it @@ -534,7 +538,7 @@ class SpambayesInbox(SpambayesMailbox): """A special mailbox that holds status messages from SpamBayes.""" def __init__(self, id, state): - SpambayesMailbox.__init__(self, "INBOX", "spambayes", id) + IMAPMailbox.__init__(self, "INBOX", "spambayes", id) self.mdb = state.mdb self.UID_validity = id self.nextUID = 1 @@ -822,8 +826,8 @@ msg = 
email.message_from_string(messageText, _class=message.SBHeaderMessage) # Now find the spam disposition and add the header. - (prob, clues) = state.bayes.spamprob(msg.tokenize(), - evidence=True) + (prob, clues) = state.bayes.spamprob(msg.tokenize(),\ + evidence=True) # Note that the X-SpamBayes-MailID header will be worthless # because we don't know the message id at this point. It's @@ -866,7 +870,7 @@ message.insert_exception_header(messageText) # Print the exception and a traceback. - print >> sys.stderr, details + print >>sys.stderr, details retval = ok + "\n" + messageText if terminatingDotPresent: retval += '.\r\n' @@ -1005,12 +1009,12 @@ try: opts, args = getopt.getopt(sys.argv[1:], 'ho:') except getopt.error, msg: - print >> sys.stderr, str(msg) + '\n\n' + __doc__ + print >>sys.stderr, str(msg) + '\n\n' + __doc__ sys.exit() for opt, arg in opts: if opt == '-h': - print >> sys.stderr, __doc__ + print >>sys.stderr, __doc__ sys.exit() elif opt == '-o': options.set_from_cmdline(arg, sys.stderr) Modified: trunk/spambayes/scripts/sb_server.py =================================================================== --- trunk/spambayes/scripts/sb_server.py 2008-11-25 02:11:59 UTC (rev 3205) +++ trunk/spambayes/scripts/sb_server.py 2008-11-25 03:29:21 UTC (rev 3206) @@ -102,15 +102,16 @@ o NNTP proxy. 
""" -import sys, re, getopt, time, socket, email +import os, sys, re, errno, getopt, time, traceback, socket, cStringIO, email from thread import start_new_thread +from email.Header import Header import spambayes.message from spambayes import i18n from spambayes import Stats from spambayes import Dibbler from spambayes import storage -from spambayes.FileCorpus import ExpiryFileCorpus +from spambayes.FileCorpus import FileCorpus, ExpiryFileCorpus from spambayes.FileCorpus import FileMessageFactory, GzipFileMessageFactory from spambayes.Options import options, get_pathname_option, _ from spambayes.UserInterface import UserInterfaceServer @@ -190,7 +191,7 @@ except socket.sslerror, why: if why[0] == 1: # error:140770FC:SSL routines:SSL23_GET_SERVER_HELLO:unknown protocol' # Probably not SSL after all. - print >> sys.stderr, "Can't use SSL" + print >>sys.stderr, "Can't use SSL" else: raise else: @@ -366,7 +367,8 @@ raise SystemExit elif verb == 'CRASH': # For testing - raise ZeroDivisionError + x = 0 + y = 1/x self.serverSocket.push(self.request + '\r\n') if self.request.strip() == '': @@ -566,8 +568,8 @@ _class=spambayes.message.SBHeaderMessage) msg.setId(state.getNewMessageName()) # Now find the spam disposition and add the header. - (prob, clues) = state.bayes.spamprob(msg.tokenize(), - evidence=True) + (prob, clues) = state.bayes.spamprob(msg.tokenize(),\ + evidence=True) msg.addSBHeaders(prob, clues) @@ -630,7 +632,7 @@ insert_exception_header(messageText) # Print the exception and a traceback. - print >> sys.stderr, details + print >>sys.stderr, details # Restore the +OK and the POP3 .\r\n terminator if there was one. 
retval = ok + "\n" + messageText @@ -834,6 +836,7 @@ nham = self.bayes.nham if nspam > 10 and nham > 10: db_ratio = nham/float(nspam) + big = small = None if db_ratio > 5.0: self.warning = _("Warning: you have much more ham than " \ "spam - SpamBayes works best with " \ @@ -985,6 +988,8 @@ proxyListeners.append(listener) def _recreateState(): + global state + # Close the existing listeners and create new ones. This won't # affect any running proxies - once a listener has created a proxy, # that proxy is then independent of it. @@ -1052,12 +1057,13 @@ try: opts, args = getopt.getopt(sys.argv[1:], 'hbd:p:l:u:o:') except getopt.error, msg: - print >> sys.stderr, str(msg) + '\n\n' + __doc__ + print >>sys.stderr, str(msg) + '\n\n' + __doc__ sys.exit() + runSelfTest = False for opt, arg in opts: if opt == '-h': - print >> sys.stderr, __doc__ + print >>sys.stderr, __doc__ sys.exit() elif opt == '-b': state.launchUI = True @@ -1090,14 +1096,14 @@ try: prepare() except AlreadyRunningException: - print >> sys.stderr, \ + print >>sys.stderr, \ "ERROR: The proxy is already running on this machine." 
- print >> sys.stderr, "Please stop the existing proxy and try again" + print >>sys.stderr, "Please stop the existing proxy and try again" return start() else: - print >> sys.stderr, __doc__ + print >>sys.stderr, __doc__ if __name__ == '__main__': run() Modified: trunk/spambayes/scripts/sb_upload.py =================================================================== --- trunk/spambayes/scripts/sb_upload.py 2008-11-25 02:11:59 UTC (rev 3205) +++ trunk/spambayes/scripts/sb_upload.py 2008-11-25 03:29:21 UTC (rev 3206) @@ -58,7 +58,7 @@ h.putheader('content-length', str(len(body))) h.endheaders() h.send(body) - h.getreply() + errcode, errmsg, headers = h.getreply() return h.file.read() def encode_multipart_formdata(fields, files): @@ -153,7 +153,7 @@ ("text", "")], [("file", "message.dat", data)]) else: - post_multipart("%s:%d" % (server, port), "/upload", [], + post_multipart("%s:%d" % (server,port), "/upload", [], [('file', 'message.dat', data)]) except: # not an error if the server isn't responding Modified: trunk/spambayes/scripts/sb_xmlrpcserver.py =================================================================== --- trunk/spambayes/scripts/sb_xmlrpcserver.py 2008-11-25 02:11:59 UTC (rev 3205) +++ trunk/spambayes/scripts/sb_xmlrpcserver.py 2008-11-25 03:29:21 UTC (rev 3206) @@ -22,8 +22,10 @@ Port number to listen to. """ +import os import getopt import sys +import traceback import xmlrpclib import SimpleXMLRPCServer Modified: trunk/spambayes/setup.py =================================================================== --- trunk/spambayes/setup.py 2008-11-25 02:11:59 UTC (rev 3205) +++ trunk/spambayes/setup.py 2008-11-25 03:29:21 UTC (rev 3206) @@ -1,10 +1,8 @@ #!/usr/bin/env python import os -import sys -from setuptools import setup, find_packages - +import sys if sys.version < '2.2': print "Error: Python version too old. You need at least Python 2.2 to use this package." 
print "(you're running version %s)"%sys.version @@ -123,9 +121,6 @@ author = "the spambayes project", author_email = "spambayes at python.org", url = "http://spambayes.sourceforge.net", - install_requires = ["lockfile>=0.2", - "pydns>=2.0"], -## packages=find_packages("spambayes"), cmdclass = {'install_scripts': install_scripts, 'sdist': sdist, }, Modified: trunk/spambayes/spambayes/ImapUI.py =================================================================== --- trunk/spambayes/spambayes/ImapUI.py 2008-11-25 02:11:59 UTC (rev 3205) +++ trunk/spambayes/spambayes/ImapUI.py 2008-11-25 03:29:21 UTC (rev 3206) @@ -43,7 +43,7 @@ import cgi -from spambayes import UserInterface +import UserInterface from spambayes.Options import options, optionsPathname, _ # These are the options that will be offered on the configuration page. Modified: trunk/spambayes/spambayes/Options.py =================================================================== --- trunk/spambayes/spambayes/Options.py 2008-11-25 02:11:59 UTC (rev 3205) +++ trunk/spambayes/spambayes/Options.py 2008-11-25 03:29:21 UTC (rev 3206) @@ -33,7 +33,7 @@ __all__ = ['options', '_'] # Grab the stuff from the core options class. -from spambayes.OptionsClass import * +from OptionsClass import * # A little magic. We'd like to use ZODB as the default storage, # because we've had so many problems with bsddb, and we'd like to swap @@ -199,12 +199,6 @@ reasons if your corpora are from different sources."""), BOOLEAN, RESTORE), - ("x-mine_nntp_headers", _("Mine NNTP-Posting-Host headers"), False, - _("""Usenet is host to a lot of spam. Usenet/Mailing list gateways - can let it leak across. Similar to mining received headers, we pick - apart the IP address or host name in this header for clues."""), - BOOLEAN, RESTORE), - ("address_headers", _("Address headers to mine"), ("from", "to", "cc", "sender", "reply-to"), _("""Mine the following address headers. 
If you have mixed source @@ -609,8 +603,8 @@ ("persistent_use_database", _("Database backend"), DB_TYPE[0], _("""SpamBayes can use either a ZODB or dbm database (quick to score one message) or a pickle (quick to train on huge amounts of messages). - There is also (experimental) ability to use a mySQL or PostgresSQL - database."""), + There is also (currently experimental) the ability to use a mySQL or + PostgrepSQL database."""), ("zeo", "zodb", "cdb", "mysql", "pgsql", "dbm", "pickle"), RESTORE), ("persistent_storage_file", _("Storage file name"), DB_TYPE[1], @@ -1366,7 +1360,7 @@ # in the current directory, and no win32 extensions installed # to locate the "user" directory - seeing things are so lamely # setup, it is worth printing a warning - print >> sys.stderr, "NOTE: We can not locate an INI file " \ + print >>sys.stderr, "NOTE: We can not locate an INI file " \ "for SpamBayes, and the Python for Windows extensions " \ "are not installed, meaning we can't locate your " \ "'user' directory. An empty configuration file at " \ Modified: trunk/spambayes/spambayes/Version.py =================================================================== --- trunk/spambayes/spambayes/Version.py 2008-11-25 02:11:59 UTC (rev 3205) +++ trunk/spambayes/spambayes/Version.py 2008-11-25 03:29:21 UTC (rev 3206) @@ -10,8 +10,8 @@ will generate the "ConfigParser" version for the web. """ -import sys -import re +import string, re +from types import StringType try: _ @@ -22,8 +22,8 @@ # A reason for why the spambayes.org URL fails is given in a comment there. #LATEST_VERSION_HOME="http://www.spambayes.org/download/Version.cfg" # The SF URL instead works for Tim and xenogeist. 
-LATEST_VERSION_HOME = "http://spambayes.sourceforge.net/download/Version.cfg" -DEFAULT_DOWNLOAD_PAGE = "http://spambayes.sourceforge.net/windows.html" +LATEST_VERSION_HOME="http://spambayes.sourceforge.net/download/Version.cfg" +DEFAULT_DOWNLOAD_PAGE="http://spambayes.sourceforge.net/windows.html" # This module is part of the spambayes project, which is Copyright 2002-2007 # The Python Software Foundation and is covered by the Python Software @@ -66,7 +66,7 @@ # and massage it into a string format that will compare properly # in update checks. try: - float(version) + ver_num = float(version) # Version converted successfully to a float, which means it # may be an old-format version number. Old convention was to # use 1.01 to represent "1.0.1", so check to see if there is @@ -86,8 +86,7 @@ def get_download_page(app = None, version_dict = None): - if version_dict is None: - version_dict = versions + if version_dict is None: version_dict = versions dict = version_dict # default to top level dictionary if app is not None: # attempt to get a sub-dict for the specific app @@ -186,21 +185,21 @@ releaselevel = "final" serial = 0 else: - serial = int(prerelease_num) + serial = string.atoi(prerelease_num) if prerelease == "a": releaselevel = "alpha" elif prerelease == "b": releaselevel = "beta" elif prerelease == "rc": releaselevel = "candidate" - self.version_info = tuple(map(int, [major, minor, patch]) + \ + self.version_info = tuple(map(string.atoi, [major, minor, patch]) + \ [releaselevel, serial]) def __str__(self): if self.version_info[2] == 0: - vstring = '.'.join(map(str, self.version_info[0:2])) + vstring = string.join(map(str, self.version_info[0:2]), '.') else: - vstring = '.'.join(map(str, self.version_info[0:3])) + vstring = string.join(map(str, self.version_info[0:3]), '.') releaselevel = self.version_info[3][0] if releaselevel != 'f': @@ -215,14 +214,13 @@ return vstring def __cmp__(self, other): - if isinstance(other, str): + if isinstance(other, StringType): 
other = SBVersion(other) return cmp(self.version_info, other.version_info) def get_long_version(self, app_name = None): - if app_name is None: - app_name = "SpamBayes" + if app_name is None: app_name = "SpamBayes" return _("%s Version %s (%s)") % (app_name, str(self), self.date) #============================================================================ @@ -270,7 +268,7 @@ ret_dict = {} apps_dict = ret_dict["Apps"] = {} for sect in cfg.sections(): - if sect == "SpamBayes": + if sect=="SpamBayes": target_dict = ret_dict else: target_dict = apps_dict.setdefault(sect, {}) @@ -350,6 +348,7 @@ _make_compatible_cfg_section(stream, appname, ver, versions["Apps"][appname]) def main(args): + import sys if '-g' in args: make_cfg(sys.stdout) sys.exit(0) @@ -371,5 +370,6 @@ print print "Latest version:", v_latest.get_long_version() -if __name__ == '__main__': +if __name__=='__main__': + import sys main(sys.argv) Modified: trunk/spambayes/spambayes/__init__.py =================================================================== --- trunk/spambayes/spambayes/__init__.py 2008-11-25 02:11:59 UTC (rev 3205) +++ trunk/spambayes/spambayes/__init__.py 2008-11-25 03:29:21 UTC (rev 3206) @@ -1,4 +1,4 @@ # package marker. 
-__version__ = "1.1b1" -__date__ = "November 23, 2008" +__version__ = "1.1a4" +__date__ = "June 25, 2007" Modified: trunk/spambayes/spambayes/chi2.py =================================================================== --- trunk/spambayes/spambayes/chi2.py 2008-11-25 02:11:59 UTC (rev 3205) +++ trunk/spambayes/spambayes/chi2.py 2008-11-25 03:29:21 UTC (rev 3206) @@ -1,5 +1,4 @@ import math as _math -import random try: True, False @@ -107,7 +106,7 @@ def __init__(self, baserandom=random.random, tabsize=513): self.baserandom = baserandom self.n = tabsize - self.tab = [baserandom() for _i in range(tabsize)] + self.tab = [baserandom() for i in range(tabsize)] self.next = baserandom() def random(self): @@ -151,8 +150,8 @@ s = Hist(20, lo=0.0, hi=1.0) score = Hist(20, lo=0.0, hi=1.0) - for _i in xrange(5000): - ps = [random() for _j in xrange(50)] + for i in range(5000): + ps = [random() for j in range(50)] s1, h1, score1 = judge(ps + [bias] * warp) s.add(s1) h.add(h1) @@ -204,4 +203,5 @@ print "(S-H+1)/2", score if __name__ == '__main__': + import random main() Modified: trunk/spambayes/spambayes/optimize.py =================================================================== --- trunk/spambayes/spambayes/optimize.py 2008-11-25 02:11:59 UTC (rev 3205) +++ trunk/spambayes/spambayes/optimize.py 2008-11-25 03:29:21 UTC (rev 3206) @@ -4,8 +4,6 @@ # Optimize any parametric function. # import copy - -# XXX Numeric is obsolete. Replace with numpy. 
import Numeric def SimplexMaximize(var, err, func, convcrit = 0.001, minerr = 0.001): @@ -32,7 +30,7 @@ if abs(value[bi] - value[wi]) <= convcrit: return simplex[bi] # Calculate average of non-worst - ave = Numeric.zeros(len(var), 'd') + ave=Numeric.zeros(len(var), 'd') for i in range(len(simplex)): if i != wi: ave = ave + simplex[i] Modified: trunk/spambayes/spambayes/storage.py =================================================================== --- trunk/spambayes/spambayes/storage.py 2008-11-25 02:11:59 UTC (rev 3205) +++ trunk/spambayes/spambayes/storage.py 2008-11-25 03:29:21 UTC (rev 3206) @@ -50,8 +50,8 @@ ### situations prints to sys.stdout will garble the message (e.g., in ### hammiefilter). -__author__ = ("Neale Pickett ," - "Tim Stone ") +__author__ = "Neale Pickett , \ +Tim Stone " __credits__ = "All the spambayes contributors." try: @@ -69,11 +69,11 @@ import tempfile from spambayes import classifier from spambayes.Options import options, get_pathname_option +import cPickle as pickle import errno import shelve from spambayes import cdb from spambayes import dbmstorage -from spambayes.safepickle import pickle_write # Make shelve use binary pickles by default. oldShelvePickler = shelve.Pickler @@ -85,6 +85,36 @@ NO_UPDATEPROBS = False # Probabilities will not be autoupdated with training UPDATEPROBS = True # Probabilities will be autoupdated with training +def safe_pickle(filename, value, protocol=0): + '''Store value as a pickle without creating corruption''' + + # Be as defensive as possible. Always keep a safe copy. + tmp = filename + '.tmp' + fp = None + try: + fp = open(tmp, 'wb') + pickle.dump(value, fp, protocol) + fp.close() + except IOError, e: + if options["globals", "verbose"]: + print >> sys.stderr, 'Failed update: ' + str(e) + if fp is not None: + os.remove(tmp) + raise + try: + # With *nix we can just rename, and (as long as permissions + # are correct) the old file will vanish. 
With win32, this + # won't work - the Python help says that there may not be + # a way to do an atomic replace, so we rename the old one, + # put the new one there, and then delete the old one. If + # something goes wrong, there is at least a copy of the old + # one. + os.rename(tmp, filename) + except OSError: + os.rename(filename, filename + '.bak') + os.rename(tmp, filename) + os.remove(filename + '.bak') + class PickledClassifier(classifier.Classifier): '''Classifier object persisted in a pickle''' @@ -106,12 +136,16 @@ # tempbayes object is reclaimed when load() returns. if options["globals", "verbose"]: - print >> sys.stderr, 'Loading state from', self.db_name, 'pickle' + print >> sys.stderr, 'Loading state from',self.db_name,'pickle' + tempbayes = None try: - tempbayes = pickle_read(self.db_name) - except: - tempbayes = None + fp = open(self.db_name, 'rb') + except IOError, e: + if e.errno != errno.ENOENT: raise + else: + tempbayes = pickle.load(fp) + fp.close() if tempbayes: # Copy state from tempbayes. The use of our base-class @@ -135,9 +169,9 @@ '''Store self as a pickle''' if options["globals", "verbose"]: - print >> sys.stderr, 'Persisting', self.db_name, 'as a pickle' + print >> sys.stderr, 'Persisting',self.db_name,'as a pickle' - pickle_write(self.db_name, self, PICKLE_TYPE) + safe_pickle(self.db_name, self, PICKLE_TYPE) def close(self): # we keep no resources open - nothing to do @@ -164,8 +198,7 @@ def close(self): # Close our underlying database. Better not assume all databases # have close functions! - def noop(): - pass + def noop(): pass getattr(self.db, "close", noop)() getattr(self.dbm, "close", noop)() # should not be a need to drop the 'dbm' or 'db' attributes. 
@@ -177,13 +210,13 @@ if hasattr(self, "dbm"): del self.dbm if options["globals", "verbose"]: - print >> sys.stderr, 'Closed', self.db_name, 'database' + print >> sys.stderr, 'Closed',self.db_name,'database' def load(self): '''Load state from database''' if options["globals", "verbose"]: - print >> sys.stderr, 'Loading state from', self.db_name, 'database' + print >> sys.stderr, 'Loading state from',self.db_name,'database' self.dbm = dbmstorage.open(self.db_name, self.mode) self.db = shelve.Shelf(self.dbm) @@ -211,8 +244,7 @@ '''Place state into persistent store''' if options["globals", "verbose"]: - print >> sys.stderr, 'Persisting', self.db_name, - print >> sys.stderr, 'state in database' + print >> sys.stderr, 'Persisting',self.db_name,'state in database' # Iterate over our changed word list. # This is *not* thread-safe - another thread changing our @@ -439,7 +471,7 @@ def fetchall(self, c): return c.dictfetchall() - def commit(self, _c): + def commit(self, c): self.db.commit() def load(self): @@ -448,7 +480,7 @@ import psycopg if options["globals", "verbose"]: - print >> sys.stderr, 'Loading state from', self.db_name, 'database' + print >> sys.stderr, 'Loading state from',self.db_name,'database' self.db = psycopg.connect('dbname=' + self.db_name) @@ -513,7 +545,7 @@ def fetchall(self, c): return c.fetchall() - def commit(self, _c): + def commit(self, c): self.db.commit() def load(self): @@ -522,7 +554,7 @@ import MySQLdb if options["globals", "verbose"]: - print >> sys.stderr, 'Loading state from', self.db_name, 'database' + print >> sys.stderr, 'Loading state from',self.db_name,'database' params = { 'host': self.host, 'db': self.db_name, @@ -692,11 +724,12 @@ object.__setattr__(self, att, value) def create_storage(self): + import ZODB from ZODB.FileStorage import FileStorage try: self.storage = FileStorage(self.db_filename, read_only=self.mode=='r') - except IOError: + except IOError, msg: print >> sys.stderr, ("Could not create FileStorage from", 
self.db_filename) raise @@ -735,6 +768,7 @@ def store(self): '''Place state into persistent store''' try: + import ZODB import ZODB.Transaction except ImportError: import transaction @@ -937,7 +971,7 @@ '''Untrain the database with the message''' if options["globals", "verbose"]: - print >> sys.stderr, 'untraining with', message.key() + print >> sys.stderr, 'untraining with',message.key() self.bayes.unlearn(message.tokenize(), self.is_spam) # self.updateprobs) @@ -971,7 +1005,6 @@ class NoSuchClassifierError(Exception): def __init__(self, invalid_name): - Exception.__init__(self, invalid_name) self.invalid_name = invalid_name def __str__(self): return repr(self.invalid_name) @@ -1055,7 +1088,7 @@ try: unused, unused, is_path = _storage_types[typ] except KeyError: - raise NoSuchClassifierError(typ) + raise NoSuchClassifierError(db_type) if is_path: nm = get_pathname_option(*default_name) else: @@ -1109,7 +1142,7 @@ try: os.mkdir(dirname) if options["globals", "verbose"]: - print >> sys.stderr, "Creating directory", dirname + print >>sys.stderr, "Creating directory", dirname except OSError, e: if e.errno != errno.EEXIST: raise Modified: trunk/spambayes/spambayes/tokenizer.py =================================================================== --- trunk/spambayes/spambayes/tokenizer.py 2008-11-25 02:11:59 UTC (rev 3205) +++ trunk/spambayes/spambayes/tokenizer.py 2008-11-25 03:29:21 UTC (rev 3206) @@ -14,8 +14,6 @@ import binascii import urlparse import urllib -import socket - try: # We have three possibilities for Set: # (a) With Python 2.2 and earlier, we use our compatsets class @@ -41,7 +39,7 @@ try: - from spambayes import dnscache + import dnscache cache = dnscache.cache(cachefile=options["Tokenizer", "lookup_ip_cache"]) cache.printStatsAtEnd = False except (IOError, ImportError): @@ -683,8 +681,6 @@ # by m19.grp.scd.yahoo.com with QMQP; 19 Dec 2003 04:06:53 -0000 received_ip_re = re.compile(r'[[(]((\d{1,3}\.?){4})[])]') -received_nntp_ip_re = 
re.compile(r'((\d{1,3}\.?){4})') - message_id_re = re.compile(r'\s*<[^@]+@([^>]+)>\s*') # I'm usually just splitting on whitespace, but for subject lines I want to @@ -1088,12 +1084,19 @@ scheme, netloc, path, params, query, frag = urlparse.urlparse(url) if cache is not None and options["Tokenizer", "x-lookup_ip"]: - ips = cache.lookup(netloc) + ips=cache.lookup(netloc) if not ips: pushclue("url-ip:lookup error") else: - for clue in gen_dotted_quad_clues("url-ip", ips): - pushclue(clue) + for ip in ips: # Should we limit to one A record? + pushclue("url-ip:%s/32" % ip) + dottedQuadList=ip.split(".") + pushclue("url-ip:%s/8" % dottedQuadList[0]) + pushclue("url-ip:%s.%s/16" % (dottedQuadList[0], + dottedQuadList[1])) + pushclue("url-ip:%s.%s.%s/24" % (dottedQuadList[0], + dottedQuadList[1], + dottedQuadList[2])) # one common technique in bogus "please (re-)authorize yourself" # scams is to make it appear as if you're visiting a valid @@ -1523,13 +1526,6 @@ for tok in breakdown(m.group(1)): yield 'received:' + tok - # Lots of spam gets posted on Usenet. If it is then gatewayed to a - # mailing list perhaps the NNTP-Posting-Host info will yield some - # useful clues. - if options["Tokenizer", "x-mine_nntp_headers"]: - for clue in mine_nntp(msg): - yield clue - # Message-Id: This seems to be a small win and should not # adversely affect a mixed source corpus so it's always enabled. 
msgid = msg.get("message-id", "") @@ -1702,56 +1698,5 @@ for t in self.tokenize_text(text): yield t -def mine_nntp(msg): - nntp_headers = msg.get_all("nntp-posting-host", ()) - yield "has-nntp:%s" % not not nntp_headers - for header in nntp_headers: - try: - address = header.split()[1] - except IndexError: - continue - if received_nntp_ip_re.match(address): - for clue in gen_dotted_quad_clues("nntp-host", [address]): - yield clue - try: - h = socket.gethostbyaddr(address) - except socket.herror: - yield 'nntp-host-ip:has-no-reverse' - else: - yield 'nntp-host-ip:has-reverse' - yield 'nntp-host-name:%s' % h[0] - yield ('nntp-host-domain:%s' % - '.'.join(h[0].split('.')[-2:])) - else: - # assume it's a hostname - name = address - yield 'nntp-host-name:%s' % name - yield ('nntp-host-domain:%s' % - '.'.join(name.split('.')[-2:])) - try: - address = socket.gethostbyname(name) - except socket.gaierror: - yield 'nntp-host-name:invalid' - else: - for clue in gen_dotted_quad_clues("nntp-host-ip", [address]): - yield clue - try: - h = socket.gethostbyaddr(address) - except socket.herror: - yield 'nntp-host-ip:has-no-reverse' - else: - yield 'nntp-host-ip:has-reverse' - -def gen_dotted_quad_clues(pfx, ips): - for ip in ips: - yield "%s:%s/32" % (pfx, ip) - dottedQuadList = ip.split(".") - yield "%s:%s/8" % (pfx, dottedQuadList[0]) - yield "%s:%s.%s/16" % (pfx, dottedQuadList[0], - dottedQuadList[1]) - yield "%s:%s.%s.%s/24" % (pfx, dottedQuadList[0], - dottedQuadList[1], - dottedQuadList[2]) - global_tokenizer = Tokenizer() tokenize = global_tokenizer.tokenize Modified: trunk/spambayes/testtools/es2hs.py =================================================================== --- trunk/spambayes/testtools/es2hs.py 2008-11-25 02:11:59 UTC (rev 3205) +++ trunk/spambayes/testtools/es2hs.py 2008-11-25 03:29:21 UTC (rev 3206) @@ -43,6 +43,8 @@ def main(): """Main program; parse options and go.""" + global loud + everything = None spam = [] @@ -69,8 +71,7 @@ spamsizes = {} for s in 
spam: - if loud: - print "Scanning spamdir (%s):" % s + if loud: print "Scanning spamdir (%s):" % s files = os.listdir(s) for f in files: if f[0] in ('1', '2', '3', '4', '5', '6', '7', '8', '9'): @@ -84,8 +85,7 @@ os.makedirs(spamdir) os.makedirs(hamdir) - if loud: - print "Scanning everything" + if loud: print "Scanning everything" for f in os.listdir(everything): if f[0] in ('1', '2', '3', '4', '5', '6', '7', '8', '9'): name = os.path.join(everything, f) Modified: trunk/spambayes/testtools/incremental.py =================================================================== --- trunk/spambayes/testtools/incremental.py 2008-11-25 02:11:59 UTC (rev 3205) +++ trunk/spambayes/testtools/incremental.py 2008-11-25 03:29:21 UTC (rev 3206) @@ -30,7 +30,7 @@ from spambayes import msgs import email from email import Message -from testtools import regimes +import regimes try: True, False @@ -76,10 +76,10 @@ # The number of test instances correctly and incorrectly classified. self.nham_right = 0 self.nham_wrong = 0 - self.nham_unsure = 0 + self.nham_unsure = 0; self.nspam_right = 0 self.nspam_wrong = 0 - self.nspam_unsure = 0 + self.nspam_unsure = 0; # Lists of bad predictions. self.ham_wrong_examples = [] # False positives: ham called spam. 
Modified: trunk/spambayes/utilities/HistToGNU.py =================================================================== --- trunk/spambayes/utilities/HistToGNU.py 2008-11-25 02:11:59 UTC (rev 3205) +++ trunk/spambayes/utilities/HistToGNU.py 2008-11-25 03:29:21 UTC (rev 3206) @@ -18,13 +18,14 @@ set xrange [0.0:100.0] """ -dataSetOptions = "smooth unique" +dataSetOptions="smooth unique" +from spambayes.Options import options +from spambayes.TestDriver import Hist + import sys +import cPickle as pickle -from spambayes.Options import options -from spambayes.safepickle import pickle_read, pickle_write - program = sys.argv[0] def usage(code, msg=''): @@ -37,7 +38,7 @@ def loadHist(path): """Load the histogram pickle object""" - return pickle_read(path) + return pickle.load(file(path)) def outputHist(hist, f=sys.stdout): """Output the Hist object to file f""" @@ -48,7 +49,7 @@ def plot(files): """given a list of files, create gnu-plot file""" - import cStringIO + import cStringIO, os cmd = cStringIO.StringIO() cmd.write(globalOptions) args = [] Modified: trunk/spambayes/utilities/convert_config_file.py =================================================================== --- trunk/spambayes/utilities/convert_config_file.py 2008-11-25 02:11:59 UTC (rev 3205) +++ trunk/spambayes/utilities/convert_config_file.py 2008-11-25 03:29:21 UTC (rev 3206) @@ -45,7 +45,7 @@ try: opts, args = getopt.getopt(sys.argv[1:], 'vhf:') except getopt.error, msg: - print >> sys.stderr, str(msg) + '\n\n' + __doc__ + print >>sys.stderr, str(msg) + '\n\n' + __doc__ sys.exit() filename = "bayescustomize.ini" @@ -53,7 +53,7 @@ for opt, arg in opts: if opt == '-h': - print >> sys.stderr, __doc__ + print >>sys.stderr, __doc__ sys.exit() elif opt == '-f': filename = arg @@ -64,6 +64,7 @@ if verbose: print "Loading defaults" o.load_defaults() + alts = [] if verbose: print "Updating file:", filename if os.path.exists(filename): Modified: trunk/spambayes/utilities/convert_db.py 
=================================================================== --- trunk/spambayes/utilities/convert_db.py 2008-11-25 02:11:59 UTC (rev 3205) +++ trunk/spambayes/utilities/convert_db.py 2008-11-25 03:29:21 UTC (rev 3206) @@ -41,13 +41,13 @@ try: opts, args = getopt.getopt(sys.argv[1:], 'ht:T:n:N:') except getopt.error, msg: - print >> sys.stderr, str(msg) + '\n\n' + __doc__ + print >>sys.stderr, str(msg) + '\n\n' + __doc__ sys.exit() old_name = old_type = new_name = new_type = None for opt, arg in opts: if opt == '-h': - print >> sys.stderr, __doc__ + print >>sys.stderr, __doc__ sys.exit() elif opt == '-t': old_type = arg Modified: trunk/spambayes/utilities/extractmessages.py =================================================================== --- trunk/spambayes/utilities/extractmessages.py 2008-11-25 02:11:59 UTC (rev 3205) +++ trunk/spambayes/utilities/extractmessages.py 2008-11-25 03:29:21 UTC (rev 3206) @@ -23,11 +23,11 @@ import sys import getopt import re +import cPickle as pickle import locale from email.Header import make_header, decode_header from spambayes.mboxutils import getmbox -from spambayes.safepickle import pickle_read, pickle_write prog = sys.argv[0] @@ -115,7 +115,7 @@ return 1 try: - mapd = pickle_read(mapfile) + mapd = pickle.load(file(mapfile)) except IOError: usage("Mapfile %s does not exist" % mapfile) return 1 Modified: trunk/spambayes/utilities/hammer.py =================================================================== --- trunk/spambayes/utilities/hammer.py 2008-11-25 02:11:59 UTC (rev 3205) +++ trunk/spambayes/utilities/hammer.py 2008-11-25 03:29:21 UTC (rev 3206) @@ -3,7 +3,7 @@ # Part of the SpamBayes project. 
Released under the Python Software # Foundation license; see http://www.python.org/ -import os, re, random, textwrap +import os, sys, re, random, textwrap from spambayes import storage from spambayes import tokenizer Modified: trunk/spambayes/utilities/loosecksum.py =================================================================== --- trunk/spambayes/utilities/loosecksum.py 2008-11-25 02:11:59 UTC (rev 3205) +++ trunk/spambayes/utilities/loosecksum.py 2008-11-25 03:29:21 UTC (rev 3206) @@ -26,8 +26,10 @@ import getopt import sys +import email.Parser import md5 import re +import time import binascii from spambayes.mboxutils import getmbox Modified: trunk/spambayes/utilities/mboxcount.py =================================================================== --- trunk/spambayes/utilities/mboxcount.py 2008-11-25 02:11:59 UTC (rev 3205) +++ trunk/spambayes/utilities/mboxcount.py 2008-11-25 03:29:21 UTC (rev 3206) @@ -29,6 +29,7 @@ import sys import mailbox +import email import getopt import glob Modified: trunk/spambayes/utilities/mkreversemap.py =================================================================== --- trunk/spambayes/utilities/mkreversemap.py 2008-11-25 02:11:59 UTC (rev 3205) +++ trunk/spambayes/utilities/mkreversemap.py 2008-11-25 03:29:21 UTC (rev 3206) @@ -17,12 +17,13 @@ import sys import getopt +import anydbm +import cPickle as pickle from spambayes.mboxutils import getmbox from spambayes.tokenizer import tokenize from spambayes.Options import options from spambayes.classifier import Classifier -from spambayes.safepickle import pickle_read, pickle_write prog = sys.argv[0] @@ -98,13 +99,13 @@ return 1 try: - mapd = pickle_read(mapfile) + mapd = pickle.load(file(mapfile)) except IOError: mapd = {} for f in args: mapmessages(f, mboxtype, mapd) - pickle_write(mapfile, mapd) + pickle.dump(mapd, file(mapfile, "w")) if __name__ == "__main__": sys.exit(main(sys.argv[1:])) Modified: trunk/spambayes/utilities/split.py 
=================================================================== --- trunk/spambayes/utilities/split.py 2008-11-25 02:11:59 UTC (rev 3205) +++ trunk/spambayes/utilities/split.py 2008-11-25 03:29:21 UTC (rev 3206) @@ -28,6 +28,7 @@ import sys import random import mailbox +import email import getopt from spambayes import mboxutils Modified: trunk/spambayes/utilities/splitn.py =================================================================== --- trunk/spambayes/utilities/splitn.py 2008-11-25 02:11:59 UTC (rev 3205) +++ trunk/spambayes/utilities/splitn.py 2008-11-25 03:29:21 UTC (rev 3206) @@ -42,6 +42,7 @@ import sys import random import mailbox +import email import getopt from spambayes import mboxutils Modified: trunk/spambayes/utilities/splitndirs.py =================================================================== --- trunk/spambayes/utilities/splitndirs.py 2008-11-25 02:11:59 UTC (rev 3205) +++ trunk/spambayes/utilities/splitndirs.py 2008-11-25 03:29:21 UTC (rev 3206) @@ -47,6 +47,8 @@ import sys import os import random +import mailbox +import email import getopt import glob Modified: trunk/spambayes/windows/autoconfigure.py =================================================================== --- trunk/spambayes/windows/autoconfigure.py 2008-11-25 02:11:59 UTC (rev 3205) +++ trunk/spambayes/windows/autoconfigure.py 2008-11-25 03:29:21 UTC (rev 3206) @@ -496,6 +496,7 @@ results = [] for filename in os.listdir(config_location): if filename.lower().startswith("pop") or filename.lower().startswith("smt"): + full_filename = os.path.join(config_location, filename) working_filename = "%s.tmp" % (filename, ) shutil.copyfile(filename, working_filename) c = OptionsClass.OptionsClass() Modified: trunk/spambayes/windows/pop3proxy_service.py =================================================================== --- trunk/spambayes/windows/pop3proxy_service.py 2008-11-25 02:11:59 UTC (rev 3205) +++ trunk/spambayes/windows/pop3proxy_service.py 2008-11-25 03:29:21 UTC (rev 
3206) @@ -138,7 +138,7 @@ try: # module imported by service manager, or 2.3 (in which __main__ # exists, *and* sys.argv[0] is always already absolute) - this_filename = __file__ + this_filename=__file__ except NameError: this_filename = sys.argv[0] if not os.path.isabs(sys.argv[0]): This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. From montanaro at users.sourceforge.net Tue Nov 25 05:09:59 2008 From: montanaro at users.sourceforge.net (montanaro at users.sourceforge.net) Date: Tue, 25 Nov 2008 04:09:59 +0000 Subject: [Spambayes-checkins] SF.net SVN: spambayes:[3215] trunk/spambayes/spambayes/storage.py Message-ID: Revision: 3215 http://spambayes.svn.sourceforge.net/spambayes/?rev=3215&view=rev Author: montanaro Date: 2008-11-25 04:09:59 +0000 (Tue, 25 Nov 2008) Log Message: ----------- use safepickle functions pylint nits Modified Paths: -------------- trunk/spambayes/spambayes/storage.py Modified: trunk/spambayes/spambayes/storage.py =================================================================== --- trunk/spambayes/spambayes/storage.py 2008-11-25 04:01:34 UTC (rev 3214) +++ trunk/spambayes/spambayes/storage.py 2008-11-25 04:09:59 UTC (rev 3215) @@ -50,8 +50,8 @@ ### situations prints to sys.stdout will garble the message (e.g., in ### hammiefilter). -__author__ = "Neale Pickett , \ -Tim Stone " +__author__ = ("Neale Pickett ," + "Tim Stone ") __credits__ = "All the spambayes contributors." try: @@ -69,11 +69,11 @@ import tempfile from spambayes import classifier from spambayes.Options import options, get_pathname_option -import cPickle as pickle import errno import shelve from spambayes import cdb from spambayes import dbmstorage +from spambayes.safepickle import pickle_write # Make shelve use binary pickles by default. 
oldShelvePickler = shelve.Pickler @@ -85,36 +85,6 @@ NO_UPDATEPROBS = False # Probabilities will not be autoupdated with training UPDATEPROBS = True # Probabilities will be autoupdated with training -def safe_pickle(filename, value, protocol=0): - '''Store value as a pickle without creating corruption''' - - # Be as defensive as possible. Always keep a safe copy. - tmp = filename + '.tmp' - fp = None - try: - fp = open(tmp, 'wb') - pickle.dump(value, fp, protocol) - fp.close() - except IOError, e: - if options["globals", "verbose"]: - print >> sys.stderr, 'Failed update: ' + str(e) - if fp is not None: - os.remove(tmp) - raise - try: - # With *nix we can just rename, and (as long as permissions - # are correct) the old file will vanish. With win32, this - # won't work - the Python help says that there may not be - # a way to do an atomic replace, so we rename the old one, - # put the new one there, and then delete the old one. If - # something goes wrong, there is at least a copy of the old - # one. - os.rename(tmp, filename) - except OSError: - os.rename(filename, filename + '.bak') - os.rename(tmp, filename) - os.remove(filename + '.bak') - class PickledClassifier(classifier.Classifier): '''Classifier object persisted in a pickle''' @@ -136,16 +106,12 @@ # tempbayes object is reclaimed when load() returns. if options["globals", "verbose"]: - print >> sys.stderr, 'Loading state from',self.db_name,'pickle' + print >> sys.stderr, 'Loading state from', self.db_name, 'pickle' - tempbayes = None try: - fp = open(self.db_name, 'rb') - except IOError, e: - if e.errno != errno.ENOENT: raise - else: - tempbayes = pickle.load(fp) - fp.close() + tempbayes = pickle_read(self.db_name) + except: + tempbayes = None if tempbayes: # Copy state from tempbayes. 
The use of our base-class @@ -169,9 +135,9 @@ '''Store self as a pickle''' if options["globals", "verbose"]: - print >> sys.stderr, 'Persisting',self.db_name,'as a pickle' + print >> sys.stderr, 'Persisting', self.db_name, 'as a pickle' - safe_pickle(self.db_name, self, PICKLE_TYPE) + pickle_write(self.db_name, self, PICKLE_TYPE) def close(self): # we keep no resources open - nothing to do @@ -198,7 +164,8 @@ def close(self): # Close our underlying database. Better not assume all databases # have close functions! - def noop(): pass + def noop(): + pass getattr(self.db, "close", noop)() getattr(self.dbm, "close", noop)() # should not be a need to drop the 'dbm' or 'db' attributes. @@ -210,13 +177,13 @@ if hasattr(self, "dbm"): del self.dbm if options["globals", "verbose"]: - print >> sys.stderr, 'Closed',self.db_name,'database' + print >> sys.stderr, 'Closed', self.db_name, 'database' def load(self): '''Load state from database''' if options["globals", "verbose"]: - print >> sys.stderr, 'Loading state from',self.db_name,'database' + print >> sys.stderr, 'Loading state from', self.db_name, 'database' self.dbm = dbmstorage.open(self.db_name, self.mode) self.db = shelve.Shelf(self.dbm) @@ -244,7 +211,8 @@ '''Place state into persistent store''' if options["globals", "verbose"]: - print >> sys.stderr, 'Persisting',self.db_name,'state in database' + print >> sys.stderr, 'Persisting', self.db_name, + print >> sys.stderr, 'state in database' # Iterate over our changed word list. 
# This is *not* thread-safe - another thread changing our @@ -471,7 +439,7 @@ def fetchall(self, c): return c.dictfetchall() - def commit(self, c): + def commit(self, _c): self.db.commit() def load(self): @@ -480,7 +448,7 @@ import psycopg if options["globals", "verbose"]: - print >> sys.stderr, 'Loading state from',self.db_name,'database' + print >> sys.stderr, 'Loading state from', self.db_name, 'database' self.db = psycopg.connect('dbname=' + self.db_name) @@ -545,7 +513,7 @@ def fetchall(self, c): return c.fetchall() - def commit(self, c): + def commit(self, _c): self.db.commit() def load(self): @@ -554,7 +522,7 @@ import MySQLdb if options["globals", "verbose"]: - print >> sys.stderr, 'Loading state from',self.db_name,'database' + print >> sys.stderr, 'Loading state from', self.db_name, 'database' params = { 'host': self.host, 'db': self.db_name, @@ -724,12 +692,11 @@ object.__setattr__(self, att, value) def create_storage(self): - import ZODB from ZODB.FileStorage import FileStorage try: self.storage = FileStorage(self.db_filename, read_only=self.mode=='r') - except IOError, msg: + except IOError: print >> sys.stderr, ("Could not create FileStorage from", self.db_filename) raise @@ -768,7 +735,6 @@ def store(self): '''Place state into persistent store''' try: - import ZODB import ZODB.Transaction except ImportError: import transaction @@ -971,7 +937,7 @@ '''Untrain the database with the message''' if options["globals", "verbose"]: - print >> sys.stderr, 'untraining with',message.key() + print >> sys.stderr, 'untraining with', message.key() self.bayes.unlearn(message.tokenize(), self.is_spam) # self.updateprobs) @@ -1005,6 +971,7 @@ class NoSuchClassifierError(Exception): def __init__(self, invalid_name): + Exception.__init__(self, invalid_name) self.invalid_name = invalid_name def __str__(self): return repr(self.invalid_name) @@ -1088,7 +1055,7 @@ try: unused, unused, is_path = _storage_types[typ] except KeyError: - raise NoSuchClassifierError(db_type) + 
raise NoSuchClassifierError(typ) if is_path: nm = get_pathname_option(*default_name) else: @@ -1142,7 +1109,7 @@ try: os.mkdir(dirname) if options["globals", "verbose"]: - print >>sys.stderr, "Creating directory", dirname + print >> sys.stderr, "Creating directory", dirname except OSError, e: if e.errno != errno.EEXIST: raise This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. From montanaro at users.sourceforge.net Tue Nov 25 05:13:02 2008 From: montanaro at users.sourceforge.net (montanaro at users.sourceforge.net) Date: Tue, 25 Nov 2008 04:13:02 +0000 Subject: [Spambayes-checkins] SF.net SVN: spambayes:[3216] trunk/spambayes/spambayes/Version.py Message-ID: Revision: 3216 http://spambayes.svn.sourceforge.net/spambayes/?rev=3216&view=rev Author: montanaro Date: 2008-11-25 04:13:01 +0000 (Tue, 25 Nov 2008) Log Message: ----------- pylint nits Modified Paths: -------------- trunk/spambayes/spambayes/Version.py Modified: trunk/spambayes/spambayes/Version.py =================================================================== --- trunk/spambayes/spambayes/Version.py 2008-11-25 04:09:59 UTC (rev 3215) +++ trunk/spambayes/spambayes/Version.py 2008-11-25 04:13:01 UTC (rev 3216) @@ -10,8 +10,8 @@ will generate the "ConfigParser" version for the web. """ -import string, re -from types import StringType +import sys +import re try: _ @@ -22,8 +22,8 @@ # A reason for why the spambayes.org URL fails is given in a comment there. #LATEST_VERSION_HOME="http://www.spambayes.org/download/Version.cfg" # The SF URL instead works for Tim and xenogeist. 
-LATEST_VERSION_HOME="http://spambayes.sourceforge.net/download/Version.cfg" -DEFAULT_DOWNLOAD_PAGE="http://spambayes.sourceforge.net/windows.html" +LATEST_VERSION_HOME = "http://spambayes.sourceforge.net/download/Version.cfg" +DEFAULT_DOWNLOAD_PAGE = "http://spambayes.sourceforge.net/windows.html" # This module is part of the spambayes project, which is Copyright 2002-2007 # The Python Software Foundation and is covered by the Python Software @@ -66,7 +66,7 @@ # and massage it into a string format that will compare properly # in update checks. try: - ver_num = float(version) + float(version) # Version converted successfully to a float, which means it # may be an old-format version number. Old convention was to # use 1.01 to represent "1.0.1", so check to see if there is @@ -86,7 +86,8 @@ def get_download_page(app = None, version_dict = None): - if version_dict is None: version_dict = versions + if version_dict is None: + version_dict = versions dict = version_dict # default to top level dictionary if app is not None: # attempt to get a sub-dict for the specific app @@ -185,21 +186,21 @@ releaselevel = "final" serial = 0 else: - serial = string.atoi(prerelease_num) + serial = int(prerelease_num) if prerelease == "a": releaselevel = "alpha" elif prerelease == "b": releaselevel = "beta" elif prerelease == "rc": releaselevel = "candidate" - self.version_info = tuple(map(string.atoi, [major, minor, patch]) + \ + self.version_info = tuple(map(int, [major, minor, patch]) + \ [releaselevel, serial]) def __str__(self): if self.version_info[2] == 0: - vstring = string.join(map(str, self.version_info[0:2]), '.') + vstring = '.'.join(map(str, self.version_info[0:2])) else: - vstring = string.join(map(str, self.version_info[0:3]), '.') + vstring = '.'.join(map(str, self.version_info[0:3])) releaselevel = self.version_info[3][0] if releaselevel != 'f': @@ -214,13 +215,14 @@ return vstring def __cmp__(self, other): - if isinstance(other, StringType): + if isinstance(other, str): 
other = SBVersion(other) return cmp(self.version_info, other.version_info) def get_long_version(self, app_name = None): - if app_name is None: app_name = "SpamBayes" + if app_name is None: + app_name = "SpamBayes" return _("%s Version %s (%s)") % (app_name, str(self), self.date) #============================================================================ @@ -268,7 +270,7 @@ ret_dict = {} apps_dict = ret_dict["Apps"] = {} for sect in cfg.sections(): - if sect=="SpamBayes": + if sect == "SpamBayes": target_dict = ret_dict else: target_dict = apps_dict.setdefault(sect, {}) @@ -348,7 +350,6 @@ _make_compatible_cfg_section(stream, appname, ver, versions["Apps"][appname]) def main(args): - import sys if '-g' in args: make_cfg(sys.stdout) sys.exit(0) @@ -370,6 +371,5 @@ print print "Latest version:", v_latest.get_long_version() -if __name__=='__main__': - import sys +if __name__ == '__main__': main(sys.argv) This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. 
From montanaro at users.sourceforge.net Tue Nov 25 05:13:53 2008 From: montanaro at users.sourceforge.net (montanaro at users.sourceforge.net) Date: Tue, 25 Nov 2008 04:13:53 +0000 Subject: [Spambayes-checkins] SF.net SVN: spambayes:[3217] trunk/spambayes/spambayes/ImapUI.py Message-ID: Revision: 3217 http://spambayes.svn.sourceforge.net/spambayes/?rev=3217&view=rev Author: montanaro Date: 2008-11-25 04:13:53 +0000 (Tue, 25 Nov 2008) Log Message: ----------- abs import (pylint) Modified Paths: -------------- trunk/spambayes/spambayes/ImapUI.py Modified: trunk/spambayes/spambayes/ImapUI.py =================================================================== --- trunk/spambayes/spambayes/ImapUI.py 2008-11-25 04:13:01 UTC (rev 3216) +++ trunk/spambayes/spambayes/ImapUI.py 2008-11-25 04:13:53 UTC (rev 3217) @@ -43,7 +43,7 @@ import cgi -import UserInterface +from spambayes import UserInterface from spambayes.Options import options, optionsPathname, _ # These are the options that will be offered on the configuration page. This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. 
From montanaro at users.sourceforge.net Tue Nov 25 05:14:57 2008 From: montanaro at users.sourceforge.net (montanaro at users.sourceforge.net) Date: Tue, 25 Nov 2008 04:14:57 +0000 Subject: [Spambayes-checkins] SF.net SVN: spambayes:[3218] trunk/spambayes/spambayes/optimize.py Message-ID: Revision: 3218 http://spambayes.svn.sourceforge.net/spambayes/?rev=3218&view=rev Author: montanaro Date: 2008-11-25 04:14:57 +0000 (Tue, 25 Nov 2008) Log Message: ----------- Note obsoleteness of Numeric Modified Paths: -------------- trunk/spambayes/spambayes/optimize.py Modified: trunk/spambayes/spambayes/optimize.py =================================================================== --- trunk/spambayes/spambayes/optimize.py 2008-11-25 04:13:53 UTC (rev 3217) +++ trunk/spambayes/spambayes/optimize.py 2008-11-25 04:14:57 UTC (rev 3218) @@ -4,6 +4,8 @@ # Optimize any parametric function. # import copy + +# XXX Numeric is obsolete. Replace with numpy. import Numeric def SimplexMaximize(var, err, func, convcrit = 0.001, minerr = 0.001): @@ -30,7 +32,7 @@ if abs(value[bi] - value[wi]) <= convcrit: return simplex[bi] # Calculate average of non-worst - ave=Numeric.zeros(len(var), 'd') + ave = Numeric.zeros(len(var), 'd') for i in range(len(simplex)): if i != wi: ave = ave + simplex[i] This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. 
From montanaro at users.sourceforge.net Tue Nov 25 05:17:12 2008 From: montanaro at users.sourceforge.net (montanaro at users.sourceforge.net) Date: Tue, 25 Nov 2008 04:17:12 +0000 Subject: [Spambayes-checkins] SF.net SVN: spambayes:[3219] trunk/spambayes/spambayes/chi2.py Message-ID: Revision: 3219 http://spambayes.svn.sourceforge.net/spambayes/?rev=3219&view=rev Author: montanaro Date: 2008-11-25 04:17:12 +0000 (Tue, 25 Nov 2008) Log Message: ----------- pylint nits Modified Paths: -------------- trunk/spambayes/spambayes/chi2.py Modified: trunk/spambayes/spambayes/chi2.py =================================================================== --- trunk/spambayes/spambayes/chi2.py 2008-11-25 04:14:57 UTC (rev 3218) +++ trunk/spambayes/spambayes/chi2.py 2008-11-25 04:17:12 UTC (rev 3219) @@ -1,4 +1,5 @@ import math as _math +import random try: True, False @@ -106,7 +107,7 @@ def __init__(self, baserandom=random.random, tabsize=513): self.baserandom = baserandom self.n = tabsize - self.tab = [baserandom() for i in range(tabsize)] + self.tab = [baserandom() for _i in range(tabsize)] self.next = baserandom() def random(self): @@ -150,8 +151,8 @@ s = Hist(20, lo=0.0, hi=1.0) score = Hist(20, lo=0.0, hi=1.0) - for i in range(5000): - ps = [random() for j in range(50)] + for _i in xrange(5000): + ps = [random() for _j in xrange(50)] s1, h1, score1 = judge(ps + [bias] * warp) s.add(s1) h.add(h1) @@ -203,5 +204,4 @@ print "(S-H+1)/2", score if __name__ == '__main__': - import random main() This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. 
From montanaro at users.sourceforge.net Tue Nov 25 16:33:35 2008 From: montanaro at users.sourceforge.net (montanaro at users.sourceforge.net) Date: Tue, 25 Nov 2008 15:33:35 +0000 Subject: [Spambayes-checkins] SF.net SVN: spambayes:[3220] trunk/spambayes/spambayes/tokenizer.py Message-ID: Revision: 3220 http://spambayes.svn.sourceforge.net/spambayes/?rev=3220&view=rev Author: montanaro Date: 2008-11-25 15:33:35 +0000 (Tue, 25 Nov 2008) Log Message: ----------- Mine NNTP-Posting-Host headers. This is part of an effort to put some SpamBayes smarts into the Mailman gate_news program. Anecdotal evidence on comp.lang.python suggests that certain posting hosts (I won't name any names, but the one mentioned heavily starts with a 'g' and has two 'o's in the middle) are more prone to let spam leak into the system. More testing is yet to be done, but this seems to be promising. Modified Paths: -------------- trunk/spambayes/spambayes/tokenizer.py Modified: trunk/spambayes/spambayes/tokenizer.py =================================================================== --- trunk/spambayes/spambayes/tokenizer.py 2008-11-25 04:17:12 UTC (rev 3219) +++ trunk/spambayes/spambayes/tokenizer.py 2008-11-25 15:33:35 UTC (rev 3220) @@ -39,7 +39,7 @@ try: - import dnscache + from spambayes import dnscache cache = dnscache.cache(cachefile=options["Tokenizer", "lookup_ip_cache"]) cache.printStatsAtEnd = False except (IOError, ImportError): @@ -681,6 +681,8 @@ # by m19.grp.scd.yahoo.com with QMQP; 19 Dec 2003 04:06:53 -0000 received_ip_re = re.compile(r'[[(]((\d{1,3}\.?){4})[])]') +received_nntp_ip_re = re.compile(r'((\d{1,3}\.?){4})') + message_id_re = re.compile(r'\s*<[^@]+@([^>]+)>\s*') # I'm usually just splitting on whitespace, but for subject lines I want to @@ -1084,19 +1086,12 @@ scheme, netloc, path, params, query, frag = urlparse.urlparse(url) if cache is not None and options["Tokenizer", "x-lookup_ip"]: - ips=cache.lookup(netloc) + ips = cache.lookup(netloc) if not ips: 
pushclue("url-ip:lookup error") else: - for ip in ips: # Should we limit to one A record? - pushclue("url-ip:%s/32" % ip) - dottedQuadList=ip.split(".") - pushclue("url-ip:%s/8" % dottedQuadList[0]) - pushclue("url-ip:%s.%s/16" % (dottedQuadList[0], - dottedQuadList[1])) - pushclue("url-ip:%s.%s.%s/24" % (dottedQuadList[0], - dottedQuadList[1], - dottedQuadList[2])) + for clue in gen_dotted_quad_clues("url-ip", ips): + pushclue(clue) # one common technique in bogus "please (re-)authorize yourself" # scams is to make it appear as if you're visiting a valid @@ -1526,6 +1521,13 @@ for tok in breakdown(m.group(1)): yield 'received:' + tok + # Lots of spam gets posted on Usenet. If it is then gatewayed to a + # mailing list perhaps the NNTP-Posting-Host info will yield some + # useful clues. + if options["Tokenizer", "x-mine_nntp_headers"]: + for clue in mine_nntp(msg): + yield clue + # Message-Id: This seems to be a small win and should not # adversely affect a mixed source corpus so it's always enabled. msgid = msg.get("message-id", "") @@ -1698,5 +1700,52 @@ for t in self.tokenize_text(text): yield t +# Mine NNTP-Posting-Host headers. This is part of an effort to put some +# SpamBayes smarts into the Mailman gate_news program. On mail.python.org +# messages arriving via Usenet bypass all the barriers the Python +# postmasters have erected against mail-borne spam, including not running +# them through SpamBayes. + +# Anecdotal evidence on comp.lang.python suggests that certain posting hosts +# (I won't name any names, but the one mentioned heavily starts with a +# 'g'and has two 'o's in the middle) are more prone to let spam leak into +# Usenet. My initial testing (also hardly more than anecdotal) suggests +# there are useful clues awaiting extractiotn from this header. 
+def mine_nntp(msg): + nntp_headers = msg.get_all("nntp-posting-host", ()) + for address in nntp_headers: + if received_nntp_ip_re.match(address): + for clue in gen_dotted_quad_clues("nntp-host", [address]): + yield clue + names = cache.lookup(address) + if names: + yield 'nntp-host-ip:has-reverse' + yield 'nntp-host-name:%s' % names[0] + yield ('nntp-host-domain:%s' % + '.'.join(names[0].split('.')[-2:])) + else: + # assume it's a hostname + name = address + yield 'nntp-host-name:%s' % name + yield ('nntp-host-domain:%s' % + '.'.join(name.split('.')[-2:])) + addresses = cache.lookup(name) + if addresses: + for clue in gen_dotted_quad_clues("nntp-host-ip", addresses): + yield clue + if cache.lookup(addresses[0], qType="PTR") == name: + yield 'nntp-host-ip:has-reverse' + +def gen_dotted_quad_clues(pfx, ips): + for ip in ips: + yield "%s:%s/32" % (pfx, ip) + dottedQuadList = ip.split(".") + yield "%s:%s/8" % (pfx, dottedQuadList[0]) + yield "%s:%s.%s/16" % (pfx, dottedQuadList[0], + dottedQuadList[1]) + yield "%s:%s.%s.%s/24" % (pfx, dottedQuadList[0], + dottedQuadList[1], + dottedQuadList[2]) + global_tokenizer = Tokenizer() tokenize = global_tokenizer.tokenize This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. From montanaro at users.sourceforge.net Tue Nov 25 16:34:18 2008 From: montanaro at users.sourceforge.net (montanaro at users.sourceforge.net) Date: Tue, 25 Nov 2008 15:34:18 +0000 Subject: [Spambayes-checkins] SF.net SVN: spambayes:[3221] trunk/spambayes/spambayes/Options.py Message-ID: Revision: 3221 http://spambayes.svn.sourceforge.net/spambayes/?rev=3221&view=rev Author: montanaro Date: 2008-11-25 15:34:18 +0000 (Tue, 25 Nov 2008) Log Message: ----------- Add support for x-mine_nntp_headers. 
Modified Paths: -------------- trunk/spambayes/spambayes/Options.py Modified: trunk/spambayes/spambayes/Options.py =================================================================== --- trunk/spambayes/spambayes/Options.py 2008-11-25 15:33:35 UTC (rev 3220) +++ trunk/spambayes/spambayes/Options.py 2008-11-25 15:34:18 UTC (rev 3221) @@ -33,7 +33,7 @@ __all__ = ['options', '_'] # Grab the stuff from the core options class. -from OptionsClass import * +from spambayes.OptionsClass import * # A little magic. We'd like to use ZODB as the default storage, # because we've had so many problems with bsddb, and we'd like to swap @@ -199,6 +199,12 @@ reasons if your corpora are from different sources."""), BOOLEAN, RESTORE), + ("x-mine_nntp_headers", _("Mine NNTP-Posting-Host headers"), False, + _("""Usenet is host to a lot of spam. Usenet/Mailing list gateways + can let it leak across. Similar to mining received headers, we pick + apart the IP address or host name in this header for clues."""), + BOOLEAN, RESTORE), + ("address_headers", _("Address headers to mine"), ("from", "to", "cc", "sender", "reply-to"), _("""Mine the following address headers. If you have mixed source @@ -603,8 +609,8 @@ ("persistent_use_database", _("Database backend"), DB_TYPE[0], _("""SpamBayes can use either a ZODB or dbm database (quick to score one message) or a pickle (quick to train on huge amounts of messages). 
- There is also (currently experimental) the ability to use a mySQL or - PostgrepSQL database."""), + There is also (experimental) ability to use a mySQL or PostgresSQL + database."""), ("zeo", "zodb", "cdb", "mysql", "pgsql", "dbm", "pickle"), RESTORE), ("persistent_storage_file", _("Storage file name"), DB_TYPE[1], @@ -1360,7 +1366,7 @@ # in the current directory, and no win32 extensions installed # to locate the "user" directory - seeing things are so lamely # setup, it is worth printing a warning - print >>sys.stderr, "NOTE: We can not locate an INI file " \ + print >> sys.stderr, "NOTE: We can not locate an INI file " \ "for SpamBayes, and the Python for Windows extensions " \ "are not installed, meaning we can't locate your " \ "'user' directory. An empty configuration file at " \ This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. From montanaro at users.sourceforge.net Fri Nov 28 16:45:44 2008 From: montanaro at users.sourceforge.net (montanaro at users.sourceforge.net) Date: Fri, 28 Nov 2008 15:45:44 +0000 Subject: [Spambayes-checkins] SF.net SVN: spambayes:[3222] trunk/spambayes/utilities/splitndirs.py Message-ID: Revision: 3222 http://spambayes.svn.sourceforge.net/spambayes/?rev=3222&view=rev Author: montanaro Date: 2008-11-28 15:45:43 +0000 (Fri, 28 Nov 2008) Log Message: ----------- Add -d flag (skip duplicate messages). Modified Paths: -------------- trunk/spambayes/utilities/splitndirs.py Modified: trunk/spambayes/utilities/splitndirs.py =================================================================== --- trunk/spambayes/utilities/splitndirs.py 2008-11-25 15:34:18 UTC (rev 3221) +++ trunk/spambayes/utilities/splitndirs.py 2008-11-28 15:45:43 UTC (rev 3222) @@ -24,6 +24,8 @@ -n N The number of output mboxes desired. This is required. + -d Eliminate duplicates. + Arguments: sourcembox The mbox or path to an mbox to split. 
@@ -49,6 +51,10 @@ import random import getopt import glob +try: + from hashlib import md5 +except ImportError: + from md5 import new as md5 from spambayes import mboxutils @@ -69,13 +75,14 @@ def main(): try: - opts, args = getopt.getopt(sys.argv[1:], 'hgn:s:v', ['help']) + opts, args = getopt.getopt(sys.argv[1:], 'dhgn:s:v', ['help']) except getopt.error, msg: usage(1, msg) doglob = False n = None verbose = False + delete_dups = False for opt, arg in opts: if opt in ('-h', '--help'): usage(0) @@ -87,6 +94,8 @@ n = int(arg) elif opt == '-v': verbose = True + elif opt == '-d': + delete_dups = True if n is None or n <= 1: usage(1, "an -n value > 1 is required") @@ -101,6 +110,8 @@ os.makedirs(dir) counter = 0 + cksums = set() + skipped = 0 for inputpath in inputpaths: if doglob: inpaths = glob.glob(inputpath) @@ -110,8 +121,13 @@ for inpath in inpaths: mbox = mboxutils.getmbox(inpath) for msg in mbox: + astext = str(msg) + cksum = md5(astext).hexdigest() + if delete_dups and cksum in cksums: + skipped += 1 + continue + cksums.add(cksum) i = random.randrange(n) - astext = str(msg) #assert astext.endswith('\n') counter += 1 msgfile = open('%s/%d' % (outdirs[i], counter), 'wb') @@ -125,6 +141,8 @@ if verbose: print print counter, "messages split into", n, "directories" + if skipped: + print "skipped", skipped, "duplicate messages" if __name__ == '__main__': main() This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. 
From montanaro at users.sourceforge.net Fri Nov 28 16:47:27 2008 From: montanaro at users.sourceforge.net (montanaro at users.sourceforge.net) Date: Fri, 28 Nov 2008 15:47:27 +0000 Subject: [Spambayes-checkins] SF.net SVN: spambayes:[3223] trunk/spambayes/setup.py Message-ID: Revision: 3223 http://spambayes.svn.sourceforge.net/spambayes/?rev=3223&view=rev Author: montanaro Date: 2008-11-28 15:47:22 +0000 (Fri, 28 Nov 2008) Log Message: ----------- Require lockfile and pydns, both available from PyPI. Modified Paths: -------------- trunk/spambayes/setup.py Modified: trunk/spambayes/setup.py =================================================================== --- trunk/spambayes/setup.py 2008-11-28 15:45:43 UTC (rev 3222) +++ trunk/spambayes/setup.py 2008-11-28 15:47:22 UTC (rev 3223) @@ -1,8 +1,10 @@ #!/usr/bin/env python import os +import sys -import sys +from setuptools import setup, find_packages + if sys.version < '2.2': print "Error: Python version too old. You need at least Python 2.2 to use this package." print "(you're running version %s)"%sys.version @@ -121,6 +123,8 @@ author = "the spambayes project", author_email = "spambayes at python.org", url = "http://spambayes.sourceforge.net", + install_requires = ["lockfile>=0.2", + "pydns>=2.0"], cmdclass = {'install_scripts': install_scripts, 'sdist': sdist, }, This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.