From montanaro at users.sourceforge.net Sat Dec 2 23:09:28 2006 From: montanaro at users.sourceforge.net (Skip Montanaro) Date: Sat, 02 Dec 2006 14:09:28 -0800 Subject: [Spambayes-checkins] spambayes/spambayes ImageStripper.py, 1.10, 1.11 Message-ID: <20061202220931.233D11E4010@bag.python.org> Update of /cvsroot/spambayes/spambayes/spambayes In directory sc8-pr-cvs8.sourceforge.net:/tmp/cvs-serv7317 Modified Files: ImageStripper.py Log Message: Generate token when no text is detected. Index: ImageStripper.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/spambayes/ImageStripper.py,v retrieving revision 1.10 retrieving revision 1.11 diff -C2 -d -r1.10 -r1.11 *** ImageStripper.py 6 Nov 2006 14:50:30 -0000 1.10 --- ImageStripper.py 2 Dec 2006 22:09:25 -0000 1.11 *************** *** 192,198 **** ocr.close() ctokens = set() ! nlines = len(ctext.strip().split("\n")) ! if nlines: ! ctokens.add("image-text-lines:%d" % int(log2(nlines))) self.cache[fhash] = (ctext, ctokens) textbits.append(ctext) --- 192,204 ---- ocr.close() ctokens = set() ! if not ctext.strip(): ! # Lots of spam now contains images in which it is ! # difficult or impossible (using ocrad) to find any ! # text. Make a note of that. ! ctokens.add("image-text:no text found") ! else: ! nlines = len(ctext.strip().split("\n")) ! if nlines: ! ctokens.add("image-text-lines:%d" % int(log2(nlines))) self.cache[fhash] = (ctext, ctokens) textbits.append(ctext) From montanaro at users.sourceforge.net Wed Dec 13 15:44:51 2006 From: montanaro at users.sourceforge.net (Skip Montanaro) Date: Wed, 13 Dec 2006 06:44:51 -0800 Subject: [Spambayes-checkins] spambayes/spambayes tokenizer.py,1.45,1.46 Message-ID: <20061213144454.B8B031E4006@bag.python.org> Update of /cvsroot/spambayes/spambayes/spambayes In directory sc8-pr-cvs8.sourceforge.net:/tmp/cvs-serv913 Modified Files: tokenizer.py Log Message: No need to print stats after every run at this point. 
Minor tweak for lookup failures. Index: tokenizer.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/spambayes/tokenizer.py,v retrieving revision 1.45 retrieving revision 1.46 diff -C2 -d -r1.45 -r1.46 *** tokenizer.py 10 Aug 2006 04:07:59 -0000 1.45 --- tokenizer.py 13 Dec 2006 14:44:49 -0000 1.46 *************** *** 43,47 **** import dnscache cache = dnscache.cache(cachefile=options["Tokenizer", "lookup_ip_cache"]) ! cache.printStatsAtEnd = True except (IOError, ImportError): cache = None --- 43,47 ---- import dnscache cache = dnscache.cache(cachefile=options["Tokenizer", "lookup_ip_cache"]) ! cache.printStatsAtEnd = False except (IOError, ImportError): cache = None *************** *** 1087,1092 **** if cache is not None and options["Tokenizer", "x-lookup_ip"]: ips=cache.lookup(netloc) ! if len(ips)==0: ! pushclue("url-ip:timeout") else: for ip in ips: # Should we limit to one A record? --- 1087,1092 ---- if cache is not None and options["Tokenizer", "x-lookup_ip"]: ips=cache.lookup(netloc) ! if not ips: ! pushclue("url-ip:lookup error") else: for ip in ips: # Should we limit to one A record? From kpitt at users.sourceforge.net Wed Dec 20 23:52:35 2006 From: kpitt at users.sourceforge.net (Kenny Pitt) Date: Wed, 20 Dec 2006 14:52:35 -0800 Subject: [Spambayes-checkins] spambayes/spambayes Stats.py,1.18,1.19 Message-ID: <20061220225237.B2A4D1E400B@bag.python.org> Update of /cvsroot/spambayes/spambayes/spambayes In directory sc8-pr-cvs8.sourceforge.net:/tmp/cvs-serv28508/spambayes Modified Files: Stats.py Log Message: Delay calculation of persistent stats until they are needed for a GetStats() call. 
Index: Stats.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/spambayes/Stats.py,v retrieving revision 1.18 retrieving revision 1.19 diff -C2 -d -r1.18 -r1.19 *** Stats.py 28 Nov 2005 10:50:56 -0000 1.18 --- Stats.py 20 Dec 2006 22:51:18 -0000 1.19 *************** *** 57,63 **** # Reset session stats. self.Reset() ! # Load persistent stats. self.from_date = self.messageinfo_db.get_statistics_start_date() ! self.CalculatePersistentStats() def Reset(self): --- 57,63 ---- # Reset session stats. self.Reset() ! # Initialize persistent stats. self.from_date = self.messageinfo_db.get_statistics_start_date() ! self.persistentStatsCalculated = False def Reset(self): *************** *** 168,171 **** --- 168,181 ---- totals["num_trained_spam"] += 1 + # If we have already accumulated any session statistics then we need + # to subtract those from the totals to prevent double-counting. + totals["num_ham"] -= self.num_ham + totals["num_spam"] -= self.num_spam + totals["num_unsure"] -= self.num_unsure + totals["num_trained_ham"] -= self.num_trained_ham + totals["num_trained_ham_fp"] -= self.num_trained_ham_fp + totals["num_trained_spam"] -= self.num_trained_spam + totals["num_trained_spam_fn"] -= self.num_trained_spam_fn + def _CombineSessionAndTotal(self): totals = self.totals *************** *** 324,327 **** --- 334,341 ---- data["num_trained_spam_fn"] = self.num_trained_spam_fn else: + if not self.persistentStatsCalculated: + # Load persistent stats. 
+ self.CalculatePersistentStats() + self.persistentStatsCalculated = True data = self._CombineSessionAndTotal() From mhammond at users.sourceforge.net Thu Dec 21 02:36:27 2006 From: mhammond at users.sourceforge.net (Mark Hammond) Date: Wed, 20 Dec 2006 17:36:27 -0800 Subject: [Spambayes-checkins] website applications.ht, 1.31, 1.32 docs.ht, 1.20, 1.21 faq.txt, 1.91, 1.92 unix.ht, 1.11, 1.12 Message-ID: <20061221013629.F32DB1E4007@bag.python.org> Update of /cvsroot/spambayes/website In directory sc8-pr-cvs8.sourceforge.net:/tmp/cvs-serv29704 Modified Files: applications.ht docs.ht faq.txt unix.ht Log Message: Fix some broken links Index: applications.ht =================================================================== RCS file: /cvsroot/spambayes/website/applications.ht,v retrieving revision 1.31 retrieving revision 1.32 diff -C2 -d -r1.31 -r1.32 *** applications.ht 25 Nov 2004 06:36:21 -0000 1.31 --- applications.ht 21 Dec 2006 01:36:21 -0000 1.32 *************** *** 26,36 **** extensions (win32all-149 or later - currently ActivePython is not suitable) ! For more on this, see the README.txt or ! about.html file in the spambayes CVS repository's Outlook2000 directory.

Alternatively, you can use CVS to get the code - go to the CVS page on the project's sourceforge site for more.

sb_filter.py

sb_filter is a command line tool for marking mail as ham or spam. The readme ! includes a guide to integrating it with your mailer (Unix-only instructions at the moment - additions welcome!). --- 26,36 ---- extensions (win32all-149 or later - currently ActivePython is not suitable) ! For more on this, see the README.txt or ! about.html file in the spambayes CVS repository's Outlook2000 directory.

Alternatively, you can use CVS to get the code - go to the CVS page on the project's sourceforge site for more.

sb_filter.py

sb_filter is a command line tool for marking mail as ham or spam. The readme ! includes a guide to integrating it with your mailer (Unix-only instructions at the moment - additions welcome!). *************** *** 49,53 ****

sb_server.py

sb_server provides a POP3 proxy which sits between your mail client and your real POP3 server and marks ! mail as ham or spam as it passes through. See the README for more. sb_server can also be used with the sb_upload.py script as a procmail (or similar) solution.

--- 49,53 ----

sb_server.py

sb_server provides a POP3 proxy which sits between your mail client and your real POP3 server and marks ! mail as ham or spam as it passes through. See the README for more. sb_server can also be used with the sb_upload.py script as a procmail (or similar) solution.

*************** *** 69,73 ****

imapfilter connects to your imap server and marks mail as ham or spam, moving it to appropriate folders as it arrives. ! See the README for more.

Requirements

--- 69,73 ----

imapfilter connects to your imap server and marks mail as ham or spam, moving it to appropriate folders as it arrives. ! See the README for more.

Requirements

Index: docs.ht =================================================================== RCS file: /cvsroot/spambayes/website/docs.ht,v retrieving revision 1.20 retrieving revision 1.21 diff -C2 -d -r1.20 -r1.21 *** docs.ht 7 Aug 2006 22:22:28 -0000 1.20 --- docs.ht 21 Dec 2006 01:36:21 -0000 1.21 *************** *** 12,16 **** your fancy added here.
  • Instructions on installing Spambayes and integrating it into your mail system.
  • !
  • The Outlook plugin includes an "About" File, and a "Troubleshooting Guide" that can be accessed via the toolbar. (Note that the online documentation is always for the latest source version, and so might not correspond exactly with the version you are using. --- 12,16 ---- your fancy added here.
  • Instructions on installing Spambayes and integrating it into your mail system.
  • !
  • The Outlook plugin includes an "About" File, and a "Troubleshooting Guide" that can be accessed via the toolbar. (Note that the online documentation is always for the latest source version, and so might not correspond exactly with the version you are using. Index: faq.txt =================================================================== RCS file: /cvsroot/spambayes/website/faq.txt,v retrieving revision 1.91 retrieving revision 1.92 diff -C2 -d -r1.91 -r1.92 *** faq.txt 10 Sep 2005 00:30:39 -0000 1.91 --- faq.txt 21 Dec 2006 01:36:21 -0000 1.92 *************** *** 68,72 **** ease-of-use. ! .. _the PSF license: http://cvs.sourceforge.net/cgi-bin/viewcvs.cgi/*checkout*/spambayes/spambayes/LICENSE.txt .. _I'm not a programmer but still want to help: #i-m-not-a-programmer-but-want-to-help-out-what-can-i-do .. _Python Software Foundation: http://www.python.org/psf/ --- 68,72 ---- ease-of-use. ! .. _the PSF license: http://spambayes.cvs.sourceforge.net/*checkout*/spambayes/spambayes/LICENSE.txt .. _I'm not a programmer but still want to help: #i-m-not-a-programmer-but-want-to-help-out-what-can-i-do .. _Python Software Foundation: http://www.python.org/psf/ *************** *** 531,535 **** up-to-date help for working around known problems. ! .. _troubleshooting guide: http://cvs.sourceforge.net/cgi-bin/viewcvs.cgi/*checkout*/spambayes/spambayes/Outlook2000/docs/troubleshooting.html?rev=HEAD&content-type=text/html --- 531,535 ---- up-to-date help for working around known problems. ! .. _troubleshooting guide: http://spambayes.cvs.sourceforge.net/*checkout*/spambayes/spambayes/Outlook2000/docs/troubleshooting.html?content-type=text/html *************** *** 1343,1347 **** ! .. _which_database.py: http://cvs.sourceforge.net/cgi-bin/viewcvs.cgi/*checkout*/spambayes/spambayes/utilities/which_database.py?rev=HEAD&content-type=text/plain .. _mailing list: mailto:spambayes at python.org --- 1343,1347 ---- ! .. 
_which_database.py: http://spambayes.cvs.sourceforge.net/*checkout*/spambayes/spambayes/utilities/which_database.py?content-type=text/plain .. _mailing list: mailto:spambayes at python.org Index: unix.ht =================================================================== RCS file: /cvsroot/spambayes/website/unix.ht,v retrieving revision 1.11 retrieving revision 1.12 diff -C2 -d -r1.11 -r1.12 *** unix.ht 9 Aug 2004 06:21:15 -0000 1.11 --- unix.ht 21 Dec 2006 01:36:21 -0000 1.12 *************** *** 53,57 ****

    Additional details are available in the Hammie readme.

    --- 53,57 ----

    Additional details are available in the Hammie readme.

    *************** *** 209,213 ****

    See the Hammie readme for a detailed discussion of the many training options on Unix systems.

    --- 209,213 ----

    See the Hammie readme for a detailed discussion of the many training options on Unix systems.

    From mhammond at users.sourceforge.net Thu Dec 21 02:38:22 2006 From: mhammond at users.sourceforge.net (Mark Hammond) Date: Wed, 20 Dec 2006 17:38:22 -0800 Subject: [Spambayes-checkins] website windows.ht,1.46,1.47 Message-ID: <20061221013825.04ECF1E401E@bag.python.org> Update of /cvsroot/spambayes/website In directory sc8-pr-cvs8.sourceforge.net:/tmp/cvs-serv30785 Modified Files: windows.ht Log Message: More broken links Index: windows.ht =================================================================== RCS file: /cvsroot/spambayes/website/windows.ht,v retrieving revision 1.46 retrieving revision 1.47 diff -C2 -d -r1.46 -r1.47 *** windows.ht 10 Sep 2005 00:32:28 -0000 1.46 --- windows.ht 21 Dec 2006 01:38:20 -0000 1.47 *************** *** 36,40 **** report if your problem seems to be a new one. Please be sure to go through the troubleshooting.html file that is installed with the plugin.

    --- 36,40 ---- report if your problem seems to be a new one. Please be sure to go through the troubleshooting.html file that is installed with the plugin.

    *************** *** 51,55 ****
  • Mark Hammond's pywin32 extensions. Choose the version which corresponds to the version of Python you downloaded.
  • --- 51,55 ----
  • Mark Hammond's pywin32 extensions. Choose the version which corresponds to the version of Python you downloaded.
  • From kpitt at users.sourceforge.net Thu Dec 28 16:36:46 2006 From: kpitt at users.sourceforge.net (Kenny Pitt) Date: Thu, 28 Dec 2006 07:36:46 -0800 Subject: [Spambayes-checkins] spambayes/spambayes Stats.py, 1.19, 1.20 message.py, 1.75, 1.76 Message-ID: <20061228153649.211301E4014@bag.python.org> Update of /cvsroot/spambayes/spambayes/spambayes In directory sc8-pr-cvs8.sourceforge.net:/tmp/cvs-serv30845/spambayes Modified Files: Stats.py message.py Log Message: Recalculating persistent statistics from the message info db each session can be very time-consuming when the db gets large, so cache the persistent statistics in the message info db instead. If the message info db doesn't yet contain the persistent statistics then there will be a one-time startup hit to recalculate them, and after that startup will be significantly faster. Index: Stats.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/spambayes/Stats.py,v retrieving revision 1.19 retrieving revision 1.20 diff -C2 -d -r1.19 -r1.20 *** Stats.py 20 Dec 2006 22:51:18 -0000 1.19 --- Stats.py 28 Dec 2006 15:36:44 -0000 1.20 *************** *** 43,47 **** import types ! from spambayes.message import STATS_START_KEY from spambayes.message import database_type, open_storage, Message --- 43,47 ---- import types ! from spambayes.message import STATS_START_KEY, STATS_STORAGE_KEY from spambayes.message import database_type, open_storage, Message *************** *** 59,63 **** # Initialize persistent stats. self.from_date = self.messageinfo_db.get_statistics_start_date() ! self.persistentStatsCalculated = False def Reset(self): --- 59,63 ---- # Initialize persistent stats. self.from_date = self.messageinfo_db.get_statistics_start_date() ! 
self.LoadPersistentStats() def Reset(self): *************** *** 76,88 **** --- 76,96 ---- self.from_date = time.time() self.messageinfo_db.set_statistics_start_date(self.from_date) + self.messageinfo_db.set_persistent_statistics(self.totals) def RecordClassification(self, score): """Record that a message has been classified this session.""" + totals = self.totals if score >= self.options["Categorization", "spam_cutoff"]: self.num_spam += 1 + totals["num_spam"] += 1 elif score >= self.options["Categorization", "ham_cutoff"]: self.num_unsure += 1 + totals["num_unsure"] += 1 else: self.num_ham += 1 + totals["num_ham"] += 1 + # We have to record the updated totals every time or else the + # persistent statistics will get out of sync. + self.messageinfo_db.set_persistent_statistics(self.totals) def RecordTraining(self, as_ham, old_score=None, old_class=None): *************** *** 97,102 **** --- 105,112 ---- # XXX Why, oh why, does this function have as_ham, when every # XXX other function has isSpam??? + totals = self.totals if as_ham: self.num_trained_ham += 1 + totals["num_trained_ham"] += 1 # If we are recovering an item that is in the "spam" threshold, # then record it as a "false positive" *************** *** 104,111 **** --- 114,124 ---- old_score > self.options["Categorization", "spam_cutoff"]: self.num_trained_ham_fp += 1 + totals["num_trained_ham_fp"] += 1 elif old_class == self.options["Headers", "header_spam_string"]: self.num_trained_ham_fp += 1 + totals["num_trained_ham_fp"] += 1 else: self.num_trained_spam += 1 + totals["num_trained_spam"] += 1 # If we are deleting as Spam an item that was in our "good" # range, then record it as a false negative. 
*************** *** 113,118 **** --- 126,147 ---- old_score < self.options["Categorization", "ham_cutoff"]: self.num_trained_spam_fn += 1 + totals["num_trained_spam_fn"] += 1 elif old_class == self.options["Headers", "header_ham_string"]: self.num_trained_spam_fn += 1 + totals["num_trained_spam_fn"] += 1 + # We have to record the updated totals every time or else the + # persistent statistics will get out of sync. + self.messageinfo_db.set_persistent_statistics(self.totals) + + def LoadPersistentStats(self): + """Load the persistent statistics from the messageinfo db. + + If the persistent statistics have not yet been stored in the db + then we need to recalculate them by iterating through all the + messages. This will result in a one-time performance hit, but + will greatly improve the startup time in the future.""" + self.totals = self.messageinfo_db.get_persistent_statistics() + if self.totals is None: + self.CalculatePersistentStats() def CalculatePersistentStats(self): *************** *** 122,134 **** adding up the various information. This could get quite time consuming if the messageinfo database gets very large, so ! some consideration should perhaps be made about what to do ! then. """ self.ResetTotal() totals = self.totals for msg_id in self.messageinfo_db.keys(): ! # Skip the date key. if msg_id == STATS_START_KEY: continue m = Message(msg_id) self.messageinfo_db.load_msg(m) --- 151,166 ---- adding up the various information. This could get quite time consuming if the messageinfo database gets very large, so ! it should only be done if the statistics start date is reset ! to an arbitrary point in the past. """ self.ResetTotal() totals = self.totals for msg_id in self.messageinfo_db.keys(): ! # Skip the date and persistent statistics keys. if msg_id == STATS_START_KEY: continue + if msg_id == STATS_STORAGE_KEY: + continue + m = Message(msg_id) self.messageinfo_db.load_msg(m) *************** *** 168,198 **** totals["num_trained_spam"] += 1 ! 
# If we have already accumulated any session statistics then we need ! # to subtract those from the totals to prevent double-counting. ! totals["num_ham"] -= self.num_ham ! totals["num_spam"] -= self.num_spam ! totals["num_unsure"] -= self.num_unsure ! totals["num_trained_ham"] -= self.num_trained_ham ! totals["num_trained_ham_fp"] -= self.num_trained_ham_fp ! totals["num_trained_spam"] -= self.num_trained_spam ! totals["num_trained_spam_fn"] -= self.num_trained_spam_fn ! ! def _CombineSessionAndTotal(self): ! totals = self.totals ! data = {} ! data["num_ham"] = self.num_ham + totals["num_ham"] ! data["num_spam"] = self.num_spam + totals["num_spam"] ! data["num_unsure"] = self.num_unsure + totals["num_unsure"] ! data["num_seen"] = data["num_ham"] + data["num_spam"] + \ ! data["num_unsure"] ! data["num_trained_ham"] = self.num_trained_ham + \ ! totals["num_trained_ham"] ! data["num_trained_ham_fp"] = self.num_trained_ham_fp + \ ! totals["num_trained_ham_fp"] ! data["num_trained_spam"] = self.num_trained_spam + \ ! totals["num_trained_spam"] ! data["num_trained_spam_fn"] = self.num_trained_spam_fn + \ ! totals["num_trained_spam_fn"] ! return data def _CalculateAdditional(self, data): --- 200,204 ---- totals["num_trained_spam"] += 1 ! self.messageinfo_db.set_persistent_statistics(totals) def _CalculateAdditional(self, data): *************** *** 322,329 **** push = chunks.append if session_only: - data = {} - data["num_seen"] = self.num_ham + self.num_spam + \ - self.num_unsure data["num_ham"] = self.num_ham data["num_spam"] = self.num_spam --- 328,333 ---- push = chunks.append + data = {} if session_only: data["num_ham"] = self.num_ham data["num_spam"] = self.num_spam *************** *** 334,342 **** data["num_trained_spam_fn"] = self.num_trained_spam_fn else: ! if not self.persistentStatsCalculated: ! # Load persistent stats. ! self.CalculatePersistentStats() ! self.persistentStatsCalculated = True ! 
data = self._CombineSessionAndTotal() push(_("Messages classified: %d") % (data["num_seen"],)) --- 338,347 ---- data["num_trained_spam_fn"] = self.num_trained_spam_fn else: ! for stat in ["num_ham", "num_spam", "num_unsure", ! "num_trained_spam", "num_trained_spam_fn", ! "num_trained_ham", "num_trained_ham_fp",]: ! data[stat] = self.totals[stat] ! data["num_seen"] = data["num_ham"] + data["num_spam"] + \ ! data["num_unsure"] push(_("Messages classified: %d") % (data["num_seen"],)) Index: message.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/spambayes/message.py,v retrieving revision 1.75 retrieving revision 1.76 diff -C2 -d -r1.75 -r1.76 *** message.py 7 Apr 2006 02:33:15 -0000 1.75 --- message.py 28 Dec 2006 15:36:44 -0000 1.76 *************** *** 116,119 **** --- 116,120 ---- STATS_START_KEY = "Statistics start date" + STATS_STORAGE_KEY = "Persistent statistics" PERSISTENT_HAM_STRING = 'h' PERSISTENT_SPAM_STRING = 's' *************** *** 137,140 **** --- 138,151 ---- self.store() + def get_persistent_statistics(self): + if self.db.has_key(STATS_STORAGE_KEY): + return self.db[STATS_STORAGE_KEY] + else: + return None + + def set_persistent_statistics(self, stats): + self.db[STATS_STORAGE_KEY] = stats + self.store() + def __getstate__(self): return self.db *************** *** 402,406 **** if id == STATS_START_KEY: ! raise ValueError, "MsgId must not be" + STATS_START_KEY self.id = id --- 413,420 ---- if id == STATS_START_KEY: ! raise ValueError, "MsgId must not be " + STATS_START_KEY ! ! if id == STATS_STORAGE_KEY: ! raise ValueError, "MsgId must not be " + STATS_STORAGE_KEY self.id = id