From montanaro at users.sourceforge.net Thu Jan 25 20:03:16 2007 From: montanaro at users.sourceforge.net (Skip Montanaro) Date: Thu, 25 Jan 2007 11:03:16 -0800 Subject: [Spambayes-checkins] spambayes/Outlook2000/docs troubleshooting.html, 1.28, 1.29 Message-ID: <20070125190319.AA9F81E4010@bag.python.org> Update of /cvsroot/spambayes/spambayes/Outlook2000/docs In directory sc8-pr-cvs8.sourceforge.net:/tmp/cvs-serv10558/Outlook2000/docs Modified Files: troubleshooting.html Log Message: correct sourceforge url Index: troubleshooting.html =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/docs/troubleshooting.html,v retrieving revision 1.28 retrieving revision 1.29 diff -C2 -d -r1.28 -r1.29 *** troubleshooting.html 13 May 2005 03:16:31 -0000 1.28 --- troubleshooting.html 25 Jan 2007 19:03:01 -0000 1.29 *************** *** 50,54 **** to the Outlook addin.
This changelog lists the commits on the spambayes projects before the separate project was set up. See also the ! old CVS repository, but don't forget that it's now out of date, and you probably want to be looking at the current CVS.
--- 2,6 ---- This changelog lists the commits on the spambayes projects before the separate project was set up. See also the ! old CVS repository, but don't forget that it's now out of date, and you probably want to be looking at the current CVS.
From montanaro at users.sourceforge.net Mon Jan 29 05:18:46 2007 From: montanaro at users.sourceforge.net (Skip Montanaro) Date: Sun, 28 Jan 2007 20:18:46 -0800 Subject: [Spambayes-checkins] spambayes/spambayes MoinSecurityPolicy.py, NONE, 1.1 Message-ID: <20070129041848.D17411E4008@bag.python.org> Update of /cvsroot/spambayes/spambayes/spambayes In directory sc8-pr-cvs8.sourceforge.net:/tmp/cvs-serv17517 Added Files: MoinSecurityPolicy.py Log Message: This doesn't work but I need to check it in to establish a baseline. The SpamBayes parts of things seem to work (though I just muddled with the spam_db location - that needs testing). The fundamental problems are in the MoinMoin interface (as I knew they would be). --- NEW FILE: MoinSecurityPolicy.py --- #!/usr/bin/env python # -*- coding: iso-8859-1 -*- """ This module implements a security policy for MoinMoin based on the SpamBayes classifier. To use it, import it like so in your wikiconfig.py file: from spambayes.MoinSecurityPolicy import SecurityPolicy Two pages are special, HamPages and SpamPages. Each refers to a specific revision of the raw version of particulars wiki pages. When either of these pages is updated the SpamBayes database is rebuilt based on the pages they reference. When any other page is updated it is scored against the current database. If its score is <= SecurityPolicy.ham_cutoff the edit is accepted. If not, the page edit is accepted but reverted and a reference to the reverted page revision is mailed to the members of the AdminGroup for review. If it is only possibly spam (score between SecurityPolicy.ham_cutoff and SecurityPolicy.spam_cutoff) the recipients are instructed to add it to either HamPages or SpamPages as appropriate. If it is truly spam (score >= SecurityPolicy.spam_cutoff), the recipients are instructed to add it to HamPages if it is actually okay, but to simply discard it otherwise. 
The HamPages and SpamPages pages are formatted as any other *Group page, a top-level list forms a group while everything else is ignored. The ham_cutoff, spam_cutoff and spam_db attributes are defined at the class level to make it easy for the user to change their values. The defaults are: ham_cutoff 0.15 spam_cutoff 0.60 spam_db 'spam.db' The spam_db attribute should always be a relative path (should not start with '/'). When relative it will be taken relative to the directory containing the event-log file. """ import os import atexit import urllib import urlparse from MoinMoin.security import Permissions from MoinMoin.wikidicts import Group from MoinMoin.user import User, getUserId from MoinMoin.util.mail import sendmail from MoinMoin.Page import Page from MoinMoin.PageEditor import PageEditor from spambayes import hammie, storage from spambayes.tokenizer import log2, Tokenizer, numeric_entity_re, \ numeric_entity_replacer, crack_urls, breaking_entity_re, html_re, \ tokenize_word class SecurityPolicy(Permissions): ham_cutoff = 0.15 spam_cutoff = 0.60 spam_db = "spam.db" def __init__(self, user): Permissions.__init__(self, user) self.sbayes = None def open_spamdb(self, request): if self.sbayes is None: event_log = request.rootpage.getPagePath('event-log', isfile=1) spam_db = os.path.join(os.path.dirname(event_log, self.spam_db)) self.sbayes = Hammie(storage.open_storage(spam_db, "pickle", 'c')) atexit.register(self.close_spamdb) def close_spamdb(self): if self.sbayes is not None: self.sbayes.store() self.sbayes = None def retrain(self, request): self.close_spamdb() if os.path.exists(self.spam_db): os.unlink(self.spam_db) self.open_spamdb(request) nham = nspam = 0 for url in Group(request, "HamPages").members(): scheme, netloc, path, params, query, frag = urlparse.urlparse(url) rev = 0 for pair in query.split("&"): key, val = pair.split("=") if key == "rev": raw = int(val) break pg = Page(request, path[1:], rev=rev) self.sbayes.train_ham(pg.get_raw_body()) nham += 
1 for url in Group(request, "SpamPages").members(): scheme, netloc, path, params, query, frag = urlparse.urlparse(url) rev = 0 for pair in query.split("&"): key, val = pair.split("=") if key == "rev": raw = int(val) break pg = Page(request, path[1:], rev=rev) self.sbayes.train_spam(pg.get_raw_body()) nspam += 1 self.close_spamdb() return (nham, nspam) def save(self, editor, newtext, rev, **kw): score = self.sbayes.score(newtext) save_result = Permissions.save(self, editor, newtext, rev, **kw) if save_result and editor.page_name in ("HamPages", "SpamPages"): self.retrain(editor.request) return save_result if score < self.ham_cutoff: # File checks out spamwise. Return the default save result. return save_result if not save_result: return save_result # Now the fun begins. We scored the page and found that it is # either possible or probable spam. However, we saved it. (We # wanted to do that so we would have a copy to score later.) We # need to revert the save and send the URL of the suspect page # to the users in AdminGroup. To make matters worse, the user # may have write permission but not revert permission. So we # have to force the reversion. That requires a bit of # cut-n-paste from wikiaction.do_revert. self.force_revert(editor.page_name, editor.request) ## self.mail_admins_about(editor.request, editor.page_name, score) def force_revert(self, pagename, request): from MoinMoin.PageEditor import PageEditor rev = int(request.form['rev'][0]) revstr = '%08d' % rev oldpg = Page(request, pagename, rev=rev) pg = PageEditor(request, pagename) _ = request.getText msg = _("Thank you for your changes. 
Your attention to detail is appreciated.") try: pg._write_file(oldpg.get_raw_body(), action="SAVE/REVERT", extra=revstr) pg.clean_acl_cache() except pg.SaveError, msg: pass # msg contain a unicode string savemsg = unicode(msg) request.reset() pg.send_page(request, msg=savemsg) return None def mail_admins_about(self, request, page_name, score): """Send email to the AdminGroup about a suspect page.""" # This does not yet work. I've yet to figure out how to extract the # email addresses of the members of the AdminGroup. return admin_text = Page(request, "AdminGroup").get_raw_body() group = Group(request, admin_text) emails = [] for name in group.members(): uid = getUserId(request, name) if uid is None: continue u = User(request, uid) emails.append(u.email) if score < self.spam_cutoff: subject = "Possible wiki spam" text = """\ This page as submitted to the wiki might be spam: %(page_name)s If that is not the case, add the page's URL (including action=raw and the revision number) to HamPages then revert the page to that revision. If it is spam, add it instead to SpamPages. """ % locals() else: subject = "Probable wiki spam" text = """\ This page as submitted to the wiki is likely to be spam: %(page_name)s If that is not the case, add the page's URL (including action=raw and the revision number) to HamPages then revert the page to that revision. If it is spam, do nothing. """ % locals() sendmail(request, emails, subject, text) class Tokenizer(Tokenizer): def tokenize(self, text): """Tokenize a chunk of text. Pulled mostly verbatim from the SpamBayes code. """ maxword = 20 # Replace numeric character entities (like &#97; for the letter # 'a'). text = numeric_entity_re.sub(numeric_entity_replacer, text) # Crack open URLs and extract useful bits of marrow... for cracker in (crack_urls,): text, tokens = cracker(text) for t in tokens: yield t # Remove HTML/XML tags. Also &nbsp;.
<br> and <p> tags should # create a space too. text = breaking_entity_re.sub(' ', text) # It's important to eliminate HTML tags rather than, e.g., # replace them with a blank (as this code used to do), else # simple tricks like # Wr<!$FS|i|R3$s80sA >inkle Reduc<!$FS|i|R3$s80sA >tion # can be used to disguise words.
<br> and <p> were special- # cased just above (because browsers break text on those, # they can't be used to hide words effectively). text = html_re.sub('', text) # Tokenize everything in the body. for w in text.split(): n = len(w) # Make sure this range matches in tokenize_word(). if 3 <= n <= maxword: yield w elif n >= 3: for t in tokenize_word(w): yield t class Hammie(hammie.Hammie): def __init__(self, bayes): hammie.Hammie.__init__(self, bayes) self.tokenizer = Tokenizer() def _scoremsg(self, msg, evidence=False): return self.bayes.spamprob(self.tokenizer.tokenize(msg), evidence) def train(self, msg, is_spam, add_header=False): self.bayes.learn(self.tokenizer.tokenize(msg), is_spam) From montanaro at users.sourceforge.net Mon Jan 29 05:22:56 2007 From: montanaro at users.sourceforge.net (Skip Montanaro) Date: Sun, 28 Jan 2007 20:22:56 -0800 Subject: [Spambayes-checkins] spambayes/spambayes MoinSecurityPolicy.py, 1.1, 1.2 Message-ID: <20070129042258.9F82B1E4008@bag.python.org> Update of /cvsroot/spambayes/spambayes/spambayes In directory sc8-pr-cvs8.sourceforge.net:/tmp/cvs-serv19591 Modified Files: MoinSecurityPolicy.py Log Message: There, that's better. Index: MoinSecurityPolicy.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/spambayes/MoinSecurityPolicy.py,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** MoinSecurityPolicy.py 29 Jan 2007 04:18:43 -0000 1.1 --- MoinSecurityPolicy.py 29 Jan 2007 04:22:54 -0000 1.2 *************** *** 66,70 **** if self.sbayes is None: event_log = request.rootpage.getPagePath('event-log', isfile=1) ! spam_db = os.path.join(os.path.dirname(event_log, self.spam_db)) self.sbayes = Hammie(storage.open_storage(spam_db, "pickle", 'c')) atexit.register(self.close_spamdb) --- 66,70 ---- if self.sbayes is None: event_log = request.rootpage.getPagePath('event-log', isfile=1) !
spam_db = os.path.join(os.path.dirname(event_log), self.spam_db) self.sbayes = Hammie(storage.open_storage(spam_db, "pickle", 'c')) atexit.register(self.close_spamdb) *************** *** 107,110 **** --- 107,111 ---- def save(self, editor, newtext, rev, **kw): + self.open_spamdb(editor.request) score = self.sbayes.score(newtext) save_result = Permissions.save(self, editor, newtext, rev, **kw)