[Spambayes-checkins] spambayes/pspam README.txt,NONE,1.1
pop.py,NONE,1.1 vmspam.ini,NONE,1.1 zeo.sh,NONE,1.1
Jeremy Hylton
jhylton@users.sourceforge.net
Mon Nov 4 04:44:22 2002
- Previous message: [Spambayes-checkins] spambayes/pspam/pspam - New directory
- Next message: [Spambayes-checkins]
spambayes/pspam/pspam __init__.py,NONE,1.1 database.py,NONE,1.1
folder.py,NONE,1.1 message.py,NONE,1.1 options.py,NONE,1.1
profile.py,NONE,1.1
- Messages sorted by:
[ date ]
[ thread ]
[ subject ]
[ author ]
Update of /cvsroot/spambayes/spambayes/pspam
In directory usw-pr-cvs1:/tmp/cvs-serv21558/pspam
Added Files:
README.txt pop.py scoremsg.py update.py vmspam.ini zeo.sh
Log Message:
Initial checkin of pspam code.
--- NEW FILE: README.txt ---
pspam: persistent spambayes filtering system
--------------------------------------------
pspam uses a POP proxy to score incoming messages, a set of VM folders
to manage training data, and a ZODB database to manage data used by
the various applications.
The current code only works with a patched version of classifier.py.
Remove the object base class & change the class used to create new
WordInfo objects.
This directory contains:
pspam -- a Python package
pop.py -- a POP proxy based on SocketServer
scoremsg.py -- prints the evidence for a single message read from stdin
update.py -- a script to update training data from folders
vmspam.ini -- a sample configuration file
zeo.sh -- a script to start a ZEO server
The code depends on ZODB3, which you can download from
http://www.zope.org/Products/StandaloneZODB.
--- NEW FILE: pop.py ---
"""Spam-filtering proxy for a POP3 server.
The implementation uses the SocketServer module to run a
multi-threaded POP3 proxy. It adds an X-Spambayes header with a spam
probability. It scores a message using a persistent spambayes
classifier loaded from a ZEO server.
The strategy for adding spam headers is from Richie Hindle's
pop3proxy.py. The STAT, LIST, RETR, and TOP commands are intercepted
to change the number of bytes the client is told to expect and/or to
insert the spam header.
XXX A POP3 server sometimes adds the number of bytes in the +OK
response to some commands when the POP3 spec doesn't require it to.
In those cases, the proxy does not re-write the number of bytes. I
assume the clients won't be confused by this behavior, because they
shouldn't be expecting to see the number of bytes.
POP3 is documented in RFC 1939.
"""
import SocketServer
import asyncore
import cStringIO
import email
import re
import socket
import sys
import threading
import time
import ZODB
from ZEO.ClientStorage import ClientStorage
import zLOG
from tokenizer import tokenize
import pspam.database
from pspam.options import options
# Template for the header line added to a message: the spam probability
# is rendered with three decimals and the line is terminated by CRLF.
HEADER = "X-Spambayes: %5.3f\r\n"
# Length of one rendered header line.  The STAT and LIST handlers add
# this to the sizes reported by the real server.
# NOTE(review): assumes every score renders in the same width as 0.0,
# i.e. that probabilities lie in [0, 1] -- confirm.
HEADER_SIZE = len(HEADER % 0.0)
class POP3ProxyServer(SocketServer.ThreadingTCPServer):
    """Threaded TCP server holding the state shared by request handlers.

    Handlers reach these attributes through ``self.server``:

    classifier -- object used to score messages
    pop_server -- address of the real POP3 server to connect to
    log        -- file-like object that receives the session transcript
    zodb       -- database connection that handlers sync() per session
    """

    # Let the proxy rebind its listening port right after a restart.
    allow_reuse_address = True

    def __init__(self, addr, handler, classifier, real_server, log, zodb):
        """Bind to addr and remember the objects the handlers need."""
        SocketServer.ThreadingTCPServer.__init__(self, addr, handler)
        self.zodb = zodb
        self.log = log
        self.pop_server = real_server
        self.classifier = classifier
class LogWrapper:
    """File-like wrapper that copies all traffic on a file to a log.

    Only the wrapped file is closed by close(); the log is left open
    for its owner to manage.
    """

    def __init__(self, log, file):
        """Wrap file, echoing everything read or written to log."""
        self.file = file
        self.log = log

    def readline(self):
        """Read one line from the wrapped file, log it, and return it."""
        data = self.file.readline()
        self.log.write(data)
        return data

    def write(self, buf):
        """Log buf, then write it to the wrapped file.

        Returns whatever the wrapped file's write() returns.
        """
        self.log.write(buf)
        result = self.file.write(buf)
        return result

    def close(self):
        """Close the wrapped file (not the log)."""
        self.file.close()
class POP3RequestHandler(SocketServer.StreamRequestHandler):
    """Act as proxy between POP client and server.

    Each client request line is forwarded to the real POP3 server.  The
    server's reply is read, optionally rewritten by a handle_<COMMAND>
    method, and passed back to the client.  RETR and TOP replies get an
    X-Spambayes header; STAT and LIST replies get their byte counts
    enlarged to account for that header.
    """

    # Commands whose successful reply is always multi-line.
    _multiline = {"RETR": True, "TOP": True,}
    # Commands whose successful reply is multi-line only without args.
    _multiline_noargs = {"LIST": True, "UIDL": True,}

    # Raw strings keep the regex escapes from being read as (invalid)
    # string escapes.
    rx_STAT = re.compile(r"\+OK (\d+) (\d+)(.*)", re.DOTALL)
    rx_LIST = re.compile(r"\+OK (\d+) (\d+)(.*)", re.DOTALL)
    rx_LIST_2 = re.compile(r"(\d+) (\d+)(.*)", re.DOTALL)

    def connect_pop(self):
        """Connect to the real POP3 server and wrap both directions."""
        s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        s.connect(self.server.pop_server)
        self.pop_rfile = LogWrapper(self.server.log, s.makefile("rb"))
        # the write side should be unbuffered
        self.pop_wfile = LogWrapper(self.server.log, s.makefile("wb", 0))

    def close_pop(self):
        """Close both file objects of the server connection."""
        try:
            self.pop_rfile.close()
        finally:
            # Close the write side even if closing the read side failed.
            self.pop_wfile.close()

    def handle(self):
        """Serve one client session from connect to disconnect."""
        zLOG.LOG("POP3", zLOG.INFO,
                 "Connection from %s" % repr(self.client_address))
        # NOTE(review): presumably sync() makes recent training data
        # visible to this session -- confirm against the ZODB docs.
        self.server.zodb.sync()
        # Number of RETR/TOP replies rewritten during this session.
        self.sess_retr_count = 0
        self.connect_pop()
        try:
            self.handle_pop()
        finally:
            self.close_pop()
        if self.sess_retr_count == 1:
            ending = ""
        else:
            ending = "s"
        zLOG.LOG("POP3", zLOG.INFO,
                 "Ending session (%d message%s retrieved)"
                 % (self.sess_retr_count, ending))

    def is_multiline(self, command, args):
        """Return True if a successful reply to command is multi-line."""
        if command in self._multiline:
            return True
        if command in self._multiline_noargs and not args:
            return True
        return False

    def parse_request(self, req):
        """Split a request line into (COMMAND, args).

        The command is upper-cased because POP3 keywords are
        case-insensitive while the dispatch tables and handle_* method
        names use upper case.  A blank line yields ("", ()).
        """
        parts = req.split()
        if not parts:
            return "", ()
        return parts[0].upper(), tuple(parts[1:])

    def handle_pop(self):
        """Relay the greeting, then requests and replies until done."""
        # send the initial server hello
        hello = self.pop_rfile.readline()
        self.wfile.write(hello)
        # now get client requests and return server responses
        while 1:
            line = self.rfile.readline()
            if not line:
                # client closed the connection
                break
            self.pop_wfile.write(line)
            if not self.handle_pop_response(line):
                break

    def handle_pop_response(self, req):
        """Read the server's reply to req and pass it to the client.

        Returns True if the connection is still open, False after QUIT
        or when the server closed the connection.
        """
        cmd, args = self.parse_request(req)
        firstline = self.pop_rfile.readline()
        if not firstline:
            # The server went away; there is nothing left to relay.
            return False
        # Only a positive reply carries a body.  An -ERR reply to RETR,
        # TOP, LIST or UIDL is a single line, and waiting for more
        # would block forever.
        multiline = (self.is_multiline(cmd, args)
                     and firstline.startswith("+OK"))
        zLOG.LOG("POP3", zLOG.DEBUG, "command %s multiline %s resp %s"
                 % (cmd, multiline, firstline.strip()))
        if multiline:
            buf = self._read_multiline_body()
        else:
            buf = None
        handler = getattr(self, "handle_%s" % cmd, None)
        if handler:
            firstline, buf = handler(cmd, args, firstline, buf)
        self.wfile.write(firstline)
        if buf is not None:
            self.wfile.write(buf)
        if cmd == "QUIT":
            return False
        return True

    def _read_multiline_body(self):
        """Return the rest of a multi-line reply as one string.

        Reads up to and including the terminating ".\\r\\n" line, or to
        EOF if the server closes the connection first.
        """
        lines = []
        while 1:
            line = self.pop_rfile.readline()
            if not line:
                # EOF in the middle of a reply; don't loop forever.
                break
            lines.append(line)
            # XXX should handle byte-stuffed response
            if line == ".\r\n":
                break
        return "".join(lines)

    def handle_RETR(self, cmd, args, firstline, resp):
        """Add the spam header to a retrieved message.

        resp is the multi-line body (or None/empty for an error reply,
        which is passed through untouched).
        """
        if not resp:
            return firstline, resp
        try:
            msg = email.message_from_string(resp)
        except email.Errors.MessageParseError:
            # Fetch the exception this way so the clause parses on
            # both old and new interpreters.
            err = sys.exc_info()[1]
            zLOG.LOG("POP3", zLOG.WARNING,
                     "Failed to parse msg: %s" % err, error=sys.exc_info())
            resp = self.message_parse_error(resp)
        else:
            self.score_msg(msg)
            resp = msg.as_string()
        self.sess_retr_count += 1
        return firstline, resp

    def handle_TOP(self, cmd, args, firstline, resp):
        """Treat a TOP reply exactly like a RETR reply."""
        # XXX Just handle TOP like RETR?
        return self.handle_RETR(cmd, args, firstline, resp)

    def handle_STAT(self, cmd, args, firstline, resp):
        """Enlarge the total size in a STAT reply by one header per message."""
        # STAT returns the number of messages and the total size.  The
        # proxy must add the size of new headers to the total size.
        # Example: +OK 3 340
        mo = self.rx_STAT.match(firstline)
        if mo is None:
            return firstline, resp
        count, size, extra = mo.group(1, 2, 3)
        count = int(count)
        size = int(size)
        size += count * HEADER_SIZE
        # extra still holds the line terminator (the regex uses DOTALL).
        firstline = "+OK %d %d%s" % (count, size, extra)
        return firstline, resp

    def handle_LIST(self, cmd, args, firstline, resp):
        """Enlarge every message size in a LIST reply by HEADER_SIZE."""
        # If there are no args, LIST returns size info for each message.
        # If there is an arg, LIST return number and size for one message.
        mo = self.rx_LIST.match(firstline)
        if mo:
            # a single-line response
            n, size, extra = mo.group(1, 2, 3)
            size = int(size) + HEADER_SIZE
            firstline = "+OK %s %d%s" % (n, size, extra)
            return firstline, resp
        # possibly a multiline response
        if not firstline.startswith("+OK"):
            return firstline, resp
        if not resp:
            # No body was collected; nothing to rewrite.
            return firstline, resp
        # update each line of the response
        L = []
        for line in resp.split("\r\n"):
            if not line:
                continue
            mo = self.rx_LIST_2.match(line)
            if not mo:
                L.append(line)
            else:
                n, size, extra = mo.group(1, 2, 3)
                size = int(size) + HEADER_SIZE
                L.append("%s %d%s" % (n, size, extra))
        # Every line, including the final ".", must end in CRLF or the
        # client never sees the end of the listing.
        return firstline, "\r\n".join(L) + "\r\n"

    def message_parse_error(self, buf):
        """Insert the spam header into a message that failed to parse."""
        # We get an error parsing the message.  We've already told the
        # client to expect more bytes than this buffer contains, but
        # there's no clean way to add the header.
        self.server.log.write("# error: %s\n" % repr(buf))
        # XXX what to do?  let's just add it after the first line
        score = self.server.classifier.spamprob(tokenize(buf))
        L = buf.split("\n")
        # HEADER ends in "\r\n"; drop the "\n" because the join below
        # puts it back.  Keeping it would create a blank line that ends
        # the header block right after the new header.
        L.insert(1, (HEADER % score).rstrip("\n"))
        return "\n".join(L)

    def score_msg(self, msg):
        """Score a parsed message and add the X-Spambayes header to it."""
        score = self.server.classifier.spamprob(tokenize(msg))
        msg.add_header("X-Spambayes", "%5.3f" % score)
def main():
    """Load the stored profile and run the POP3 proxy forever.

    The session transcript is appended to /var/tmp/pop.log.  The listen
    port and the address of the real server come from the pspam options.
    """
    database = pspam.database.open()
    connection = database.open()
    profile = connection.root()["profile"]

    log = open("/var/tmp/pop.log", "ab")
    log.write("+PROXY start %s\n" % time.ctime())

    listen_addr = ('', options.proxy_port)
    upstream_addr = (options.server, options.server_port)
    server = POP3ProxyServer(listen_addr, POP3RequestHandler,
                             profile.classifier, upstream_addr,
                             log, connection)
    server.serve_forever()


if __name__ == "__main__":
    main()
--- NEW FILE: scoremsg.py ---
#! /usr/bin/env python
"""Score a message provided on stdin and show the evidence."""
import ZODB
from ZEO.ClientStorage import ClientStorage
from tokenizer import tokenize
import email
import sys
import pspam.options
def main(fp):
    """Score the message read from fp and print the evidence.

    Writes the overall score to stdout, followed by one line per
    (clue, probability) pair returned by the classifier.
    """
    storage = ClientStorage("/var/tmp/zeospam")
    database = ZODB.DB(storage)
    root = database.open().root()

    # make sure scoring uses the right set of options
    pspam.options.mergefile("/home/jeremy/src/vmspam/vmspam.ini")

    profile = root["profile"]
    message = email.message_from_file(fp)
    score, clues = profile.classifier.spamprob(tokenize(message), True)

    out = sys.stdout.write
    out("Score: %s\n" % (score,))
    out("\n")
    out("Clues\n")
    out("-----\n")
    for clue, clue_prob in clues:
        out("%s %s\n" % (clue, clue_prob))


if __name__ == "__main__":
    main(sys.stdin)
--- NEW FILE: update.py ---
import getopt
import os
import sys
import ZODB
from ZEO.ClientStorage import ClientStorage
import pspam.database
from pspam.profile import Profile
from pspam.options import options
def folder_exists(L, p):
    """Return True if a folder with path p exists in list L.

    L is a sequence of objects that each have a ``path`` attribute.
    """
    known_paths = [folder.path for folder in L]
    return p in known_paths
def main(rebuild=False):
    """Bring the stored profile up to date with the training folders.

    Creates the profile if it is missing (or if rebuild is true),
    registers any configured ham/spam folders it does not know yet, and
    then lets the profile read new messages.  Each stage is committed
    separately.
    """
    db = pspam.database.open()
    root = db.open().root()

    profile = root.get("profile")
    if rebuild or profile is None:
        # if there is no profile, create it
        profile = Profile(options.folder_dir)
        root["profile"] = profile
        get_transaction().commit()

    # check for new folders of training data
    for name in options.ham_folders:
        path = os.path.join(options.folder_dir, name)
        if folder_exists(profile.hams, path):
            continue
        profile.add_ham(path)
    for name in options.spam_folders:
        path = os.path.join(options.folder_dir, name)
        if folder_exists(profile.spams, path):
            continue
        profile.add_spam(path)
    get_transaction().commit()

    # read new messages from folders
    profile.update()
    get_transaction().commit()

    db.close()
if __name__ == "__main__":
    # -F forces the profile to be rebuilt from scratch.
    opts, args = getopt.getopt(sys.argv[1:], 'F')
    FORCE_REBUILD = '-F' in [flag for flag, value in opts]
    main(FORCE_REBUILD)
--- NEW FILE: vmspam.ini ---
[Train]
folder_dir: /home/jeremy/Mail
spam_folders: train/spam
ham_folders: train/ham
[Score]
max_ham: 0.05
min_spam: 0.99
[Proxy]
server: mail.zope.com
server_port: 110
proxy_port: 1111
log_pop_session: true
log_pop_session_file: /var/tmp/pop.log
[ZODB]
zeo_addr: /var/tmp/zeospam
event_log_file: /var/tmp/zeospam.log
event_log_severity: 0
cache_size: 2000
--- NEW FILE: zeo.sh ---
#! /bin/bash
# Start a ZEO server for the spam database.

# NOTE(review): presumably the file the server's event log is written
# to -- confirm against the zLOG documentation.
export STUPID_LOG_FILE=/var/tmp/zeospam.log
# Directory that contains the installed ZEO package.
export LIBDIR=/usr/local/lib/python2.3/site-packages
# Quote the expansion so a LIBDIR containing spaces still works.
python2.3 "$LIBDIR/ZEO/start.py" -U /var/tmp/zeospam /var/tmp/zeospam.fs
- Previous message: [Spambayes-checkins] spambayes/pspam/pspam - New directory
- Next message: [Spambayes-checkins]
spambayes/pspam/pspam __init__.py,NONE,1.1 database.py,NONE,1.1
folder.py,NONE,1.1 message.py,NONE,1.1 options.py,NONE,1.1
profile.py,NONE,1.1
- Messages sorted by:
[ date ]
[ thread ]
[ subject ]
[ author ]