[Spambayes-checkins] spambayes/pspam README.txt,NONE,1.1 pop.py,NONE,1.1 vmspam.ini,NONE,1.1 zeo.sh,NONE,1.1

Mon Nov 4 04:44:22 2002

Update of /cvsroot/spambayes/spambayes/pspam
In directory usw-pr-cvs1:/tmp/cvs-serv21558/pspam

Added Files:
	README.txt pop.py scoremsg.py update.py vmspam.ini zeo.sh 
Log Message:
Initial checkin of pspam code.

--- NEW FILE: README.txt ---
pspam: persistent spambayes filtering system
--------------------------------------------

pspam uses a POP proxy to score incoming messages, a set of VM folders
to manage training data, and a ZODB database to manage data used by
the various applications.

The current code only works with a patched version of classifier.py.
Remove the object base class & change the class used to create new
WordInfo objects.

This directory contains:

pspam -- a Python package
pop.py -- a POP proxy based on SocketServer
scoremsg.py -- prints the evidence for a single message read from stdin
update.py -- a script to update training data from folders
vmspam.ini -- a sample configuration file
zeo.sh -- a script to start a ZEO server

The code depends on ZODB3, which you can download from
http://www.zope.org/Products/StandaloneZODB.

--- NEW FILE: pop.py ---
"""Spam-filtering proxy for a POP3 server.

The implementation uses the SocketServer module to run a
multi-threaded POP3 proxy.  It adds an X-Spambayes header with a spam
probability.  It scores a message using a persistent spambayes
classifier loaded from a ZEO server.

The strategy for adding spam headers is from Richie Hindle's
pop3proxy.py.  The STAT, LIST, RETR, and TOP commands are intercepted
to change the number of bytes the client is told to expect and/or to
insert the spam header.

XXX A POP3 server sometimes adds the number of bytes in the +OK
response to some commands when the POP3 spec doesn't require it to.
In those cases, the proxy does not re-write the number of bytes.  I
assume the clients won't be confused by this behavior, because they
shouldn't be expecting to see the number of bytes.

POP3 is documented in RFC 1939.
"""

import SocketServer
import asyncore
import cStringIO
import email
import re
import socket
import sys
import threading
import time

import ZODB
from ZEO.ClientStorage import ClientStorage
import zLOG

from tokenizer import tokenize
import pspam.database
from pspam.options import options

# Template for the header line the proxy adds to a message; the single
# format argument is the spam probability, rendered with three decimals.
HEADER = "X-Spambayes: %5.3f\r\n"
# Length of one formatted header line.  The STAT and LIST handlers add
# this to the sizes reported by the real server.
HEADER_SIZE = len(HEADER % 0.0)

class POP3ProxyServer(SocketServer.ThreadingTCPServer):
    """Threading TCP server that holds the state shared by every session.

    Request handlers reach these attributes through self.server:

    classifier -- object whose spamprob() scores a token stream
    pop_server -- address passed to socket.connect() for the real server
    log        -- file-like object receiving the raw POP3 traffic
    zodb       -- object whose sync() is called at the start of a session
    """

    # Class-level switch read by the SocketServer base class.
    allow_reuse_address = True

    def __init__(self, addr, handler, classifier, real_server, log, zodb):
        """Bind to addr and remember the shared objects.

        addr and handler are passed unchanged to ThreadingTCPServer.
        """
        SocketServer.ThreadingTCPServer.__init__(self, addr, handler)
        # Shared collaborators, stored after the base class is set up.
        self.pop_server = real_server
        self.classifier = classifier
        self.zodb = zodb
        self.log = log

class LogWrapper:
    """File-like wrapper that copies all traffic to a log.

    Every line read from, and every buffer written to, the wrapped file
    is also written to the log object.  Only the wrapped file is closed
    by close(); the log is left open.
    """

    def __init__(self, log, file):
        """Wrap file, copying its traffic to log."""
        self.file = file
        self.log = log

    def readline(self):
        """Read one line from the wrapped file, log it, and return it."""
        text = self.file.readline()
        self.log.write(text)
        return text

    def write(self, buf):
        """Log buf, then write it to the wrapped file.

        Returns whatever the wrapped file's write() returns.
        """
        self.log.write(buf)
        result = self.file.write(buf)
        return result

    def close(self):
        """Close the wrapped file (the log stays open)."""
        self.file.close()

class POP3RequestHandler(SocketServer.StreamRequestHandler):
    """Act as proxy between POP client and server.

    One handler instance serves one client connection.  Each client
    request is forwarded to the real POP3 server; the server's reply is
    read, optionally rewritten by a handle_<COMMAND> method, and then
    sent back to the client.
    """

    def connect_pop(self):
        """Connect to the real POP3 server and wrap both directions.

        Sets self.pop_rfile and self.pop_wfile, each a LogWrapper that
        copies traffic to the server-wide log.
        """
        # connect to the pop server
        s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        s.connect(self.server.pop_server)
        self.pop_rfile = LogWrapper(self.server.log, s.makefile("rb"))
        # the write side should be unbuffered
        self.pop_wfile = LogWrapper(self.server.log, s.makefile("wb", 0))

    def close_pop(self):
        """Close both file objects for the real POP3 server."""
        self.pop_rfile.close()
        self.pop_wfile.close()

    def handle(self):
        """Run one proxied POP3 session from connect to disconnect."""
        zLOG.LOG("POP3", zLOG.INFO,
                 "Connection from %s" % repr(self.client_address))
        self.server.zodb.sync()
        self.sess_retr_count = 0
        self.connect_pop()
        try:
            self.handle_pop()
        finally:
            self.close_pop()
            if self.sess_retr_count == 1:
                ending = ""
            else:
                ending = "s"
            zLOG.LOG("POP3", zLOG.INFO,
                     "Ending session (%d message%s retrieved)"
                     % (self.sess_retr_count, ending))

    # Commands whose successful response is always multi-line.
    _multiline = {"RETR": True, "TOP": True,}
    # Commands whose successful response is multi-line only when the
    # command was sent without arguments.
    _multiline_noargs = {"LIST": True, "UIDL": True,}

    def is_multiline(self, command, args):
        """Return True if a successful reply to command is multi-line.

        command is expected in upper case, as returned by
        parse_request(); args is the tuple of command arguments.
        """
        if command in self._multiline:
            return True
        if command in self._multiline_noargs and not args:
            return True
        return False

    def parse_request(self, req):
        """Split a request line into (COMMAND, args).

        The command is upper-cased because POP3 keywords are
        case-insensitive, while the tables and handle_* methods of this
        class are keyed on upper-case names.  args is a tuple of the
        remaining whitespace-separated words.  A blank request line
        yields ("", ()) instead of raising IndexError.
        """
        parts = req.split()
        if not parts:
            return "", ()
        return parts[0].upper(), tuple(parts[1:])

    def handle_pop(self):
        """Relay the greeting, then loop over client requests."""
        # send the initial server hello
        hello = self.pop_rfile.readline()
        self.wfile.write(hello)

        # now get client requests and return server responses
        while 1:
            line = self.rfile.readline()
            if line == '':
                break
            self.pop_wfile.write(line)
            if not self.handle_pop_response(line):
                break

    def handle_pop_response(self, req):
        """Read the server's reply to req and relay it to the client.

        Returns True if the connection is still open, False after QUIT
        or when the real server has closed its side.
        """
        cmd, args = self.parse_request(req)
        multiline = self.is_multiline(cmd, args)
        firstline = self.pop_rfile.readline()
        if not firstline:
            # EOF: the real server went away, so the session is over.
            zLOG.LOG("POP3", zLOG.WARNING,
                     "server closed connection after command %s" % cmd)
            return False
        zLOG.LOG("POP3", zLOG.DEBUG, "command %s multiline %s resp %s"
                 % (cmd, multiline, firstline.strip()))

        server_closed = False
        # Only a +OK status line is followed by a multi-line body.  An
        # -ERR reply is a single line; waiting for more would block
        # forever because the server is waiting for the next command.
        if multiline and firstline.startswith("+OK"):
            # Collect the entire response as one string
            lines = []
            while 1:
                line = self.pop_rfile.readline()
                if not line:
                    # EOF in the middle of the body
                    server_closed = True
                    break
                lines.append(line)
                # The response is finished when we get the "." line.
                # A byte-stuffed line starts with ".." and so never
                # equals the terminator.
                # XXX should un-stuff byte-stuffed lines before scoring
                if line == ".\r\n":
                    break
            buf = "".join(lines)
        else:
            buf = None

        # cmd is upper-cased by parse_request(), so only the
        # handle_RETR/TOP/STAT/LIST hooks can match here; lower-case
        # method names such as handle_pop are not reachable by a client.
        handler = getattr(self, "handle_%s" % cmd, None)
        if handler and not server_closed:
            firstline, buf = handler(cmd, args, firstline, buf)

        self.wfile.write(firstline)
        if buf is not None:
            self.wfile.write(buf)
        if cmd == "QUIT" or server_closed:
            return False
        else:
            return True

    def handle_RETR(self, cmd, args, firstline, resp):
        """Score the retrieved message and add the spam header.

        resp is the multi-line body, or None/empty when the server did
        not send one; in that case the reply is passed through as is.
        Returns (firstline, resp) with resp possibly rewritten.
        """
        if not resp:
            return firstline, resp
        try:
            msg = email.message_from_string(resp)
        except email.Errors.MessageParseError:
            err = sys.exc_info()[1]
            zLOG.LOG("POP3", zLOG.WARNING,
                     "Failed to parse msg: %s" % err, error=sys.exc_info())
            resp = self.message_parse_error(resp)
        else:
            self.score_msg(msg)
            resp = msg.as_string()

        self.sess_retr_count += 1
        return firstline, resp

    def handle_TOP(self, cmd, args, firstline, resp):
        """Handle TOP exactly like RETR."""
        # XXX Just handle TOP like RETR?
        return self.handle_RETR(cmd, args, firstline, resp)

    rx_STAT = re.compile(r"\+OK (\d+) (\d+)(.*)", re.DOTALL)

    def handle_STAT(self, cmd, args, firstline, resp):
        """Add the size of the new headers to the STAT total.

        STAT returns the number of messages and the total size.
        Example: +OK 3 340
        A status line that does not have this shape is passed through
        unchanged.
        """
        mo = self.rx_STAT.match(firstline)
        if mo is None:
            return firstline, resp
        count, size, extra = mo.group(1, 2, 3)
        count = int(count)
        size = int(size)
        size += count * HEADER_SIZE
        # extra keeps whatever followed the numbers, including the
        # line terminator (the pattern is compiled with DOTALL).
        firstline = "+OK %d %d%s" % (count, size, extra)
        return firstline, resp

    rx_LIST = re.compile(r"\+OK (\d+) (\d+)(.*)", re.DOTALL)
    rx_LIST_2 = re.compile(r"(\d+) (\d+)(.*)", re.DOTALL)

    def handle_LIST(self, cmd, args, firstline, resp):
        """Add the size of the new header to every size LIST reports.

        With an argument, LIST returns one status line holding the
        message number and size (resp is None).  Without arguments it
        returns a multi-line listing with one "number size" line per
        message (resp is the listing).  The listing is returned with
        every line, including the final ".", terminated by CRLF.
        """
        if resp is None:
            # a single-line response
            mo = self.rx_LIST.match(firstline)
            if mo:
                n, size, extra = mo.group(1, 2, 3)
                size = int(size) + HEADER_SIZE
                firstline = "+OK %s %d%s" % (n, size, extra)
            return firstline, resp

        # a multi-line response; leave anything but +OK alone
        if not firstline.startswith("+OK"):
            return firstline, resp
        # update each line of the response
        L = []
        for line in resp.split("\r\n"):
            if not line:
                continue
            mo = self.rx_LIST_2.match(line)
            if mo:
                n, size, extra = mo.group(1, 2, 3)
                size = int(size) + HEADER_SIZE
                line = "%s %d%s" % (n, size, extra)
            # Re-terminate every line; the client waits for ".\r\n".
            L.append(line + "\r\n")
        return firstline, "".join(L)

    def message_parse_error(self, buf):
        """Return buf with the spam header inserted after its first line.

        Used when the message could not be parsed.  We've already told
        the client to expect more bytes than this buffer contains, but
        there's no clean way to add the header.
        """
        self.server.log.write("# error: %s\n" % repr(buf))

        # XXX what to do?  let's just add it after the first line
        score = self.server.classifier.spamprob(tokenize(buf))

        L = buf.split("\n")
        # HEADER already ends in "\r\n".  Drop its "\n" because the
        # join below puts one back; keeping it would create an empty
        # line that ends the header section early.
        L.insert(1, (HEADER % score)[:-1])
        return "\n".join(L)

    def score_msg(self, msg):
        """Add an X-Spambayes header holding msg's spam probability."""
        score = self.server.classifier.spamprob(tokenize(msg))
        msg.add_header("X-Spambayes", "%5.3f" % score)

def main():
    """Start the POP3 proxy and serve until the process is stopped.

    Opens the pspam database, reads the stored profile, appends a start
    marker to the session log and runs a POP3ProxyServer configured
    from pspam.options.
    """
    db = pspam.database.open()
    conn = db.open()
    profile = conn.root()["profile"]

    log = open("/var/tmp/pop.log", "ab")
    # Same bytes the print statement would produce: the two items
    # separated by a space and followed by a newline.
    log.write("+PROXY start %s\n" % time.ctime())

    listen_addr = ('', options.proxy_port)
    pop_addr = (options.server, options.server_port)
    server = POP3ProxyServer(listen_addr, POP3RequestHandler,
                             profile.classifier, pop_addr, log, conn)
    server.serve_forever()

# Run the proxy only when this file is executed as a script.
if __name__ == "__main__":
    main()

--- NEW FILE: scoremsg.py ---
#! /usr/bin/env python
"""Score a message provided on stdin and show the evidence."""

import ZODB
from ZEO.ClientStorage import ClientStorage

from tokenizer import tokenize

import email
import sys

import pspam.options

def main(fp):
    cs = ClientStorage("/var/tmp/zeospam")
    db = ZODB.DB(cs)
    r = db.open().root()

    # make sure scoring uses the right set of options
    pspam.options.mergefile("/home/jeremy/src/vmspam/vmspam.ini")

    p = r["profile"]

    msg = email.message_from_file(fp)
    prob, evidence = p.classifier.spamprob(tokenize(msg), True)
    print "Score:", prob
    print
    print "Clues"
    print "-----"
    for clue, prob in evidence:
        print clue, prob
##    print
##    print msg

# When run as a script, score the message supplied on standard input.
if __name__ == "__main__":
    main(sys.stdin)

--- NEW FILE: update.py ---
import getopt
import os
import sys

import ZODB
from ZEO.ClientStorage import ClientStorage

import pspam.database
from pspam.profile import Profile
from pspam.options import options

def folder_exists(L, p):
    """Return True if a folder with path p exists in list L.

    Each item of L must have a path attribute; the search stops at the
    first item whose path equals p.
    """
    found = False
    for folder in L:
        if folder.path == p:
            found = True
            break
    return found

def main(rebuild=False):
    """Bring the stored profile up to date with the training folders.

    rebuild -- when true, replace any existing profile with a new one
               before updating

    A profile is also created when the database has none.  Folders
    named in options.ham_folders and options.spam_folders that the
    profile does not know yet are added, then profile.update() is
    called.  Each stage is committed separately.  The database is
    closed on the way out, even when one of the stages raises.
    """
    db = pspam.database.open()
    try:
        r = db.open().root()

        profile = r.get("profile")
        if profile is None or rebuild:
            # if there is no profile, create it
            profile = r["profile"] = Profile(options.folder_dir)
            get_transaction().commit()

        # check for new folders of training data
        for ham in options.ham_folders:
            p = os.path.join(options.folder_dir, ham)
            if not folder_exists(profile.hams, p):
                profile.add_ham(p)

        for spam in options.spam_folders:
            p = os.path.join(options.folder_dir, spam)
            if not folder_exists(profile.spams, p):
                profile.add_spam(p)
        get_transaction().commit()

        # read new messages from folders
        profile.update()
        get_transaction().commit()
    finally:
        db.close()

# Script entry point.  The only option is -F, which forces main() to
# replace the stored profile instead of reusing it.
if __name__ == "__main__":
    FORCE_REBUILD = False
    # getopt raises getopt.GetoptError for any option other than -F.
    opts, args = getopt.getopt(sys.argv[1:], 'F')
    for k, v in opts:
        if k == '-F':
            FORCE_REBUILD = True

    main(FORCE_REBUILD)

--- NEW FILE: vmspam.ini ---
[Train]
folder_dir: /home/jeremy/Mail
spam_folders: train/spam
ham_folders: train/ham

[Score]
max_ham: 0.05
min_spam: 0.99

[Proxy]
server: mail.zope.com
server_port: 110
proxy_port: 1111
log_pop_session: true
log_pop_session_file: /var/tmp/pop.log

[ZODB]
zeo_addr: /var/tmp/zeospam
event_log_file: /var/tmp/zeospam.log
event_log_severity: 0
cache_size: 2000

--- NEW FILE: zeo.sh ---
#! /bin/bash
# Start the ZEO server used by the pspam tools.

# Log file location -- presumably read by zLOG inside the server; it
# matches event_log_file in vmspam.ini.
export STUPID_LOG_FILE=/var/tmp/zeospam.log
# Directory holding the installed ZEO package for python2.3.
export LIBDIR=/usr/local/lib/python2.3/site-packages
# /var/tmp/zeospam is the same address the clients use (zeo_addr in
# vmspam.ini); /var/tmp/zeospam.fs is presumably the storage file --
# confirm against ZEO/start.py.
python2.3 $LIBDIR/ZEO/start.py -U /var/tmp/zeospam /var/tmp/zeospam.fs