[Spambayes-checkins] spambayes/utilities HistToGNU.py,NONE,1.1.2.1 loosecksum.py,NONE,1.1.2.1 mboxcount.py,NONE,1.1.2.1 rebal.py,NONE,1.1.2.1 split.py,NONE,1.1.2.1 splitn.py,NONE,1.1.2.1 splitndirs.py,NONE,1.1.2.1

Fri Jan 10 02:41:10 EST 2003

Update of /cvsroot/spambayes/spambayes/utilities
In directory sc8-pr-cvs1:/tmp/cvs-serv9389/utilities

Added Files:
      Tag: reorg-branch
	HistToGNU.py loosecksum.py mboxcount.py rebal.py split.py 
	splitn.py splitndirs.py 
Log Message:
Checkpointing before I head home.

Still to do: 
 - distutils magic to make sure that the 22compat modules are 
   installed when needed.
 - Walking through testtools and utilities and fixing imports.
 - Documentation.

hammie works, everything else that people use in day-to-day operation
should work - please give it a go.

--- NEW FILE: HistToGNU.py ---
#! /usr/bin/env python

"""HistToGNU.py

    Convert saved binary pickle of histograms to gnu plot output

Usage: %(program)s [options] [histogrampicklefile ...]

reads pickle filename from options if not specified

writes to stdout

"""

# gnuplot commands that plot() writes verbatim at the top of the generated
# script, ahead of the 'plot' command itself.
globalOptions = """
set grid
set xtics 5
set xrange [0.0:100.0]
"""

# Modifiers that plot() inserts after each inline ('-') data source in the
# 'plot' command.
dataSetOptions="smooth unique"

from Options import options
from TestDriver import Hist

import sys
import cPickle as pickle

# Script name as invoked; substituted into the usage text.
program = sys.argv[0]

def usage(code, msg=''):
    """Write the module usage text to stderr, then sys.exit(code).

    A non-empty msg is written first, followed by a blank line.
    """
    emit = sys.stderr.write
    if msg:
        emit("%s\n\n" % (msg,))
    emit("%s\n" % (__doc__ % globals()))
    sys.exit(code)

def loadHist(path):
    """Load and return the histogram object pickled in the file at path.

    The file is opened in binary mode, since the pickles are binary and
    text mode would corrupt them on platforms that translate line endings.
    It is closed before returning, even when unpickling fails.
    """
    # NOTE(review): unpickling can execute arbitrary code; only feed this
    # function pickle files you trust.
    fp = open(path, 'rb')
    try:
        return pickle.load(fp)
    finally:
        fp.close()

def outputHist(hist, f=sys.stdout):
    """Write the buckets of hist to f, one "position count" line each.

    hist.fill_buckets() is called first.  The position of bucket i is
    100.0 * i / hist.nbuckets, written with three decimals.
    """
    hist.fill_buckets()
    counts = hist.buckets
    emit = f.write
    for index in range(len(counts)):
        position = (100.0 * index) / hist.nbuckets
        emit("%.3f %d\n" % (position, counts[index]))

def plot(files):
    """given a list of files, create gnu-plot file"""
    import cStringIO, os
    cmd = cStringIO.StringIO()
    cmd.write(globalOptions)
    args = []
    for file in files:
        args.append("""'-' %s title "%s" """ % (dataSetOptions, file))
    cmd.write('plot %s\n' % ",".join(args))
    for file in files:
        outputHist(loadHist(file), cmd)
        cmd.write('e\n')

    cmd.write('pause 100\n')
    print cmd.getvalue()

def main():
    """Plot the pickles named on the command line.

    With no file arguments, and options.save_histogram_pickles set, the
    ham and spam pickle names are derived from options.pickle_basename.
    """
    import getopt

    try:
        opts, args = getopt.getopt(sys.argv[1:], '', [])
    except getopt.error, msg:
        usage(1, msg)

    if not args and options.save_histogram_pickles:
        args = ["%s_%shist.pik" % (options.pickle_basename, kind)
                for kind in ('ham', 'spam')]

    if not args:
        print "could not locate any files to plot"
        return
    plot(args)

if __name__ == "__main__":
    main()

--- NEW FILE: loosecksum.py ---
#!/usr/local/bin/python

"""
Compute a 'loose' checksum on the msg (file on cmdline or via stdin).

Attempts are made to eliminate content which tends to obscure the 'sameness'
of messages.  This is aimed particularly at spam, which tends to contain
lots of small differences across messages to try and thwart spam filters, in
hopes that at least one copy reaches its destination.

Before calculating the checksum, this script does the following:

    * delete the message header

    * delete HTML tags which generally contain URLs

    * delete anything which looks like an email address or URL

    * finally, discard everything other than ascii letters and digits (note
      that this will almost certainly be ineffectual for spam written in
      eastern languages such as Korean)

An MD5 checksum is then computed for the resulting text and written to stdout.

"""

import getopt
import sys
import email.Parser
import md5
import re
import time
import binascii

def zaptags(data, *tags):
    """Return data with every opening and closing tag named in tags replaced.

    Each entry of tags is either "pattern" (matching tags are deleted) or
    "pattern:replacement"; only the text after the last colon is taken as
    the replacement.  Matching is case-insensitive.
    """
    for spec in tags:
        pieces = spec.split(":")
        if len(pieces) < 2:
            tagpat, repl = pieces[0], ""
        else:
            tagpat, repl = ":".join(pieces[:-1]), pieces[-1]
        if '\\' in repl:
            # NOTE(review): _zap_esc_map is not defined anywhere in the
            # visible source, so a replacement containing a backslash
            # would raise NameError here -- confirm where it should live.
            repl = _zap_esc_map(repl)
        try:
            data = re.sub(r'(?i)</?(%s)(?:\s[^>]*)?>' % tagpat, repl, data)
        except TypeError:
            # Show the offending arguments before letting the error go on.
            print (tagpat, repl, data)
            raise
    return data

def clean(data):
    """Strip the obviously variable parts from a chunk of message text.

    The aim is to remove the bits that keep otherwise identical spam
    messages from looking the same.  Only ascii letters and digits survive.
    """
    # HTML tags that hold URLs tend to have varying content.  (Dropping
    # all HTML tags would be another option.)
    data = zaptags(data, 'a', 'img', 'base', 'frame')
    noise = (
        # anything that looks like an email address
        r"(?i)[-a-z0-9_.+]+@[-a-z0-9_.]+\.([a-z]+)",
        # anything that looks like a url (catches bare urls)
        r"(?i)(ftp|http|gopher)://[-a-z0-9_/?&%@=+:;#!~|.,$*]+",
        # pmguid: stuff (turns up frequently)
        r"pmguid:[^.\s]+(\.[^.\s]+)*",
        # finally, everything other than alpha & digits
        r"[^A-Za-z0-9]+",
    )
    for pattern in noise:
        data = re.sub(pattern, "", data)
    return data

def flatten(obj):
    """Return the body of obj as a single string.

    Three kinds of obj are handled:
      * a string is returned unchanged;
      * anything with a get_payload attribute (an email Message) is
        flattened via its payload;
      * a list is flattened element by element, joined with newlines.

    Raises TypeError for any other type.  There may well be a more direct
    way to get this out of the email package.
    """
    if isinstance(obj, str):
        return obj
    if hasattr(obj, "get_payload"):
        return flatten(obj.get_payload())
    if isinstance(obj, list):
        return "\n".join([flatten(part) for part in obj])
    # Call form of raise: works on every Python version, unlike the
    # "raise Class, value" statement form.
    raise TypeError("unrecognized body type: %s" % type(obj))

def generate_checksum(f):
    """Return the hex MD5 digest of the cleaned body of the message in f.

    f is an open file holding one message; it is parsed with the email
    parser, flattened to a string and cleaned before hashing.
    """
    message = email.Parser.Parser().parse(f)
    text = clean(flatten(message))
    return binascii.b2a_hex(md5.new(text).digest())

def main(args):
    opts, args = getopt.getopt(args, "")
    for opt, arg in opts:
        pass
    if not args:
        inf = sys.stdin
    else:
        inf = file(args[0])

    print generate_checksum(inf)

if __name__ == "__main__":
    main(sys.argv[1:])

--- NEW FILE: mboxcount.py ---
#! /usr/bin/env python

"""Count the number of messages in Unix mboxes.

Usage: %(programs)s [-g] [-h] path1 ...
Options:

    -h
        Print this help message and exit
    -g
        Do globbing on each path.  This is helpful on Windows, where the
        native shells don't glob.
"""

"""
Stats for Barry's corpora, as of 26-Aug-2002, using then-current 2.3a0:

edu-sig-clean.mbox                 252 (+ unparseable: 0)
python-dev-clean.mbox             8326 (+ unparseable: 0)
mailman-developers-clean.mbox     2427 (+ unparseable: 0)
python-list-clean.mbox          159072 (+ unparseable: 2)
zope3-clean.mbox                  2177 (+ unparseable: 0)

Unparseable messages are likely spam.
zope3-clean.mbox is really from the zope3-dev mailing list.
The Python version matters because the email package varies across releases
in whether it uses strict or lax parsing.
"""

import sys
import mailbox
import email
import getopt
import glob

from mboxutils import get_message

# Merely naming True and False raises NameError on an interpreter that
# lacks them; integer stand-ins are bound in that case.
try:
    True, False
except NameError:
    # Maintain compatibility with Python 2.2
    True, False = 1, 0

# Script name as invoked; shown in the usage text.
program = sys.argv[0]

def usage(code, msg=''):
    """Write the usage text to stderr, then msg if given, and exit.

    The module docstring spells its script-name placeholder
    '%(programs)s'; it is filled in with `program` here so the literal
    placeholder is no longer shown to the user.

    code is the process exit status passed to sys.exit().
    """
    sys.stderr.write(__doc__.replace('%(programs)s', program) + "\n")
    if msg:
        sys.stderr.write("%s\n" % (msg,))
    sys.exit(code)

def count(fname):
    """Count the messages in the Unix mbox file fname.

    Returns (goodcount, badcount).  A message with neither a To nor a Cc
    header is counted as bad (main() reports these as "unparseable");
    every other message is counted as good.

    The file is closed even if reading the mailbox raises.
    """
    fp = open(fname, 'rb')
    try:
        mbox = mailbox.PortableUnixMailbox(fp, get_message)
        goodcount = 0
        badcount = 0
        for msg in mbox:
            if msg["to"] is None and msg["cc"] is None:
                badcount += 1
            else:
                goodcount += 1
    finally:
        fp.close()
    return goodcount, badcount

def main():
    """Print one line of message counts for each mbox on the command line.

    -g expands each path with glob first; -h/--help prints usage and exits.
    """
    try:
        opts, args = getopt.getopt(sys.argv[1:], 'hg', ['help'])
    except getopt.error, msg:
        usage(1, msg)

    doglob = False
    for opt, arg in opts:
        if opt == '-g':
            doglob = True
        elif opt in ('-h', '--help'):
            usage(0)

    for path in args:
        fnames = [path]
        if doglob:
            fnames = glob.glob(path)

        for fname in fnames:
            # count() returns the (good, bad) pair the format expects.
            print "%-35s %7d (+ unparseable: %d)" % ((fname,) + count(fname))

if __name__ == '__main__':
    main()

--- NEW FILE: rebal.py ---
#!/usr/bin/env python

"""
rebal.py - rebalance a ham or spam directory, moving files to or from
a reservoir directory as necessary.

usage: rebal.py [ options ]
options:
   -d     - dry run; display what would be moved, but don't do it [%(DRYRUN)s]
   -r res - specify an alternate reservoir [%(RESDIR)s]
   -s set - specify an alternate Set pfx [%(SETPFX)s]
   -n num - specify number of files per Set dir desired [%(NPERDIR)s]
   -v     - tell user what's happening [%(VERBOSE)s]
   -q     - be quiet about what's happening [not %(VERBOSE)s]
   -c     - confirm file moves into Set directory [%(CONFIRM)s]
   -Q     - don't confirm moves; this is independent of -v/-q

The script will work with a variable number of Set directories, but they
must already exist.

Example:

    rebal.py -r reservoir -s Set -n 300

This will move random files between the directory 'reservoir' and the
various subdirectories prefixed with 'Set', making sure no more than 300
files are left in the 'Set' directories when finished.

Example:

Suppose you want to shuffle your Set files around, winding up with 300 files
in each one, you can execute:

    rebal.py -n 0
    rebal.py -n 300

The first run will move all files from the various Data/Ham/Set directories
to the Data/Ham/reservoir directory.  The second run will randomly parcel
out 300 files to each of the Data/Ham/Set directories.
"""

import os
import sys
import random
import glob
import getopt

# Merely naming True and False raises NameError on an interpreter that
# lacks them; integer stand-ins are bound in that case.
try:
    True, False
except NameError:
    # Maintain compatibility with Python 2.2
    True, False = 1, 0

# defaults
# Each value is the starting point for the matching command-line option
# in main() and is interpolated into the usage text.
NPERDIR = 4000                  # -n: files wanted per Set directory
RESDIR = 'Data/Ham/reservoir'   # -r: reservoir directory
SETPFX = 'Data/Ham/Set'         # -s: prefix shared by the Set directories
VERBOSE = True                  # -v / -q: report each move
CONFIRM = True                  # -c / -Q: confirm moves into a Set directory
DRYRUN = False                  # -d: only report what would be moved

def usage(msg):
    """Write msg (if non-empty) and the option summary to stderr.

    msg may be any object; it is converted with str() first.  The current
    module-level defaults are interpolated into the summary.  Unlike the
    usage() helpers of the sibling scripts, this one does not exit.
    """
    msg = str(msg)
    if msg:
        sys.stderr.write(msg + "\n")
    # The -Q line states what main() actually does: it only turns
    # confirmation off and leaves verbosity alone.
    sys.stderr.write("""\
usage: rebal.py [ options ]
options:
   -d     - dry run; display what would be moved, but don't do it [%(DRYRUN)s]
   -r res - specify an alternate reservoir [%(RESDIR)s]
   -s set - specify an alternate Set pfx [%(SETPFX)s]
   -n num - specify number of files per dir [%(NPERDIR)s]
   -v     - tell user what's happening [%(VERBOSE)s]
   -q     - be quiet about what's happening [not %(VERBOSE)s]
   -c     - confirm file moves into Set directory [%(CONFIRM)s]
   -Q     - don't confirm moves; this is independent of -v/-q
""" % globals() + "\n")

def migrate(f, dir, verbose):
    """Rename file f into directory dir, avoiding name clashes.

    The file keeps its base name when that name is free in dir.
    Otherwise random numeric names carrying the original extension are
    tried until an unused one is found.  When verbose is true the move is
    reported on stdout.
    """
    base = os.path.split(f)[-1]
    out = os.path.join(dir, base)
    if os.path.exists(out):
        # The extension never changes, so split it off once rather than
        # on every retry.
        ext = os.path.splitext(base)[1]
        while os.path.exists(out):
            digits = random.randrange(100000000)
            out = os.path.join(dir, str(digits) + ext)
    if verbose:
        sys.stdout.write("moving %s to %s\n" % (f, out))
    os.rename(f, out)

def main(args):
    """Rebalance the Set directories against the reservoir.

    args is the command line without the program name.  Directories with
    more than the requested number of files shed random files into the
    reservoir; directories with fewer are then topped up with random
    files from the reservoir.

    Returns 0 on success (and after -h), 1 on an option error, when no
    Set directories exist, when there are too few files overall, or when
    the user declines the ham/spam mismatch prompt.
    """
    nperdir = NPERDIR
    resdir = RESDIR
    setpfx = SETPFX
    verbose = VERBOSE
    confirm = CONFIRM
    dryrun = DRYRUN

    try:
        opts, args = getopt.getopt(args, "dr:s:n:vqcQh")
    except getopt.GetoptError, msg:
        usage(msg)
        return 1

    for opt, arg in opts:
        if opt == "-n":
            nperdir = int(arg)
        elif opt == "-r":
            resdir = arg
        elif opt == "-s":
            setpfx = arg
        elif opt == "-v":
            verbose = True
        elif opt == "-c":
            confirm = True
        elif opt == "-q":
            verbose = False
        elif opt == "-Q":
            confirm = False
        elif opt == "-d":
            dryrun = True
        elif opt == "-h":
            usage('')
            return 0

    # Bare file names currently in the reservoir.
    res = os.listdir(resdir)

    dirs = glob.glob(setpfx+"*")
    if dirs == []:
        print >> sys.stderr, "no directories beginning with", setpfx, "exist."
        return 1

    # stuff pairs each Set directory with the list of file names in it;
    # n totals the files across the reservoir and every Set directory.
    stuff = []
    n = len(res)
    for dir in dirs:
        fs = os.listdir(dir)
        n += len(fs)
        stuff.append((dir, fs))

    if nperdir * len(dirs) > n:
        print >> sys.stderr, "not enough files to go around - use lower -n."
        return 1

    # weak check against mixing ham and spam
    if (setpfx.find("Ham") >= 0 and resdir.find("Spam") >= 0 or
        setpfx.find("Spam") >= 0 and resdir.find("Ham") >= 0):
        yn = raw_input("Reservoir and Set dirs appear not to match. "
                       "Continue? (y/n) ")
        if yn.lower()[0:1] != 'y':
            return 1

    # if necessary, migrate random files to the reservoir
    for (dir, fs) in stuff:
        if nperdir >= len(fs):
            continue

        random.shuffle(fs)
        movethese = fs[nperdir:]
        del fs[nperdir:]
        if dryrun:
            print "would move", len(movethese), "files from", dir, \
                  "to reservoir", resdir
        else:
            for f in movethese:
                migrate(os.path.join(dir, f), resdir, verbose)
        # NOTE(review): the original names are recorded here, but migrate()
        # picks a new random name on a clash, so an entry may no longer
        # match the file actually sitting in the reservoir.
        res.extend(movethese)

    # randomize reservoir once so we can just bite chunks from the front
    random.shuffle(res)

    # grow Set* directories from the reservoir
    for (dir, fs) in stuff:
        if nperdir == len(fs):
            continue

        movethese = res[:nperdir-len(fs)]
        res = res[nperdir-len(fs):]
        if dryrun:
            print "would move", len(movethese), "files from reservoir", \
                  resdir, "to", dir
        else:
            for f in movethese:
                if confirm:
                    # Show the whole message so the user can judge it.
                    print file(os.path.join(resdir, f)).read()
                    ok = raw_input('good enough? ').lower()
                    if not ok.startswith('y'):
                        # NOTE(review): a declined file stays in the
                        # reservoir, yet it is still added to fs below and
                        # no replacement is drawn, so this directory can
                        # end up short of nperdir.
                        continue
                migrate(os.path.join(resdir, f), dir, verbose)
        fs.extend(movethese)

    return 0

if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))

--- NEW FILE: split.py ---
#! /usr/bin/env python

"""Split an mbox into two files based on a given percentage.

This script will troll through a Unix mbox file randomly assigning each
message to one of two bins.  The split is based on a given float percentage.
E.g.

    % split.py sourcembox 20 mbox20 mbox80

yields two mbox files, where mbox20 contains approximately 20% of the messages
and mbox80 contains 80% of the messages.  Messages are assigned to each bin
randomly.

Usage: %(programs)s [options] sourcembox percent file1 file2
Options:

    -h / --help
        Print this help message and exit

file1 and file2 are where the output goes.  Approximately percent % of
messages will go to file1 and (100 - percent) % of messages will go to file2.

percent is a floating point number between 1 and 99.  sourcembox is a Unix
mailbox file.  All arguments except -h/--help are required.
"""

import sys
import random
import mailbox
import email
import getopt

import mboxutils

# Script name as invoked; shown in the usage text.
program = sys.argv[0]

def usage(code, msg=''):
    """Write the usage text to stderr, then msg if given, and exit.

    The module docstring spells its script-name placeholder
    '%(programs)s'; it is filled in with `program` here.  Plain string
    replacement is used rather than % formatting because the docstring
    also contains bare '%' characters.

    code is the process exit status passed to sys.exit().
    """
    sys.stderr.write(__doc__.replace('%(programs)s', program) + "\n")
    if msg:
        sys.stderr.write("%s\n" % (msg,))
    sys.exit(code)

def main():
    """Randomly split the source mbox into two output files.

    Command line: sourcembox percent file1 file2.  Each message goes to
    file1 with probability percent/100 and to file2 otherwise.  Usage
    errors are reported through usage(), which exits.
    """
    try:
        opts, args = getopt.getopt(sys.argv[1:], 'h', ['help'])
    except getopt.error, msg:
        usage(1, msg)

    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage(0)

    try:
        mboxfile = args[0]
        percent = float(args[1])
        if not (0 < percent < 100):
            raise ValueError
        percent /= 100.0
        bin1 = args[2]
        bin2 = args[3]
    except IndexError:
        usage(1, 'Not enough arguments')
    except ValueError:
        usage(1, 'Percent argument must be a float between 1.0 and 99.0')

    # Cruise
    # Open the input first so that a missing source mbox does not leave
    # freshly truncated output files behind.
    infp = open(mboxfile, 'rb')
    bin1out = open(bin1, 'wb')
    bin2out = open(bin2, 'wb')
    try:
        mbox = mailbox.PortableUnixMailbox(infp, mboxutils.get_message)
        for msg in mbox:
            if random.random() < percent:
                outfp = bin1out
            else:
                outfp = bin2out
            astext = str(msg)
            assert astext.endswith('\n')
            outfp.write(astext)
    finally:
        # Close the three files by their own names.  The loop variable
        # outfp is unbound when the mbox is empty, so closing it raised
        # NameError, and the input file was never closed at all.
        infp.close()
        bin1out.close()
        bin2out.close()

if __name__ == '__main__':
    main()

--- NEW FILE: splitn.py ---
#! /usr/bin/env python

"""Split an mbox into N random mboxes.

Usage: %(program)s [-h] [-s seed] [-v] -n N sourcembox outfilebase

Options:
    -h / --help
        Print this help message and exit

    -s seed
        Seed the random number generator with seed (an integer).
        By default, use system time at startup to seed.

    -v
        Verbose.  Displays a period for each 100 messages parsed.
        May display other stuff.

    -n N
        The number of output mboxes desired.  This is required.

Arguments:
    sourcembox
        The mbox to split.

    outfilebase
        The base path + name prefix for each of the N output files.
        Output mboxes have names of the form
            outfilebase + ("%%d.mbox" %% i)

Example:
    %(program)s -s 123 -n5 spam.mbox rspam

produces 5 mboxes, named rspam1.mbox through rspam5.mbox.  Each contains
a random selection of the messages in spam.mbox, and together they contain
every message in spam.mbox exactly once.  Each has approximately the same
number of messages.  spam.mbox is not altered.  In addition, the seed for
the random number generator is forced to 123, so that while the split is
random, it's reproducible.
"""

import sys
import random
import mailbox
import email
import getopt

import mboxutils

# Merely naming True and False raises NameError on an interpreter that
# lacks them; integer stand-ins are bound in that case.
try:
    True, False
except NameError:
    # Maintain compatibility with Python 2.2
    True, False = 1, 0

# Script name as invoked; substituted into the usage text.
program = sys.argv[0]

def usage(code, msg=''):
    """Write the module usage text to stderr, then msg if given, and
    sys.exit(code)."""
    emit = sys.stderr.write
    emit("%s\n" % (__doc__ % globals()))
    if msg:
        emit("%s\n" % (msg,))
    sys.exit(code)

def main():
    """Distribute the messages of the source mbox over N output mboxes.

    Each message is written to one randomly chosen output file; the files
    are named outfilebase + "<i>.mbox" for i in 1..N.
    """
    try:
        opts, args = getopt.getopt(sys.argv[1:], 'hn:s:v', ['help'])
    except getopt.error, msg:
        usage(1, msg)

    n = None
    verbose = False
    for opt, arg in opts:
        if opt == '-n':
            n = int(arg)
        elif opt == '-s':
            random.seed(int(arg))
        elif opt == '-v':
            verbose = True
        elif opt in ('-h', '--help'):
            usage(0)

    if not (n is not None and n > 1):
        usage(1, "an -n value > 1 is required")

    if len(args) != 2:
        usage(1, "input mbox name and output base path are required")
    inputpath, outputbasepath = args

    infile = file(inputpath, 'rb')
    outfiles = []
    for number in range(1, n+1):
        outfiles.append(file(outputbasepath + ("%d.mbox" % number), 'wb'))

    mbox = mailbox.PortableUnixMailbox(infile, mboxutils.get_message)
    counter = 0
    for msg in mbox:
        # Draw the bin before anything else so that a given seed always
        # yields the same split.
        target = outfiles[random.randrange(n)]
        target.write(str(msg))
        counter += 1
        if verbose and counter % 100 == 0:
            print '.',

    if verbose:
        print
        print counter, "messages split into", n, "files"
    infile.close()
    for outfile in outfiles:
        outfile.close()

if __name__ == '__main__':
    main()

--- NEW FILE: splitndirs.py ---
#! /usr/bin/env python

"""Split an mbox into N random directories of files.

Usage: %(program)s [-h] [-g] [-s seed] [-v] -n N sourcembox ... outdirbase

Options:
    -h / --help
        Print this help message and exit

    -g
        Do globbing on each sourcepath.  This is helpful on Windows, where
        the native shells don't glob, or when you have more mboxes than
        your shell allows you to specify on the commandline.

    -s seed
        Seed the random number generator with seed (an integer).
        By default, use system time at startup to seed.

    -v
        Verbose.  Displays a period for each 100 messages parsed.
        May display other stuff.

    -n N
        The number of output directories desired.  This is required.

Arguments:
    sourcembox
        The mbox or path to an mbox to split.

    outdirbase
        The base path + name prefix for each of the N output dirs.
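        Output files have names of the form
            outdirbase + ("%%d/%%d" %% (i, msgnum))
        where i is the directory number (1..N) and msgnum is the running
        count of messages written so far.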

Example:
    %(program)s -s 123 -n5 Data/spam.mbox Data/Spam/Set

produces 5 directories, named Data/Spam/Set1 through Data/Spam/Set5.  Each
contains a random selection of the messages in spam.mbox, and together
they contain every message in spam.mbox exactly once.  Each has
approximately the same number of messages.  spam.mbox is not altered.  In
addition, the seed for the random number generator is forced to 123, so
that while the split is random, it's reproducible.
"""

import sys
import os
import random
import mailbox
import email
import getopt
import glob

import mboxutils

# Merely naming True and False raises NameError on an interpreter that
# lacks them; integer stand-ins are bound in that case.
try:
    True, False
except NameError:
    # Maintain compatibility with Python 2.2
    True, False = 1, 0

# Script name as invoked; substituted into the usage text.
program = sys.argv[0]

def usage(code, msg=''):
    """Write the module usage text to stderr, then msg if given, and
    sys.exit(code)."""
    emit = sys.stderr.write
    emit("%s\n" % (__doc__ % globals()))
    if msg:
        emit("%s\n" % (msg,))
    sys.exit(code)

def main():
    """Distribute the messages of the source mboxes over N directories.

    Each message is written to its own file, named by a running message
    counter, in one randomly chosen output directory.  The directories
    are named outdirbase + "<i>" for i in 1..N and are created if missing.
    """
    try:
        opts, args = getopt.getopt(sys.argv[1:], 'hgn:s:v', ['help'])
    except getopt.error, msg:
        usage(1, msg)

    doglob = False
    n = None
    verbose = False
    for opt, arg in opts:
        if opt == '-n':
            n = int(arg)
        elif opt == '-s':
            random.seed(int(arg))
        elif opt == '-g':
            doglob = True
        elif opt == '-v':
            verbose = True
        elif opt in ('-h', '--help'):
            usage(0)

    if not (n is not None and n > 1):
        usage(1, "an -n value > 1 is required")

    if len(args) < 2:
        usage(1, "input mbox name and output base path are required")
    inputpaths = args[:-1]
    outputbasepath = args[-1]

    outdirs = []
    for number in range(1, n+1):
        outdir = outputbasepath + ("%d" % number)
        outdirs.append(outdir)
        if not os.path.isdir(outdir):
            os.makedirs(outdir)

    counter = 0
    for inputpath in inputpaths:
        inpaths = [inputpath]
        if doglob:
            inpaths = glob.glob(inputpath)

        for inpath in inpaths:
            for msg in mboxutils.getmbox(inpath):
                # Draw the bin before anything else so that a given seed
                # always yields the same split.
                target = outdirs[random.randrange(n)]
                astext = str(msg)
                counter += 1
                msgfile = open('%s/%d' % (target, counter), 'wb')
                msgfile.write(astext)
                msgfile.close()
                if verbose and counter % 100 == 0:
                    sys.stdout.write('.')
                    sys.stdout.flush()

    if verbose:
        print
        print counter, "messages split into", n, "directories"

if __name__ == '__main__':
    main()