[Spambayes-checkins] spambayes/utilities HistToGNU.py,NONE,1.1.2.1
loosecksum.py,NONE,1.1.2.1 mboxcount.py,NONE,1.1.2.1
rebal.py,NONE,1.1.2.1 split.py,NONE,1.1.2.1 splitn.py,NONE,1.1.2.1
splitndirs.py,NONE,1.1.2.1
Anthony Baxter
anthonybaxter at users.sourceforge.net
Fri Jan 10 02:41:10 EST 2003
- Previous message: [Spambayes-checkins]
spambayes/testtools cmp.py,NONE,1.1.2.1 fpfn.py,NONE,1.1.2.1
mboxtest.py,NONE,1.1.2.1 rates.py,NONE,1.1.2.1
simplexloop.py,NONE,1.1.2.1 table.py,NONE,1.1.2.1
timcv.py,NONE,1.1.2.1 timtest.py,NONE,1.1.2.1 weaktest.py,NONE,1.1.2.1
- Next message: [Spambayes-checkins]
website/pics gutter-hi.png,NONE,1.1 gutter.png,NONE,1.1 logo.png,NONE,1.1
- Messages sorted by:
[ date ]
[ thread ]
[ subject ]
[ author ]
Update of /cvsroot/spambayes/spambayes/utilities
In directory sc8-pr-cvs1:/tmp/cvs-serv9389/utilities
Added Files:
Tag: reorg-branch
HistToGNU.py loosecksum.py mboxcount.py rebal.py split.py
splitn.py splitndirs.py
Log Message:
Checkpointing before I head home.
Still to do:
- distutils magic to make sure that the 22compat modules are
installed when needed.
- Walking through testtools and utilities and fixing imports.
- Documentation.
hammie works, everything else that people use in day-to-day operation
should work - please give it a go.
--- NEW FILE: HistToGNU.py ---
#! /usr/bin/env python
"""HistToGNU.py
Convert saved binary pickle of histograms to gnu plot output
Usage: %(program)s [options] [histogrampicklefile ...]
reads pickle filename from options if not specified
writes to stdout
"""
globalOptions = """
set grid
set xtics 5
set xrange [0.0:100.0]
"""
dataSetOptions="smooth unique"
from Options import options
from TestDriver import Hist
import sys
import cPickle as pickle
program = sys.argv[0]
def usage(code, msg=''):
    """Write the usage text to stderr, then exit with status `code`.

    If `msg` is non-empty it is written first, followed by a blank line.
    The usage text is the module docstring interpolated with the module
    globals (which supplies `program`).
    """
    err = sys.stderr
    if msg:
        err.write("%s\n" % (msg,))
        err.write("\n")
    err.write("%s\n" % (__doc__ % globals(),))
    sys.exit(code)
def loadHist(path):
    """Load and return the pickled histogram object stored at `path`.

    The file is opened in binary mode (the pickles are binary, and text
    mode corrupts them on platforms that translate line endings) and is
    always closed, even when unpickling fails.

    NOTE: unpickling can execute arbitrary code; only load pickle files
    that come from a trusted source.
    """
    fp = open(path, 'rb')
    try:
        return pickle.load(fp)
    finally:
        fp.close()
def outputHist(hist, f=None):
    """Write the buckets of `hist` to `f` as gnuplot data lines.

    `hist` must provide fill_buckets(), a `buckets` sequence and an
    `nbuckets` count.  One line is written per bucket: the bucket's
    position as a percentage (index * 100 / nbuckets, three decimals)
    followed by the bucket's count.

    `f` is any object with a write() method.  When omitted, the current
    sys.stdout is looked up at call time, so a redirected stdout is
    honoured.
    """
    if f is None:
        f = sys.stdout
    hist.fill_buckets()
    nbuckets = hist.nbuckets
    index = 0
    for n in hist.buckets:
        f.write("%.3f %d\n" % ((100.0 * index) / nbuckets, n))
        index += 1
def plot(files):
    """Print a gnuplot script that plots the histogram pickles in `files`.

    The script starts with globalOptions, followed by a single `plot`
    command holding one inline ('-') data source per file, titled with the
    file name.  Each file's data follows, terminated by an 'e' line, and
    the script ends with 'pause 100'.
    """
    import cStringIO
    script = cStringIO.StringIO()
    script.write(globalOptions)
    sources = ["""'-' %s title "%s" """ % (dataSetOptions, name)
               for name in files]
    script.write('plot %s\n' % ",".join(sources))
    for name in files:
        outputHist(loadHist(name), script)
        # 'e' marks the end of one inline data set
        script.write('e\n')
    script.write('pause 100\n')
    sys.stdout.write(script.getvalue() + "\n")
def main():
    """Plot the histogram pickles named on the command line.

    No options are accepted.  When no files are given and
    options.save_histogram_pickles is set, the ham and spam pickle names
    are derived from options.pickle_basename.  If there is still nothing
    to plot, a message is printed instead.
    """
    import getopt
    try:
        opts, args = getopt.getopt(sys.argv[1:], '', [])
    except getopt.error:
        usage(1, sys.exc_info()[1])

    if not args and options.save_histogram_pickles:
        args = ["%s_%shist.pik" % (options.pickle_basename, kind)
                for kind in ('ham', 'spam')]

    if not args:
        sys.stdout.write("could not locate any files to plot\n")
        return
    plot(args)
if __name__ == "__main__":
main()
--- NEW FILE: loosecksum.py ---
#!/usr/local/bin/python
"""
Compute a 'loose' checksum on the msg (file on cmdline or via stdin).
Attempts are made to eliminate content which tends to obscure the 'sameness'
of messages. This is aimed particularly at spam, which tends to contain
lots of small differences across messages to try and thwart spam filters, in
hopes that at least one copy reaches its destination.
Before calculating the checksum, this script does the following:
* delete the message header
* delete HTML tags which generally contain URLs
* delete anything which looks like an email address or URL
* finally, discard everything other than ascii letters and digits (note
that this will almost certainly be ineffectual for spam written in
eastern languages such as Korean)
An MD5 checksum is then computed for the resulting text and written to stdout.
"""
import getopt
import sys
import email.Parser
import md5
import re
import time
import binascii
def zaptags(data, *tags):
    """Return `data` with the given HTML tags removed or replaced.

    Each item in `tags` is either a tag-name pattern such as "a", or
    "pattern:replacement".  The text after the last colon is the
    replacement, and the default replacement is the empty string.  Both
    the opening and closing forms of a tag are matched,
    case-insensitively, including any attributes.  Text between the tags
    is left alone.

    The pattern is inserted into a regular expression unescaped, and the
    replacement is handed to re.sub() as-is, so backslash escapes in it
    are interpreted by the re module.  The previous version routed such
    replacements through a helper that is not defined in this file, which
    raised NameError.

    If re.sub() raises TypeError, the offending arguments are written to
    stderr (stdout carries the checksum) and the error is re-raised.
    """
    for spec in tags:
        cut = spec.rfind(":")
        if cut >= 0:
            pat, sub = spec[:cut], spec[cut + 1:]
        else:
            pat, sub = spec, ""
        try:
            data = re.sub(r'(?i)</?(%s)(?:\s[^>]*)?>' % pat, sub, data)
        except TypeError:
            sys.stderr.write("%r\n" % ((pat, sub, data),))
            raise
    return data
def clean(data):
    """Reduce `data` to the ASCII letters and digits of its stable content.

    This strips the parts that tend to vary between otherwise identical
    spam messages, so that such messages produce the same text.
    """
    # Tags that hold URLs tend to have varying content; drop them first.
    data = zaptags(data, 'a', 'img', 'base', 'frame')

    # Applied in order; each match is deleted.
    scrubbers = (
        # anything that looks like an email address
        r"(?i)[-a-z0-9_.+]+@[-a-z0-9_.]+\.([a-z]+)",
        # bare URLs; NOTE: the scheme list has no "https", so those URLs
        # are not removed by this pattern
        r"(?i)(ftp|http|gopher)://[-a-z0-9_/?&%@=+:;#!~|.,$*]+",
        # pmguid: stuff (turns up frequently)
        r"pmguid:[^.\s]+(\.[^.\s]+)*",
        # finally, everything that is not an ASCII letter or digit
        r"[^A-Za-z0-9]+",
    )
    for pattern in scrubbers:
        data = re.sub(pattern, "", data)
    return data
def flatten(obj):
    """Return the body of `obj` as a single string.

    `obj` may be:
      * a string, returned unchanged;
      * an object with a get_payload() method (an email Message), whose
        payload is flattened recursively;
      * a list of any of these, flattened and joined with newlines;
      * None, the payload of a message whose body was never set, which
        yields the empty string.

    Raises TypeError for any other type.
    """
    if obj is None:
        return ""
    if isinstance(obj, str):
        return obj
    if hasattr(obj, "get_payload"):
        return flatten(obj.get_payload())
    if isinstance(obj, list):
        return "\n".join([flatten(part) for part in obj])
    raise TypeError("unrecognized body type: %s" % type(obj))
def generate_checksum(f):
    """Return the hex MD5 digest of the cleaned body of the message in `f`.

    `f` is an open file object holding one email message.  The message is
    parsed, its body flattened to a string and passed through clean()
    before hashing.
    """
    message = email.Parser.Parser().parse(f)
    digest = md5.new(clean(flatten(message)))
    return digest.hexdigest()
def main(args):
    """Print the loose checksum of one message.

    `args` is the command-line argument list without the program name.
    No options are defined.  The first positional argument, if any, names
    the message file; otherwise the message is read from stdin.  A file
    opened here is closed again, even if checksumming fails.
    """
    opts, args = getopt.getopt(args, "")
    if not args:
        checksum = generate_checksum(sys.stdin)
    else:
        inf = open(args[0])
        try:
            checksum = generate_checksum(inf)
        finally:
            inf.close()
    sys.stdout.write("%s\n" % checksum)
if __name__ == "__main__":
main(sys.argv[1:])
--- NEW FILE: mboxcount.py ---
#! /usr/bin/env python
"""Count the number of messages in Unix mboxes.
Usage: %(programs)s [-g] [-h] path1 ...
Options:
-h
Print this help message and exit
-g
Do globbing on each path. This is helpful on Windows, where the
native shells don't glob.
"""
"""
Stats for Barry's corpora, as of 26-Aug-2002, using then-current 2.3a0:
edu-sig-clean.mbox 252 (+ unparseable: 0)
python-dev-clean.mbox 8326 (+ unparseable: 0)
mailman-developers-clean.mbox 2427 (+ unparseable: 0)
python-list-clean.mbox 159072 (+ unparseable: 2)
zope3-clean.mbox 2177 (+ unparseable: 0)
Unparseable messages are likely spam.
zope3-clean.mbox is really from the zope3-dev mailing list.
The Python version matters because the email package varies across releases
in whether it uses strict or lax parsing.
"""
import sys
import mailbox
import email
import getopt
import glob
from mboxutils import get_message
try:
True, False
except NameError:
# Maintain compatibility with Python 2.2
True, False = 1, 0
program = sys.argv[0]
def usage(code, msg=''):
    """Write the usage text (and `msg`, if any) to stderr, then exit.

    The module docstring names the script through a `%(programs)s`
    placeholder.  It is filled in here with `program`; previously the
    placeholder was printed literally.  A plain string replacement is
    used so the rest of the docstring is never treated as a format string.
    """
    sys.stderr.write(__doc__.replace('%(programs)s', program) + "\n")
    if msg:
        sys.stderr.write("%s\n" % (msg,))
    sys.exit(code)
def count(fname):
    """Count the messages in the Unix mbox file `fname`.

    Returns a (good, bad) pair.  A message counts as bad when it has
    neither a "to" nor a "cc" header, and as good otherwise.
    """
    fp = open(fname, 'rb')
    mbox = mailbox.PortableUnixMailbox(fp, get_message)
    good = bad = 0
    for msg in mbox:
        has_recipient = msg["to"] is not None or msg["cc"] is not None
        if has_recipient:
            good += 1
        else:
            bad += 1
    fp.close()
    return good, bad
def main():
    """Print the message counts of every mbox named on the command line.

    -h/--help prints the usage text and exits.  -g expands each path
    argument with glob before counting.
    """
    try:
        opts, args = getopt.getopt(sys.argv[1:], 'hg', ['help'])
    except getopt.error:
        usage(1, sys.exc_info()[1])

    doglob = False
    for opt, arg in opts:
        if opt == '-g':
            doglob = True
        elif opt in ('-h', '--help'):
            usage(0)

    for path in args:
        if doglob:
            fnames = glob.glob(path)
        else:
            fnames = [path]
        for fname in fnames:
            # count() returns the (good, bad) pair the format expects
            sys.stdout.write("%-35s %7d (+ unparseable: %d)"
                             % ((fname,) + count(fname)) + "\n")
if __name__ == '__main__':
main()
--- NEW FILE: rebal.py ---
#!/usr/bin/env python
"""
rebal.py - rebalance a ham or spam directory, moving files to or from
a reservoir directory as necessary.
usage: rebal.py [ options ]
options:
-d - dry run; display what would be moved, but don't do it [%(DRYRUN)s]
-r res - specify an alternate reservoir [%(RESDIR)s]
-s set - specify an alternate Set pfx [%(SETPFX)s]
-n num - specify number of files per Set dir desired [%(NPERDIR)s]
-v - tell user what's happening [%(VERBOSE)s]
-q - be quiet about what's happening [not %(VERBOSE)s]
-c - confirm file moves into Set directory [%(CONFIRM)s]
-Q - don't confirm moves; this is independent of -v/-q
The script will work with a variable number of Set directories, but they
must already exist.
Example:
rebal.py -r reservoir -s Set -n 300
This will move random files between the directory 'reservoir' and the
various subdirectories prefixed with 'Set', making sure no more than 300
files are left in the 'Set' directories when finished.
Example:
Suppose you want to shuffle your Set files around, winding up with 300 files
in each one, you can execute:
rebal.py -n 0
rebal.py -n 300
The first run will move all files from the various Data/Ham/Set directories
to the Data/Ham/reservoir directory. The second run will randomly parcel
out 300 files to each of the Data/Ham/Set directories.
"""
import os
import sys
import random
import glob
import getopt
try:
True, False
except NameError:
# Maintain compatibility with Python 2.2
True, False = 1, 0
# defaults
NPERDIR = 4000
RESDIR = 'Data/Ham/reservoir'
SETPFX = 'Data/Ham/Set'
VERBOSE = True
CONFIRM = True
DRYRUN = False
def usage(msg):
    """Write `msg` (if non-empty) and the option summary to stderr.

    `msg` may be any object; it is converted with str().  The bracketed
    values in the summary are the module-level defaults.  This does not
    exit; the caller decides the return status.

    The -Q line now describes what the option does (it only turns off
    confirmation, independently of -v/-q), and -h is listed.
    """
    msg = str(msg)
    if msg:
        sys.stderr.write(msg + "\n")
    sys.stderr.write("""\
usage: rebal.py [ options ]
options:
-d - dry run; display what would be moved, but don't do it [%(DRYRUN)s]
-r res - specify an alternate reservoir [%(RESDIR)s]
-s set - specify an alternate Set pfx [%(SETPFX)s]
-n num - specify number of files per dir [%(NPERDIR)s]
-v - tell user what's happening [%(VERBOSE)s]
-q - be quiet about what's happening [not %(VERBOSE)s]
-c - confirm file moves into Set directory [%(CONFIRM)s]
-Q - don't confirm moves; this is independent of -v/-q
-h - print this usage message and exit
""" % globals() + "\n")
def migrate(f, dir, verbose):
    """Move file `f` into directory `dir` without overwriting anything.

    The file keeps its base name unless that name already exists in `dir`.
    In that case random numeric names, with the original extension, are
    tried until an unused one is found.  When `verbose` is true the move
    is reported on stdout.
    """
    base = os.path.basename(f)
    ext = os.path.splitext(base)[1]
    out = os.path.join(dir, base)
    while os.path.exists(out):
        out = os.path.join(dir, "%d%s" % (random.randrange(100000000), ext))
    if verbose:
        sys.stdout.write("moving %s to %s\n" % (f, out))
    os.rename(f, out)
def main(args):
    """Rebalance the Set directories against the reservoir.

    `args` is the command-line argument list without the program name.
    Returns 0 on success (and after -h) and 1 on any error or when the
    user declines to continue.

    The work happens in two passes.  Set directories holding more than
    the requested number of files first give random files back to the
    reservoir.  Directories holding fewer are then topped up with random
    files from the reservoir, optionally asking for confirmation of each.
    """
    nperdir, resdir, setpfx = NPERDIR, RESDIR, SETPFX
    verbose, confirm, dryrun = VERBOSE, CONFIRM, DRYRUN

    try:
        opts, args = getopt.getopt(args, "dr:s:n:vqcQh")
    except getopt.GetoptError:
        usage(sys.exc_info()[1])
        return 1

    for opt, arg in opts:
        if opt == "-h":
            usage('')
            return 0
        elif opt == "-n":
            nperdir = int(arg)
        elif opt == "-r":
            resdir = arg
        elif opt == "-s":
            setpfx = arg
        elif opt == "-d":
            dryrun = True
        elif opt == "-v":
            verbose = True
        elif opt == "-q":
            verbose = False
        elif opt == "-c":
            confirm = True
        elif opt == "-Q":
            confirm = False

    res = os.listdir(resdir)
    dirs = glob.glob(setpfx+"*")
    if not dirs:
        sys.stderr.write("no directories beginning with %s exist.\n"
                         % (setpfx,))
        return 1

    # (directory, file names) for every Set directory, plus the total
    # number of files available, counting the reservoir.
    stuff = []
    total = len(res)
    for setdir in dirs:
        names = os.listdir(setdir)
        total += len(names)
        stuff.append((setdir, names))

    if nperdir * len(dirs) > total:
        sys.stderr.write("not enough files to go around - use lower -n.\n")
        return 1

    # weak check against mixing ham and spam
    ham_set_spam_res = setpfx.find("Ham") >= 0 and resdir.find("Spam") >= 0
    spam_set_ham_res = setpfx.find("Spam") >= 0 and resdir.find("Ham") >= 0
    if ham_set_spam_res or spam_set_ham_res:
        yn = raw_input("Reservoir and Set dirs appear not to match. "
                       "Continue? (y/n) ")
        if not yn.lower().startswith('y'):
            return 1

    # pass 1: shrink over-full Set directories into the reservoir
    for (setdir, names) in stuff:
        if len(names) > nperdir:
            random.shuffle(names)
            surplus = names[nperdir:]
            del names[nperdir:]
            if dryrun:
                sys.stdout.write(
                    "would move %s files from %s to reservoir %s\n"
                    % (len(surplus), setdir, resdir))
            else:
                for name in surplus:
                    migrate(os.path.join(setdir, name), resdir, verbose)
            res.extend(surplus)

    # randomize reservoir once so we can just bite chunks from the front
    random.shuffle(res)

    # pass 2: grow Set directories from the reservoir
    for (setdir, names) in stuff:
        shortfall = nperdir - len(names)
        if shortfall == 0:
            continue
        chosen = res[:shortfall]
        res = res[shortfall:]
        if dryrun:
            sys.stdout.write(
                "would move %s files from reservoir %s to %s\n"
                % (len(chosen), resdir, setdir))
        else:
            for name in chosen:
                source = os.path.join(resdir, name)
                if confirm:
                    # show the message and let the user veto the move
                    sys.stdout.write(open(source).read() + "\n")
                    ok = raw_input('good enough? ').lower()
                    if not ok.startswith('y'):
                        continue
                migrate(source, setdir, verbose)
        names.extend(chosen)

    return 0
if __name__ == "__main__":
sys.exit(main(sys.argv[1:]))
--- NEW FILE: split.py ---
#! /usr/bin/env python
"""Split an mbox into two files based on a given percentage.
This script will troll through a Unix mbox file randomly assigning each
message to one of two bins. The split is based on a given float percentage.
E.g.
% split.py sourcembox 20 mbox20 mbox80
yields two mbox files, where mbox20 contains approximately 20% of the messages
and mbox80 contains 80% of the messages. Messages are assigned to each bin
randomly.
Usage: %(programs)s [options] sourcembox percent file1 file2
Options:
-h / --help
Print this help message and exit
file1 and file2 are where the output goes. Approximately percent % of
messages will go to file1 and (100 - percent) % of messages will go to file2.
percent is a floating point number greater than 0 and less than 100.
sourcembox is a Unix mailbox file. All arguments except -h/--help are required.
"""
import sys
import random
import mailbox
import email
import getopt
import mboxutils
program = sys.argv[0]
def usage(code, msg=''):
    """Write the usage text (and `msg`, if any) to stderr, then exit.

    The module docstring names the script through a `%(programs)s`
    placeholder.  It is filled in here with `program`; previously the
    placeholder was printed literally.  The docstring also contains bare
    '%' characters, so a plain string replacement is used rather than
    %-formatting.
    """
    sys.stderr.write(__doc__.replace('%(programs)s', program) + "\n")
    if msg:
        sys.stderr.write("%s\n" % (msg,))
    sys.exit(code)
def main():
    """Randomly split the source mbox into two output mboxes.

    Command line: sourcembox percent file1 file2.  Each message goes to
    file1 with probability percent/100 and to file2 otherwise.

    Changes from the previous version:
      * an empty source mbox no longer fails; the old code closed the
        "last used" output file, which is undefined when no message was
        read;
      * the source is opened before the outputs, so a missing source no
        longer leaves two truncated output files behind;
      * all three files are closed even if splitting fails;
      * the percent error message matches the range actually accepted.
    """
    try:
        opts, args = getopt.getopt(sys.argv[1:], 'h', ['help'])
    except getopt.error:
        usage(1, sys.exc_info()[1])

    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage(0)

    try:
        mboxfile = args[0]
        percent = float(args[1])
        if not (0 < percent < 100):
            raise ValueError
        percent /= 100.0
        bin1 = args[2]
        bin2 = args[3]
    except IndexError:
        usage(1, 'Not enough arguments')
    except ValueError:
        usage(1, 'Percent argument must be a float greater than 0 '
                 'and less than 100')

    # Cruise
    infp = open(mboxfile, 'rb')
    bin1out = open(bin1, 'wb')
    bin2out = open(bin2, 'wb')
    try:
        mbox = mailbox.PortableUnixMailbox(infp, mboxutils.get_message)
        for msg in mbox:
            if random.random() < percent:
                outfp = bin1out
            else:
                outfp = bin2out
            astext = str(msg)
            # a message without a trailing newline would run into the next
            assert astext.endswith('\n')
            outfp.write(astext)
    finally:
        infp.close()
        bin1out.close()
        bin2out.close()
if __name__ == '__main__':
main()
--- NEW FILE: splitn.py ---
#! /usr/bin/env python
"""Split an mbox into N random mboxes.
Usage: %(program)s [-h] [-s seed] [-v] -n N sourcembox outfilebase
Options:
-h / --help
Print this help message and exit
-s seed
Seed the random number generator with seed (an integer).
By default, use system time at startup to seed.
-v
Verbose. Displays a period for each 100 messages parsed.
May display other stuff.
-n N
The number of output mboxes desired. This is required.
Arguments:
sourcembox
The mbox to split.
outfilebase
The base path + name prefix for each of the N output files.
Output mboxes have names of the form
outfilebase + ("%%d.mbox" %% i)
Example:
%(program)s -s 123 -n5 spam.mbox rspam
produces 5 mboxes, named rspam1.mbox through rspam5.mbox. Each contains
a random selection of the messages in spam.mbox, and together they contain
every message in spam.mbox exactly once. Each has approximately the same
number of messages. spam.mbox is not altered. In addition, the seed for
the random number generator is forced to 123, so that while the split is
random, it's reproducible.
"""
import sys
import random
import mailbox
import email
import getopt
import mboxutils
try:
True, False
except NameError:
# Maintain compatibility with Python 2.2
True, False = 1, 0
program = sys.argv[0]
def usage(code, msg=''):
    """Write the usage text (and `msg`, if any) to stderr, then exit.

    The usage text is the module docstring interpolated with the module
    globals.  The process exits with status `code`.
    """
    err = sys.stderr
    err.write("%s\n" % (__doc__ % globals(),))
    if msg:
        err.write("%s\n" % (msg,))
    sys.exit(code)
def main():
    """Distribute the messages of one mbox randomly over N output mboxes.

    Options: -n N (required, N > 1), -s seed, -v, -h/--help.  The output
    files are named outputbasepath + "<i>.mbox" for i in 1..N.

    Changes from the previous version:
      * with -v, the progress dot printed every 100 messages is written
        and flushed immediately, as the equivalent loop in splitndirs.py
        does, so progress is visible while the split is running;
      * the input and output files are closed even if splitting fails.
    """
    try:
        opts, args = getopt.getopt(sys.argv[1:], 'hn:s:v', ['help'])
    except getopt.error:
        usage(1, sys.exc_info()[1])

    n = None
    verbose = False
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage(0)
        elif opt == '-s':
            random.seed(int(arg))
        elif opt == '-n':
            n = int(arg)
        elif opt == '-v':
            verbose = True

    if n is None or n <= 1:
        usage(1, "an -n value > 1 is required")

    if len(args) != 2:
        usage(1, "input mbox name and output base path are required")
    inputpath, outputbasepath = args

    infile = open(inputpath, 'rb')
    outfiles = [open(outputbasepath + ("%d.mbox" % i), 'wb')
                for i in range(1, n+1)]
    counter = 0
    try:
        mbox = mailbox.PortableUnixMailbox(infile, mboxutils.get_message)
        for msg in mbox:
            i = random.randrange(n)
            outfiles[i].write(str(msg))
            counter += 1
            if verbose and counter % 100 == 0:
                sys.stdout.write('.')
                sys.stdout.flush()
    finally:
        infile.close()
        for f in outfiles:
            f.close()

    if verbose:
        sys.stdout.write("\n")
        sys.stdout.write("%s messages split into %s files\n" % (counter, n))
if __name__ == '__main__':
main()
--- NEW FILE: splitndirs.py ---
#! /usr/bin/env python
"""Split an mbox into N random directories of files.
Usage: %(program)s [-h] [-g] [-s seed] [-v] -n N sourcembox ... outdirbase
Options:
-h / --help
Print this help message and exit
-g
Do globbing on each sourcepath. This is helpful on Windows, where
the native shells don't glob, or when you have more mboxes than
your shell allows you to specify on the commandline.
-s seed
Seed the random number generator with seed (an integer).
By default, use system time at startup to seed.
-v
Verbose. Displays a period for each 100 messages parsed.
May display other stuff.
-n N
The number of output directories desired. This is required.
Arguments:
sourcembox
The mbox or path to an mbox to split.
outdirbase
The base path + name prefix for each of the N output dirs.
Output files have names of the form
outdirbase + ("%%d/%%d" %% (i, msgnum))
where i is the directory number and msgnum counts messages across all inputs.
Example:
%(program)s -s 123 -n5 Data/spam.mbox Data/Spam/Set
produces 5 directories, named Data/Spam/Set1 through Data/Spam/Set5. Each
contains a random selection of the messages in spam.mbox, and together
they contain every message in spam.mbox exactly once. Each has
approximately the same number of messages. spam.mbox is not altered. In
addition, the seed for the random number generator is forced to 123, so
that while the split is random, it's reproducible.
"""
import sys
import os
import random
import mailbox
import email
import getopt
import glob
import mboxutils
try:
True, False
except NameError:
# Maintain compatibility with Python 2.2
True, False = 1, 0
program = sys.argv[0]
def usage(code, msg=''):
    """Write the usage text (and `msg`, if any) to stderr, then exit.

    The usage text is the module docstring interpolated with the module
    globals.  The process exits with status `code`.
    """
    err = sys.stderr
    err.write("%s\n" % (__doc__ % globals(),))
    if msg:
        err.write("%s\n" % (msg,))
    sys.exit(code)
def main():
    """Distribute the messages of the input mboxes over N directories.

    Options: -n N (required, N > 1), -g (glob each input path), -s seed,
    -v, -h/--help.  The last positional argument is the output base path;
    all earlier ones are inputs.  Directories outputbasepath + "<i>" for
    i in 1..N are created if missing.  Each message is written to its own
    file in a randomly chosen directory, named by a counter that runs
    across all inputs.
    """
    try:
        opts, args = getopt.getopt(sys.argv[1:], 'hgn:s:v', ['help'])
    except getopt.error:
        usage(1, sys.exc_info()[1])

    doglob = False
    n = None
    verbose = False
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage(0)
        elif opt == '-n':
            n = int(arg)
        elif opt == '-s':
            random.seed(int(arg))
        elif opt == '-g':
            doglob = True
        elif opt == '-v':
            verbose = True

    if n is None or n <= 1:
        usage(1, "an -n value > 1 is required")
    if len(args) < 2:
        usage(1, "input mbox name and output base path are required")

    inputpaths = args[:-1]
    outputbasepath = args[-1]

    outdirs = ["%s%d" % (outputbasepath, i) for i in range(1, n+1)]
    for outdir in outdirs:
        if not os.path.isdir(outdir):
            os.makedirs(outdir)

    counter = 0
    for inputpath in inputpaths:
        if doglob:
            inpaths = glob.glob(inputpath)
        else:
            inpaths = [inputpath]

        for inpath in inpaths:
            for msg in mboxutils.getmbox(inpath):
                target = outdirs[random.randrange(n)]
                astext = str(msg)
                counter += 1
                msgfile = open('%s/%d' % (target, counter), 'wb')
                msgfile.write(astext)
                msgfile.close()
                # one progress dot per 100 messages
                if verbose and counter % 100 == 0:
                    sys.stdout.write('.')
                    sys.stdout.flush()

    if verbose:
        sys.stdout.write("\n")
        sys.stdout.write("%s messages split into %s directories\n"
                         % (counter, n))
if __name__ == '__main__':
main()
- Previous message: [Spambayes-checkins]
spambayes/testtools cmp.py,NONE,1.1.2.1 fpfn.py,NONE,1.1.2.1
mboxtest.py,NONE,1.1.2.1 rates.py,NONE,1.1.2.1
simplexloop.py,NONE,1.1.2.1 table.py,NONE,1.1.2.1
timcv.py,NONE,1.1.2.1 timtest.py,NONE,1.1.2.1 weaktest.py,NONE,1.1.2.1
- Next message: [Spambayes-checkins]
website/pics gutter-hi.png,NONE,1.1 gutter.png,NONE,1.1 logo.png,NONE,1.1
- Messages sorted by:
[ date ]
[ thread ]
[ subject ]
[ author ]
More information about the Spambayes-checkins
mailing list