[Python-checkins] python/nondist/sandbox/spambayes split.py,NONE,1.1
bwarsaw@users.sourceforge.net
bwarsaw@users.sourceforge.net
Tue, 20 Aug 2002 14:18:32 -0700
Update of /cvsroot/python/python/nondist/sandbox/spambayes
In directory usw-pr-cvs1:/tmp/cvs-serv28692
Added Files:
split.py
Log Message:
A simple mailbox splitter
--- NEW FILE: split.py ---
#! /usr/bin/env python
"""Split an mbox into two files based on a given percentage.
This script will troll through a Unix mbox file randomly assigning each
message to one of two bins. The split is based on a given float percentage.
E.g.
% split.py -1 mbox20 -2 mbox80 20 sourcembox
yields two mbox files, where mbox20 contains approximately 20% of the messages
and mbox80 contains 80% of the messages. Messages are assigned to each bin
randomly.
Usage: %(programs)s -1 file -2 file [options] percent sourcembox
Options:
-h / --help
Print this help message and exit
-1 file
Names the first output file. Approximately percent % of the messages
from the original source file will end up in this collection.
-2 file
Names the second output file. Approximately 100-percent % of the
messages from the original source file will end up in this
collection.
percent is a floating point number between 1 and 99. sourcembox is a Unix
mailbox file. All arguments except -h/--help are required.
"""
import sys
import random
import mailbox
import email
import getopt
# Name the script was invoked under (first element of the argument vector).
# NOTE(review): presumably meant to fill the script-name placeholder in the
# usage text of the module docstring -- confirm against usage().
program = sys.argv[0]
def usage(code, msg=''):
    """Write the help text, plus an optional message, to stderr and exit.

    code is handed to sys.exit() as the process exit status.  msg, when
    true, is written on its own line after the help text; it may be a
    string or any object with a meaningful str(), such as an exception.
    This function never returns: it always raises SystemExit.
    """
    # The help text is the module docstring, which can be None when the
    # interpreter has stripped docstrings; print nothing rather than "None".
    text = __doc__ or ''
    # Fill in the script name with a literal replace instead of the %
    # operator: the help text also contains bare '%' characters
    # ("% split.py", "percent %") that %-formatting would choke on.
    text = text.replace('%(programs)s', program)
    sys.stderr.write(text + '\n')
    if msg:
        sys.stderr.write(str(msg) + '\n')
    sys.exit(code)
def _factory(fp):
try:
return email.message_from_file(fp)
except email.Errors.MessageParseError:
return None
def main():
    """Parse the command line and split the source mbox into two files.

    Expects the options -1 FILE and -2 FILE followed by two positional
    arguments: a percentage and the path of the source mbox.  Each message
    read from the source is written to the -1 file with probability
    percent/100 and to the -2 file otherwise.

    Any command-line problem is reported through usage(), which exits the
    process with status 1; -h/--help exits with status 0.
    """
    try:
        opts, args = getopt.getopt(sys.argv[1:], 'h1:2:', ['help'])
    except getopt.error:
        # Fetch the exception through sys.exc_info() so this clause does
        # not depend on version-specific "except ..., name" syntax.
        usage(1, sys.exc_info()[1])

    bin1 = bin2 = None
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage(0)
        elif opt == '-1':
            bin1 = arg
        elif opt == '-2':
            bin2 = arg

    if bin1 is None or bin2 is None:
        usage(1, 'Both options -1 and -2 are required')

    try:
        percent = float(args[0])
        # Any value strictly between 0 and 100 is accepted (the message
        # below quotes a narrower 1.0-99.0 range than is enforced).
        if not (0 < percent < 100):
            raise ValueError
        # Convert to a probability for comparison with random.random().
        percent /= 100.0
        mboxfile = args[1]
    except IndexError:
        usage(1, 'Not enough arguments')
    except ValueError:
        usage(1, 'Percent argument must be a float between 1.0 and 99.0')

    # Cruise
    opened = []
    try:
        # Open the source first so that a bad source path fails before
        # either output file has been created or truncated.
        infp = open(mboxfile)
        opened.append(infp)
        bin1out = open(bin1, 'w')
        opened.append(bin1out)
        bin2out = open(bin2, 'w')
        opened.append(bin2out)

        mbox = mailbox.PortableUnixMailbox(infp, _factory)
        for msg in mbox:
            if msg is None or msg == '':
                # Placeholder from the factory for a message that could
                # not be parsed; there is nothing to write.
                continue
            if random.random() < percent:
                outfp = bin1out
            else:
                outfp = bin2out
            # Same output as "print >> outfp, msg": str(msg) plus newline.
            outfp.write(str(msg) + '\n')
    finally:
        # Close every file that was opened, exactly once, even when the
        # mailbox is empty or an error interrupts the run.
        for fp in opened:
            fp.close()
# Run the splitter only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()