[Spambayes-checkins] spambayes/utilities mkreversemap.py,NONE,1.1
Skip Montanaro
montanaro at users.sourceforge.net
Tue Jan 6 10:27:20 EST 2004
Update of /cvsroot/spambayes/spambayes/utilities
In directory sc8-pr-cvs1:/tmp/cvs-serv22747
Added Files:
mkreversemap.py
Log Message:
New script which generates a pickle file mapping features to mailbox files
and message-id's. Use with extractmessages.py.
--- NEW FILE: mkreversemap.py ---
#!/usr/bin/env python
"""
Create mapping from features to message ids
usage %(prog)s [ -h ] -t ham|spam -d mapfile mailbox ...
-d mapfile - identify file which will hold mapping information
-t ham|spam - identify the type of messages in the input mailbox(es)
-h - print this documentation and exit
"""
import sys
import getopt
import anydbm
import cPickle as pickle
from spambayes.mboxutils import getmbox
from spambayes.tokenizer import tokenize
prog = sys.argv[0]
def usage(msg=None):
if msg is not None:
print >> sys.stderr, msg
print >> sys.stderr, __doc__.strip() % globals()
def mapmessages(f, mboxtype, mapdb):
i = 0
for msg in getmbox(f):
i += 1
sys.stdout.write('\r%s: %d' % (f, i))
sys.stdout.flush()
msgid = msg.get("message-id")
if msgid is None:
continue
for t in tokenize(msg):
ham, spam = mapdb.get(t, ({}, {}))
if mboxtype == "ham":
msgids = ham.get(f, set())
msgids.add(msgid)
ham[f] = msgids
else:
msgids = spam.get(f, set())
msgids.add(msgid)
spam[f] = msgids
mapdb[t] = (ham, spam)
sys.stdout.write("\n")
def main(args):
try:
opts, args = getopt.getopt(args, "hd:t:",
["type=", "help", "database="])
except getopt.GetoptError, msg:
usage(msg)
return 1
mapfile = None
mboxtype = None
for opt, arg in opts:
if opt in ("-h", "--help"):
usage()
return 0
elif opt in ("-d", "--database"):
mapfile = arg
elif opt in ("-t", "--type"):
mboxtype = arg
if mapfile is None:
usage("'-d mapfile' is required")
return 1
if mboxtype is None:
usage("'-t ham|spam' is required")
return 1
if mboxtype not in ("ham", "spam"):
usage("mboxtype must be 'ham' or 'spam'")
return 1
try:
mapd = pickle.load(file(mapfile))
except IOError:
mapd = {}
for f in args:
mapmessages(f, mboxtype, mapd)
pickle.dump(mapd, file(mapfile, "w"))
if __name__ == "__main__":
sys.exit(main(sys.argv[1:]))
More information about the Spambayes-checkins
mailing list