[Spambayes-checkins] spambayes/contrib tte.py,1.17,1.18
Skip Montanaro
montanaro at users.sourceforge.net
Fri May 11 02:23:10 CEST 2007
Update of /cvsroot/spambayes/spambayes/contrib
In directory sc8-pr-cvs8.sourceforge.net:/tmp/cvs-serv27616/contrib
Modified Files:
tte.py
Log Message:
patch 1707808 from Dave Abrahams - tte patch for imap
Index: tte.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/contrib/tte.py,v
retrieving revision 1.17
retrieving revision 1.18
diff -C2 -d -r1.17 -r1.18
*** tte.py 5 Aug 2006 12:48:09 -0000 1.17
--- tte.py 11 May 2007 00:23:07 -0000 1.18
***************
*** 29,32 ****
--- 29,35 ----
reduce the amount of input) are retained.
+ -C - Cull all messages which aren't used as training input during any
+ run by marking them deleted. Only works with IMAP folders.
+
-o sect:opt:val -
Set [sect, opt] in the options database to val.
***************
*** 95,98 ****
--- 98,102 ----
return iter(seq)
+
def train(store, hambox, spambox, maxmsgs, maxrounds, tdict, reverse, verbose,
ratio):
***************
*** 101,107 ****
spam_cutoff = Options.options["Categorization", "spam_cutoff"]
! nspam, nham = ratio
! while round < maxrounds and (hmisses or smisses or round == 0):
round += 1
if verbose:
--- 105,133 ----
spam_cutoff = Options.options["Categorization", "spam_cutoff"]
! # list-ify ham and spam iterators immediately. We don't really want to
! # fetch the messages multiple times, and this is no worse than what happened
! # before when -R was passed.
! hambone_ = list(mboxutils.getmbox(hambox))
! spamcan_ = list(mboxutils.getmbox(spambox))
! if reverse:
! hambone_ = list(reversed(hambone_))
! spamcan_ = list(reversed(spamcan_))
!
! if ratio:
! rspam,rham = ratio
! else:
! rspam,rham = len(spamcan_),len(hambone_)
!
! # define some indexing constants
! ham = 0
! spam = 1
! name = ('ham','spam')
! misses = [0,0]
!
! misclassified = lambda is_spam, score: (
! is_spam and score < spam_cutoff or not is_spam and score > ham_cutoff)
!
! while round < maxrounds and (misses[ham] or misses[spam] or round == 0):
round += 1
if verbose:
***************
*** 109,173 ****
start = datetime.datetime.now()
! hambone = mboxutils.getmbox(hambox)
! spamcan = mboxutils.getmbox(spambox)
! if reverse:
! hambone = reversed(list(hambone))
! spamcan = reversed(list(spamcan))
! hmisses = smisses = nmsgs = 0
! try:
! while not maxmsgs or nmsgs < maxmsgs:
! hams = []
! for i in range(nham):
! try:
! hams.append(hambone.next())
! except StopIteration:
! # no hams left so exit
! if not hams:
! raise
! # use what we've collected
! break
! spams = []
! for i in range(nspam):
! try:
! spams.append(spamcan.next())
! except StopIteration:
! # no spams left so exit
! if not spams:
! raise
! # use what we've collected
! break
! nmsgs += len(hams) + len(spams)
! sys.stdout.write("\r%5d" % nmsgs)
! sys.stdout.flush()
! for (ham, spam) in map(None, hams, spams):
! if ham is not None:
! score = store.spamprob(tokenize(ham))
! selector = ham["message-id"] or ham["subject"]
! if score > ham_cutoff and selector is not None:
! if verbose:
! print >> sys.stderr, "miss ham: %.6f %s" % (
! score, selector)
! hmisses += 1
! tdict[ham["message-id"]] = True
! store.learn(tokenize(ham), False)
! if spam is not None:
! score = store.spamprob(tokenize(spam))
! selector = (spam["message-id"] or
! spam["subject"])
! if score < spam_cutoff and selector is not None:
! if verbose:
! print >> sys.stderr, "miss spam: %.6f %s" % (
! score, selector)
! smisses += 1
! tdict[spam["message-id"]] = True
! store.learn(tokenize(spam), True)
! except StopIteration:
! pass
delta = datetime.datetime.now()-start
--- 135,173 ----
start = datetime.datetime.now()
! hambone = iter(hambone_)
! spamcan = iter(spamcan_)
! i = [0,0]
! msgs_processed = 0
! misses = [0,0]
! training_sets = [hambone, spamcan]
! while not maxmsgs or msgs_processed < maxmsgs:
! # should the next message come from hambone or spamcan?
! train_spam = i[ham] * rspam > i[spam] * rham
! try:
! train_msg = training_sets[train_spam].next()
! except StopIteration:
! break;
! i[train_spam] += 1
! msgs_processed += 1
! sys.stdout.write("\r%5d" % msgs_processed)
! sys.stdout.flush()
! tokens = list(tokenize(train_msg))
! score = store.spamprob(tokens)
! selector = train_msg["message-id"] or train_msg["subject"]
!
! if misclassified(train_spam,score) and selector is not None:
! if verbose:
! print >> sys.stderr, "\tmiss %s: %.6f %s" % (
! name[train_spam], score, selector)
!
! misses[train_spam] += 1
! tdict[train_msg["message-id"]] = True
! store.learn(tokens, train_spam)
delta = datetime.datetime.now()-start
***************
*** 175,212 ****
print "\rround: %2d, msgs: %4d, ham misses: %3d, spam misses: %3d, %.1fs" % \
! (round, nmsgs, hmisses, smisses, seconds)
# We count all untrained messages so the user knows what was skipped.
# We also tag them for saving so we don't lose messages which might have
# value in a future run
! nhamleft = 0
! try:
! while True:
! msg = hambone.next()
! score = store.spamprob(tokenize(msg))
! if score > ham_cutoff:
! tdict[msg["message-id"]] = True
! nhamleft += 1
! except StopIteration:
! if nhamleft: print nhamleft, "untrained hams"
!
! nspamleft = 0
! try:
! while True:
! msg = spamcan.next()
! score = store.spamprob(tokenize(msg))
! if score < spam_cutoff:
! tdict[msg["message-id"]] = True
! nspamleft += 1
! except StopIteration:
! if nspamleft: print nspamleft, "untrained spams"
def main(args):
try:
! opts, args = getopt.getopt(args, "hg:s:d:p:o:m:r:c:vR",
["help", "good=", "spam=",
"database=", "pickle=", "verbose",
"option=", "max=", "maxrounds=",
! "cullext=", "reverse", "ratio="])
except getopt.GetoptError, msg:
usage(msg)
--- 175,232 ----
print "\rround: %2d, msgs: %4d, ham misses: %3d, spam misses: %3d, %.1fs" % \
! (round, msgs_processed, misses[0], misses[1], seconds)
+ training_sets = [hambone,spamcan]
+
# We count all untrained messages so the user knows what was skipped.
# We also tag them for saving so we don't lose messages which might have
# value in a future run
! for is_spam in ham,spam:
! nleft = 0
! try:
! while True:
! msg = training_sets[is_spam].next()
! score = store.spamprob(tokenize(msg))
!
! if misclassified(is_spam,score):
! tdict[msg["message-id"]] = True
! nleft += 1
!
! except StopIteration:
! if nleft: print nleft, "untrained %ss" % name[is_spam]
+ def cull(mbox_name, cullext, designation, tdict):
+ print "writing new %s mbox..." % designation
+ n = m = 0
+ if cullext:
+ culled_mbox = file(mbox_name + cullext, "w")
+
+ for msg in mboxutils.getmbox(mbox_name):
+ m += 1
+ if msg["message-id"] in tdict:
+ if cullext:
+ culled_mbox.write(str(msg))
+ n += 1
+ elif not cullext:
+ response = msg.imap_server.uid(
+ "STORE", msg.uid, "+FLAGS.SILENT", "(\\Deleted \\Seen)")
+ command = "set %s to be deleted and seen" % (msg.uid,)
+ msg.imap_server.check_response(command, response)
+
+ sys.stdout.write("\r%5d of %5d" % (n, m))
+ sys.stdout.flush()
+
+ sys.stdout.write("\n")
+
+ if cullext:
+ culled_mbox.close()
+
def main(args):
try:
! opts, args = getopt.getopt(args, "hg:s:d:p:o:m:r:c:vRuC",
["help", "good=", "spam=",
"database=", "pickle=", "verbose",
"option=", "max=", "maxrounds=",
! "cullext=", "reverse", "ratio=", "unbalanced"])
except getopt.GetoptError, msg:
usage(msg)
***************
*** 231,234 ****
--- 251,256 ----
elif opt in ("-c", "--cullext"):
cullext = arg
+ elif opt in ("-C", "--cullext"):
+ cullext = ''
elif opt in ("-m", "--max"):
maxmsgs = int(arg)
***************
*** 237,240 ****
--- 259,264 ----
elif opt in ("-R", "--reverse"):
reverse = True
+ elif opt in ("-u", "--unbalanced"):
+ sh_ratio = None
elif opt in ('-o', '--option'):
Options.options.set_from_cmdline(arg, sys.stderr)
***************
*** 260,291 ****
sh_ratio)
store.close()
if cullext is not None:
! print "writing new ham mbox..."
! n = m = 0
! newham = file(ham + cullext, "w")
! for msg in mboxutils.getmbox(ham):
! m += 1
! if msg["message-id"] in tdict:
! newham.write(str(msg))
! n += 1
! sys.stdout.write("\r%5d of %5d" % (n, m))
! sys.stdout.flush()
! sys.stdout.write("\n")
! newham.close()
!
! print "writing new spam mbox..."
! n = m = 0
! newspam = file(spam + cullext, "w")
! for msg in mboxutils.getmbox(spam):
! m += 1
! if msg["message-id"] in tdict:
! newspam.write(str(msg))
! n += 1
! sys.stdout.write("\r%5d of %5d" % (n, m))
! sys.stdout.flush()
! sys.stdout.write("\n")
! newspam.close()
return 0
--- 284,293 ----
sh_ratio)
+ store.store()
store.close()
if cullext is not None:
! cull(ham, cullext, 'ham', tdict)
! cull(spam, cullext, 'spam', tdict)
return 0
More information about the Spambayes-checkins
mailing list