[Spambayes-checkins] spambayes/contrib tte.py,1.17,1.18

Fri May 11 02:23:10 CEST 2007

Update of /cvsroot/spambayes/spambayes/contrib
In directory sc8-pr-cvs8.sourceforge.net:/tmp/cvs-serv27616/contrib

Modified Files:
	tte.py 
Log Message:
patch 1707808 from Dave Abrahams - tte patch for imap

Index: tte.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/contrib/tte.py,v
retrieving revision 1.17
retrieving revision 1.18
diff -C2 -d -r1.17 -r1.18
*** tte.py	5 Aug 2006 12:48:09 -0000	1.17
--- tte.py	11 May 2007 00:23:07 -0000	1.18
***************
*** 29,32 ****
--- 29,35 ----
            reduce the amount of input) are retained.

+ -C      - Cull all messages which aren't used as training input during any
+           run by marking them deleted.  Only works with IMAP folders.
+ 
  -o sect:opt:val -
            Set [sect, opt] in the options database to val.
***************
*** 95,98 ****
--- 98,102 ----
          return iter(seq)

+ 
  def train(store, hambox, spambox, maxmsgs, maxrounds, tdict, reverse, verbose,
            ratio):
***************
*** 101,107 ****
      spam_cutoff = Options.options["Categorization", "spam_cutoff"]

!     nspam, nham = ratio

!     while round < maxrounds and (hmisses or smisses or round == 0):
          round += 1
          if verbose:
--- 105,133 ----
      spam_cutoff = Options.options["Categorization", "spam_cutoff"]

!     # list-ify ham and spam iterators immediately.  We don't really want to
!     # fetch the messages multiple times, and this is no worse than what happened
!     # before when -R was passed.
!     hambone_ = list(mboxutils.getmbox(hambox))
!     spamcan_ = list(mboxutils.getmbox(spambox))

!     if reverse:
!         hambone_ = list(reversed(hambone_))
!         spamcan_ = list(reversed(spamcan_))
!     
!     if ratio:
!         rspam,rham = ratio
!     else:
!         rspam,rham = len(spamcan_),len(hambone_)
! 
!     # define some indexing constants
!     ham = 0
!     spam = 1
!     name = ('ham','spam')
!     misses = [0,0]
! 
!     misclassified = lambda is_spam, score: (
!         is_spam and score < spam_cutoff or not is_spam and score > ham_cutoff)
! 
!     while round < maxrounds and (misses[ham] or misses[spam] or round == 0):
          round += 1
          if verbose:
***************
*** 109,173 ****

          start = datetime.datetime.now()
!         hambone = mboxutils.getmbox(hambox)
!         spamcan = mboxutils.getmbox(spambox)
!         if reverse:
!             hambone = reversed(list(hambone))
!             spamcan = reversed(list(spamcan))

!         hmisses = smisses = nmsgs = 0
!         try:
!             while not maxmsgs or nmsgs < maxmsgs:
!                 hams = []
!                 for i in range(nham):
!                     try:
!                         hams.append(hambone.next())
!                     except StopIteration:
!                         # no hams left so exit
!                         if not hams:
!                             raise
!                         # use what we've collected
!                         break

!                 spams = []
!                 for i in range(nspam):
!                     try:
!                         spams.append(spamcan.next())
!                     except StopIteration:
!                         # no spams left so exit
!                         if not spams:
!                             raise
!                         # use what we've collected
!                         break

!                 nmsgs += len(hams) + len(spams)
!                 sys.stdout.write("\r%5d" % nmsgs)
!                 sys.stdout.flush()

!                 for (ham, spam) in map(None, hams, spams):
!                     if ham is not None:
!                         score = store.spamprob(tokenize(ham))
!                         selector = ham["message-id"] or ham["subject"]
!                         if score > ham_cutoff and selector is not None:
!                             if verbose:
!                                 print >> sys.stderr, "miss ham: %.6f %s" % (
!                                     score, selector)
!                             hmisses += 1
!                             tdict[ham["message-id"]] = True
!                             store.learn(tokenize(ham), False)

!                     if spam is not None:
!                         score = store.spamprob(tokenize(spam))
!                         selector = (spam["message-id"] or
!                                     spam["subject"])
!                         if score < spam_cutoff and selector is not None:
!                             if verbose:
!                                 print >> sys.stderr, "miss spam: %.6f %s" % (
!                                     score, selector)
!                             smisses += 1
!                             tdict[spam["message-id"]] = True
!                             store.learn(tokenize(spam), True)

!         except StopIteration:
!             pass

          delta = datetime.datetime.now()-start
--- 135,173 ----

          start = datetime.datetime.now()
!         hambone = iter(hambone_)
!         spamcan = iter(spamcan_)

!         i = [0,0]
!         msgs_processed = 0
!         misses = [0,0]
!         training_sets = [hambone, spamcan]

!         while not maxmsgs or msgs_processed < maxmsgs:

!             # should the next message come from hambone or spamcan?
!             train_spam = i[ham] * rspam > i[spam] * rham

!             try:
!                 train_msg = training_sets[train_spam].next()
!             except StopIteration:
!                 break;

!             i[train_spam] += 1
!             msgs_processed += 1
!             sys.stdout.write("\r%5d" % msgs_processed)
!             sys.stdout.flush()

!             tokens = list(tokenize(train_msg))
!             score = store.spamprob(tokens)
!             selector = train_msg["message-id"] or train_msg["subject"]
! 
!             if misclassified(train_spam,score) and selector is not None:
!                 if verbose:
!                     print >> sys.stderr, "\tmiss %s: %.6f %s" % (
!                         name[train_spam], score, selector)
! 
!                 misses[train_spam] += 1
!                 tdict[train_msg["message-id"]] = True
!                 store.learn(tokens, train_spam)

          delta = datetime.datetime.now()-start
***************
*** 175,212 ****

          print "\rround: %2d, msgs: %4d, ham misses: %3d, spam misses: %3d, %.1fs" % \
!               (round, nmsgs, hmisses, smisses, seconds)

      # We count all untrained messages so the user knows what was skipped.
      # We also tag them for saving so we don't lose messages which might have
      # value in a future run
!     nhamleft = 0
!     try:
!         while True:
!             msg = hambone.next()
!             score = store.spamprob(tokenize(msg))
!             if score > ham_cutoff:
!                 tdict[msg["message-id"]] = True
!                 nhamleft += 1
!     except StopIteration:
!         if nhamleft: print nhamleft, "untrained hams"
! 
!     nspamleft = 0
!     try:
!         while True:
!             msg = spamcan.next()
!             score = store.spamprob(tokenize(msg))
!             if score < spam_cutoff:
!                 tdict[msg["message-id"]] = True
!                 nspamleft += 1
!     except StopIteration:
!         if nspamleft: print nspamleft, "untrained spams"

  def main(args):
      try:
!         opts, args = getopt.getopt(args, "hg:s:d:p:o:m:r:c:vR",
                                     ["help", "good=", "spam=",
                                      "database=", "pickle=", "verbose",
                                      "option=", "max=", "maxrounds=",
!                                     "cullext=", "reverse", "ratio="])
      except getopt.GetoptError, msg:
          usage(msg)
--- 175,232 ----

          print "\rround: %2d, msgs: %4d, ham misses: %3d, spam misses: %3d, %.1fs" % \
!               (round, msgs_processed, misses[0], misses[1], seconds)

+     training_sets = [hambone,spamcan]
+     
      # We count all untrained messages so the user knows what was skipped.
      # We also tag them for saving so we don't lose messages which might have
      # value in a future run
!     for is_spam in ham,spam:
!         nleft = 0
!         try:
!             while True:
!                 msg = training_sets[is_spam].next()
!                 score = store.spamprob(tokenize(msg))
!                 
!                 if misclassified(is_spam,score):
!                     tdict[msg["message-id"]] = True
!                     nleft += 1
!                     
!         except StopIteration:
!             if nleft: print nleft, "untrained %ss" % name[is_spam]

+ def cull(mbox_name, cullext, designation, tdict):
+     print "writing new %s mbox..." % designation
+     n = m = 0
+     if cullext:
+         culled_mbox = file(mbox_name + cullext, "w")
+         
+     for msg in mboxutils.getmbox(mbox_name):
+         m += 1
+         if msg["message-id"] in tdict:
+             if cullext:
+                 culled_mbox.write(str(msg))
+             n += 1
+         elif not cullext:
+             response = msg.imap_server.uid(
+                 "STORE", msg.uid, "+FLAGS.SILENT", "(\\Deleted \\Seen)")
+             command = "set %s to be deleted and seen" % (msg.uid,)
+             msg.imap_server.check_response(command, response)
+         
+         sys.stdout.write("\r%5d of %5d" % (n, m))
+         sys.stdout.flush()
+         
+     sys.stdout.write("\n")
+     
+     if cullext:
+         culled_mbox.close()
+     
  def main(args):
      try:
!         opts, args = getopt.getopt(args, "hg:s:d:p:o:m:r:c:vRuC",
                                     ["help", "good=", "spam=",
                                      "database=", "pickle=", "verbose",
                                      "option=", "max=", "maxrounds=",
!                                     "cullext=", "reverse", "ratio=", "unbalanced"])
      except getopt.GetoptError, msg:
          usage(msg)
***************
*** 231,234 ****
--- 251,256 ----
          elif opt in ("-c", "--cullext"):
              cullext = arg
+         elif opt in ("-C", "--cullext"):
+             cullext = ''
          elif opt in ("-m", "--max"):
              maxmsgs = int(arg)
***************
*** 237,240 ****
--- 259,264 ----
          elif opt in ("-R", "--reverse"):
              reverse = True
+         elif opt in ("-u", "--unbalanced"):
+             sh_ratio = None
          elif opt in ('-o', '--option'):
              Options.options.set_from_cmdline(arg, sys.stderr)
***************
*** 260,291 ****
            sh_ratio)

      store.close()

      if cullext is not None:
!         print "writing new ham mbox..."
!         n = m = 0
!         newham = file(ham + cullext, "w")
!         for msg in mboxutils.getmbox(ham):
!             m += 1
!             if msg["message-id"] in tdict:
!                 newham.write(str(msg))
!                 n += 1
!             sys.stdout.write("\r%5d of %5d" % (n, m))
!             sys.stdout.flush()
!         sys.stdout.write("\n")
!         newham.close()
! 
!         print "writing new spam mbox..."
!         n = m = 0
!         newspam = file(spam + cullext, "w")
!         for msg in mboxutils.getmbox(spam):
!             m += 1
!             if msg["message-id"] in tdict:
!                 newspam.write(str(msg))
!                 n += 1
!             sys.stdout.write("\r%5d of %5d" % (n, m))
!             sys.stdout.flush()
!         sys.stdout.write("\n")
!         newspam.close()

      return 0
--- 284,293 ----
            sh_ratio)

+     store.store()
      store.close()

      if cullext is not None:
!         cull(ham, cullext, 'ham', tdict)
!         cull(spam, cullext, 'spam', tdict)

      return 0