[Spambayes-checkins] spambayes/contrib tte.py,1.4,1.5

Wed Feb 25 17:19:46 EST 2004

Update of /cvsroot/spambayes/spambayes/contrib
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv17329

Modified Files:
	tte.py 
Log Message:
Add a -c flag.  "-c ext" triggers writing of a new pair of ham and spam
files (both unix mbox files, named by appending ext to the input file names)
which contain just the messages that were trained on at least once during
the run, plus any messages that were never considered at all.  This can be
used to cull useless messages from the data sets.

Index: tte.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/contrib/tte.py,v
retrieving revision 1.4
retrieving revision 1.5
diff -C2 -d -r1.4 -r1.5
*** tte.py	13 Feb 2004 19:43:36 -0000	1.4
--- tte.py	25 Feb 2004 22:19:43 -0000	1.5
***************
*** 5,26 ****
  everything scores properly.

! usage %(prog)s [ -h ] -g file -s file [ -d file | -p file ] [ -m N ] [ -r N ]

! -h      - print this documentation and exit.

! -g file - take ham from file

! -s file - take spam from file

! -d file - use a database-based classifier named file

! -p file - use a pickle-based classifier named file

! -m N    - train on at most N messages (nham == N/2 and nspam == N/2)

! -r N    - run at most N rounds (default %(MAXROUNDS)s), even if not
!           all messages score correctly

! See Gary Robinson's blog:

      http://www.garyrobinson.net/2004/02/spam_filtering_.html
--- 5,40 ----
  everything scores properly.

! usage %(prog)s [ -h ] -g file -s file [ -d file | -p file ] [ -m N ] [ -r N ] [ -c ext ]

! -h      - Print this usage message and exit.

! -g file - Take ham from file.

! -s file - Take spam from file.

! -d file - Use a database-based classifier named file.

! -p file - Use a pickle-based classifier named file.

! -m N    - Train on at most N messages (nham == N/2 and nspam == N/2).

! -r N    - Run at most N rounds (default %(MAXROUNDS)s), even if not
!           all messages score correctly.

! -c ext  - Cull all messages which aren't used as training input during any run
!           and write to new ham and spam files with ext as an extra file extension.
!           All messages which are never considered (because one training set is
!           longer than the other or the -m flag was used to reduce the amount of
!           input) are retained.
! 
! Note that the -c command line argument isn't quite as benign as it might
! first appear.  Since the tte protocol trains on the same number of ham and
! spam messages, if you use the output of one run as input into a later run
! you will almost certainly train on fewer messages than before since the two
! files will probably not have the same number of messages.  The extra
! messages in the longer file will be ignored in future runs until you add
! more messages to the shorter file.
! 
! For more detail on the notion of training to exhaustion see Gary Robinson's blog:

      http://www.garyrobinson.net/2004/02/spam_filtering_.html
***************
*** 48,52 ****
      print >> sys.stderr, __doc__.strip() % globals()

! def train(store, ham, spam, maxmsgs, maxrounds):
      smisses = hmisses = round = 0
      ham_cutoff = Options.options["Categorization", "ham_cutoff"]
--- 62,66 ----
      print >> sys.stderr, __doc__.strip() % globals()

! def train(store, ham, spam, maxmsgs, maxrounds, tdict):
      smisses = hmisses = round = 0
      ham_cutoff = Options.options["Categorization", "ham_cutoff"]
***************
*** 70,77 ****
--- 84,93 ----
                  if store.spamprob(tokenize(hammsg)) > ham_cutoff:
                      hmisses += 1
+                     tdict[hammsg["message-id"]] = True
                      store.learn(tokenize(hammsg), False)

                  if store.spamprob(tokenize(spammsg)) < spam_cutoff:
                      smisses += 1
+                     tdict[spammsg["message-id"]] = True
                      store.learn(tokenize(spammsg), True)

***************
*** 85,92 ****
                (round, nmsgs, hmisses, smisses, seconds)

      nhamleft = 0
      try:
          while True:
!             hambone.next()
              nhamleft += 1
      except StopIteration:
--- 101,112 ----
                (round, nmsgs, hmisses, smisses, seconds)

+     # We count all untrained messages so the user knows what was skipped.
+     # We also tag them for saving so we don't lose messages which might have
+     # value in a future run
      nhamleft = 0
      try:
          while True:
!             msg = hambone.next()
!             tdict[msg["message-id"]] = True
              nhamleft += 1
      except StopIteration:
***************
*** 96,100 ****
      try:
          while True:
!             spamcan.next()
              nspamleft += 1
      except StopIteration:
--- 116,121 ----
      try:
          while True:
!             msg = spamcan.next()
!             tdict[msg["message-id"]] = True
              nspamleft += 1
      except StopIteration:
***************
*** 103,115 ****
  def main(args):
      try:
!         opts, args = getopt.getopt(args, "hg:s:d:p:o:m:r:",
                                     ["help", "good=", "spam=",
                                      "database=", "pickle=",
!                                     "option=", "max=", "maxrounds="])
      except getopt.GetoptError, msg:
          usage(msg)
          return 1

!     ham = spam = dbname = usedb = None
      maxmsgs = 0
      maxrounds = MAXROUNDS
--- 124,137 ----
  def main(args):
      try:
!         opts, args = getopt.getopt(args, "hg:s:d:p:o:m:r:c:",
                                     ["help", "good=", "spam=",
                                      "database=", "pickle=",
!                                     "option=", "max=", "maxrounds=",
!                                     "cullext="])
      except getopt.GetoptError, msg:
          usage(msg)
          return 1

!     ham = spam = dbname = usedb = cullext = None
      maxmsgs = 0
      maxrounds = MAXROUNDS
***************
*** 122,125 ****
--- 144,149 ----
          elif opt in ("-s", "--spam"):
              spam = arg
+         elif opt in ("-c", "--cullext"):
+             cullext = arg
          elif opt in ("-m", "--max"):
              maxmsgs = int(arg)
***************
*** 142,149 ****
      store = storage.open_storage(dbname, usedb)

!     train(store, ham, spam, maxmsgs, maxrounds)

      store.store()

      return 0

--- 166,201 ----
      store = storage.open_storage(dbname, usedb)

!     tdict = {}
!     train(store, ham, spam, maxmsgs, maxrounds, tdict)

      store.store()

+     if cullext is not None:
+         print "writing new ham mbox..."
+         n = m = 0
+         newham = file(ham + cullext, "w")
+         for msg in mboxutils.getmbox(ham):
+             m += 1
+             if msg["message-id"] in tdict:
+                 newham.write(str(msg))
+                 n += 1
+             sys.stdout.write("\r%5d of %5d" % (n, m))
+             sys.stdout.flush()
+         sys.stdout.write("\n")
+         newham.close()
+ 
+         print "writing new spam mbox..."
+         n = m = 0
+         newspam = file(spam + cullext, "w")
+         for msg in mboxutils.getmbox(spam):
+             m += 1
+             if msg["message-id"] in tdict:
+                 newspam.write(str(msg))
+                 n += 1
+             sys.stdout.write("\r%5d of %5d" % (n, m))
+             sys.stdout.flush()
+         sys.stdout.write("\n")
+         newspam.close()
+ 
      return 0