[Spambayes-checkins] spambayes/contrib tte.py,1.12,1.13

Mon Jul 26 04:46:52 CEST 2004

Update of /cvsroot/spambayes/spambayes/contrib
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv12291

Modified Files:
	tte.py 
Log Message:
Generalize the --ratio flag to take an explicit spam:ham pair (e.g. 3:2)
instead of a single spam count with the ham count implicitly fixed at 1.

Index: tte.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/contrib/tte.py,v
retrieving revision 1.12
retrieving revision 1.13
diff -C2 -d -r1.12 -r1.13
*** tte.py	25 Jul 2004 03:00:53 -0000	1.12
--- tte.py	26 Jul 2004 02:46:49 -0000	1.13
***************
*** 36,43 ****
  -R        Walk backwards through the mailbox.

! --ratio=n Define the number of spam messages to be trained for each ham.
!           The default is 1, but given the sorry state of the Net's email
!           infrastructure these days you'll probably want to raise it.  Keep
!           it as close to 1 as you can...

  Note: The -c command line argument isn't quite as benign as it might first
--- 36,43 ----
  -R        Walk backwards through the mailbox.

! --ratio=n Define the ratio of spam:ham messages to be trained at once.
!           The default is 1:1, but given the sorry state of the Net's email
!           infrastructure these days you'll probably want to raise it (3:2 or
!           2:1, etc).  Keep it as close to 1 as you can...

  Note: The -c command line argument isn't quite as benign as it might first
***************
*** 101,104 ****
--- 101,106 ----
      spam_cutoff = Options.options["Categorization", "spam_cutoff"]

+     nspam, nham = ratio
+ 
      while round < maxrounds and (hmisses or smisses or round == 0):
          hambone = mboxutils.getmbox(hambox)
***************
*** 116,122 ****
          try:
              while not maxmsgs or nmsgs < maxmsgs:
!                 ham = hambone.next()
                  spams = []
!                 for i in range(ratio):
                      try:
                          spams.append(spamcan.next())
--- 118,134 ----
          try:
              while not maxmsgs or nmsgs < maxmsgs:
!                 hams = []
!                 for i in range(nham):
!                     try:
!                         hams.append(hambone.next())
!                     except StopIteration:
!                         # no hams left so exit
!                         if not hams:
!                             raise
!                         # use what we've collected
!                         break
! 
                  spams = []
!                 for i in range(nspam):
                      try:
                          spams.append(spamcan.next())
***************
*** 128,144 ****
                          break

!                 nmsgs += 1 + len(spams)
                  sys.stdout.write("\r%5d" % nmsgs)
                  sys.stdout.flush()

!                 score = store.spamprob(tokenize(ham))
!                 selector = ham["message-id"] or ham["subject"]
!                 if score > ham_cutoff and selector is not None:
!                     if verbose:
!                         print >> sys.stderr, "miss ham:  %.6f %s" % (
!                             score, selector)
!                     hmisses += 1
!                     tdict[ham["message-id"]] = True
!                     store.learn(tokenize(ham), False)

                  for spam in spams:
--- 140,157 ----
                          break

!                 nmsgs += len(hams) + len(spams)
                  sys.stdout.write("\r%5d" % nmsgs)
                  sys.stdout.flush()

!                 for ham in hams:
!                     score = store.spamprob(tokenize(ham))
!                     selector = ham["message-id"] or ham["subject"]
!                     if score > ham_cutoff and selector is not None:
!                         if verbose:
!                             print >> sys.stderr, "miss ham: %.6f %s" % (
!                                 score, selector)
!                         hmisses += 1
!                         tdict[ham["message-id"]] = True
!                         store.learn(tokenize(ham), False)

                  for spam in spams:
***************
*** 200,204 ****
      verbose = False
      reverse = False
!     sh_ratio = 1
      for opt, arg in opts:
          if opt in ("-h", "--help"):
--- 213,217 ----
      verbose = False
      reverse = False
!     sh_ratio = (1, 1)
      for opt, arg in opts:
          if opt in ("-h", "--help"):
***************
*** 222,226 ****
              Options.options.set_from_cmdline(arg, sys.stderr)
          elif opt == '--ratio':
!             sh_ratio = int(arg)

      if ham is None or spam is None:
--- 235,240 ----
              Options.options.set_from_cmdline(arg, sys.stderr)
          elif opt == '--ratio':
!             arg = arg.split(":")
!             sh_ratio = (int(arg[0]), int(arg[1]))

      if ham is None or spam is None: