[Spambayes-checkins] spambayes Options.py,1.15,1.16 mboxtest.py,1.5,1.6

Jeremy Hylton jhylton@users.sourceforge.net
Tue, 17 Sep 2002 08:29:48 -0700


Update of /cvsroot/spambayes/spambayes
In directory usw-pr-cvs1:/tmp/cvs-serv29116

Modified Files:
	Options.py mboxtest.py 
Log Message:
Add three options for MboxTest.




Index: Options.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/Options.py,v
retrieving revision 1.15
retrieving revision 1.16
diff -C2 -d -r1.15 -r1.16
*** Options.py	15 Sep 2002 00:01:48 -0000	1.15
--- Options.py	17 Sep 2002 15:29:45 -0000	1.16
***************
*** 68,71 ****
--- 68,86 ----
  mine_received_headers: False
  
+ [MboxTest]
+ # If tokenize_header_words is true, then the header values are
+ # tokenized using the default text tokenizer.  The words are tagged
+ # with "header:" where header is the name of the header.
+ tokenize_header_words: False
+ # If tokenize_header_default is True, use the base header tokenization
+ # logic described in the Tokenizer section.
+ tokenize_header_default: True
+ 
+ # skip_headers is a set of regular expressions describing headers that
+ # should not be tokenized if tokenize_header_words is True.
+ skip_headers: received
+     date
+     x-.*
+ 
  [TestDriver]
  # These control various displays in class TestDriver.Driver.
***************
*** 158,161 ****
--- 173,180 ----
                     'adjust_probs_by_evidence_mass': boolean_cracker,
                     },
+     'MboxTest': {'tokenize_header_words': boolean_cracker,
+                  'tokenize_header_default': boolean_cracker,
+                  'skip_headers': ('get', lambda s: Set(s.split())),
+                  },
  }
  

Index: mboxtest.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/mboxtest.py,v
retrieving revision 1.5
retrieving revision 1.6
diff -C2 -d -r1.5 -r1.6
*** mboxtest.py	14 Sep 2002 00:03:51 -0000	1.5
--- mboxtest.py	17 Sep 2002 15:29:45 -0000	1.6
***************
*** 22,25 ****
--- 22,26 ----
  import mailbox
  import random
+ import re
  from sets import Set
  import sys
***************
*** 28,31 ****
--- 29,33 ----
  from TestDriver import Driver
  from timtest import Msg
+ from Options import options
  
  mbox_fmts = {"unix": mailbox.PortableUnixMailbox,
***************
*** 37,53 ****
  class MyTokenizer(Tokenizer):
  
!     skip = {'received': 1,
!             'date': 1,
!             'x-from_': 1,
!             }
  
      def tokenize_headers(self, msg):
!         for k, v in msg.items():
!             k = k.lower()
!             if k in self.skip or k.startswith('x-vm'):
!                 continue
!             for w in subject_word_re.findall(v):
!                 for t in tokenize_word(w):
!                     yield "%s:%s" % (k, t)
  
  class MboxMsg(Msg):
--- 39,57 ----
  class MyTokenizer(Tokenizer):
  
!     skip = [re.compile(rx) for rx in options.skip_headers]
  
      def tokenize_headers(self, msg):
!         if options.tokenize_header_words:
!             for k, v in msg.items():
!                 k = k.lower()
!                 for rx in self.skip:
!                     if rx.match(k):
!                         continue
!                 for w in subject_word_re.findall(v):
!                     for t in tokenize_word(w):
!                         yield "%s:%s" % (k, t)
!         if options.tokenize_header_default:
!             for tok in Tokenizer.tokenize_headers(self, msg):
!                 yield tok
  
  class MboxMsg(Msg):
***************
*** 74,81 ****
          return "\n".join(lines)
  
! ##    tokenize = MyTokenizer().tokenize
  
      def __iter__(self):
!         return tokenize(self.guts)
  
  class mbox(object):
--- 78,85 ----
          return "\n".join(lines)
  
!     tokenize = MyTokenizer().tokenize
  
      def __iter__(self):
!         return self.tokenize(self.guts)
  
  class mbox(object):
***************
*** 130,134 ****
  
      FMT = "unix"
!     NSETS = 5
      SEED = 101
      MAXMSGS = None
--- 134,138 ----
  
      FMT = "unix"
!     NSETS = 10
      SEED = 101
      MAXMSGS = None
***************
*** 158,176 ****
      print "spam", spam, nspam
  
!     testsets = []
!     for iham in randindices(nham, NSETS):
!         for ispam in randindices(nspam, NSETS):
!             testsets.append((sort(iham), sort(ispam)))
  
      driver = Driver()
  
!     for iham, ispam in testsets:
!         driver.new_classifier()
!         driver.train(mbox(ham, iham), mbox(spam, ispam))
!         for ihtest, istest in testsets:
!             if (iham, ispam) == (ihtest, istest):
!                 continue
!             driver.test(mbox(ham, ihtest), mbox(spam, istest))
          driver.finishtest()
      driver.alldone()
  
--- 162,188 ----
      print "spam", spam, nspam
  
!     ihams = map(tuple, randindices(nham, NSETS))
!     ispams = map(tuple, randindices(nspam, NSETS))
  
      driver = Driver()
  
!     for i in range(1, NSETS):
!         driver.train(mbox(ham, ihams[i]), mbox(spam, ispams[i]))
! 
!     i = 0
!     for iham, ispam in zip(ihams, ispams):
!         hams = mbox(ham, iham)
!         spams = mbox(spam, ispam)
! 
!         if i > 0:
!             driver.untrain(hams, spams)
!             
!         driver.test(hams, spams)
          driver.finishtest()
+ 
+         if i < NSETS - 1:
+             driver.train(hams, spams)
+         
+         i += 1
      driver.alldone()