[Spambayes-checkins] spambayes mboxtest.py,1.1,1.2

Jeremy Hylton jhylton@users.sourceforge.net
Sat, 07 Sep 2002 09:17:21 -0700


Update of /cvsroot/spambayes/spambayes
In directory usw-pr-cvs1:/tmp/cvs-serv18055

Modified Files:
	mboxtest.py 
Log Message:
A bunch of unrelated updates.

Add docstring.
Use tokenizer module.
Add MyTokenizer that knows less about how to deal with headers.
Add custom __str__() to MboxMsg to suppress boring headers.


Index: mboxtest.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/mboxtest.py,v
retrieving revision 1.1
retrieving revision 1.2
diff -C2 -d -r1.1 -r1.2
*** mboxtest.py	6 Sep 2002 19:26:34 -0000	1.1
--- mboxtest.py	7 Sep 2002 16:17:19 -0000	1.2
***************
*** 1,5 ****
  #! /usr/bin/env python
  
! from timtoken import tokenize
  from classifier import GrahamBayes
  from Tester import Test
--- 1,26 ----
  #! /usr/bin/env python
+ """mboxtest.py: A test driver for classifier.
  
! Usage: mboxtest.py [options] <ham> <spam>
! 
! Options:
!     -f FMT
!         One of unix, mmdf, mh, or qmail.  Specifies mailbox format for
!         ham and spam files.  Default is unix.
!         
!     -n NSETS
!         Number of test sets to create for a single mailbox.  Default is 5.
! 
!     -s SEED
!         Seed for random number generator.  Default is 101.
! 
!     -m MSGS
!         Read no more than MSGS messages from mailbox.
! 
!     -l LIMIT
!         Print no more than LIMIT characters of a message in test output.
! """
! 
! from tokenizer import Tokenizer, subject_word_re, tokenize_word, tokenize
  from classifier import GrahamBayes
  from Tester import Test
***************
*** 18,21 ****
--- 39,58 ----
               }
  
+ class MyTokenizer(Tokenizer):
+ 
+     skip = {'received': 1,
+             'date': 1,
+             'x-from_': 1,
+             }
+ 
+     def tokenize_headers(self, msg):
+         for k, v in msg.items():
+             k = k.lower()
+             if k in self.skip or k.startswith('x-vm'):
+                 continue
+             for w in subject_word_re.findall(v):
+                 for t in tokenize_word(w):
+                     yield "%s:%s" % (k, t)
+ 
  class MboxMsg(Msg):
  
***************
*** 24,27 ****
--- 61,86 ----
          self.tag = "%s:%s %s" % (path, index, subject(self.guts))
  
+     def __str__(self):
+         lines = []
+         i = 0
+         for line in self.guts.split("\n"):
+             skip = False
+             for skip_prefix in 'X-', 'Received:', '\t',:
+                 if line.startswith(skip_prefix):
+                     skip = True
+             if skip:
+                 continue
+             i += 1
+             if i > 100:
+                 lines.append("... truncated")
+                 break
+             lines.append(line)
+         return "\n".join(lines)
+ 
+ ##    tokenize = MyTokenizer().tokenize
+ 
+     def __iter__(self):
+         return tokenize(self.guts)
+ 
  class mbox(object):
  
***************
*** 77,82 ****
      NSETS = 5
      SEED = 101
!     LIMIT = None
!     opts, args = getopt.getopt(args, "f:n:s:l:")
      for k, v in opts:
          if k == '-f':
--- 136,142 ----
      NSETS = 5
      SEED = 101
!     MAXMSGS = None
!     CHARLIMIT = 1000
!     opts, args = getopt.getopt(args, "f:n:s:l:m:")
      for k, v in opts:
          if k == '-f':
***************
*** 87,91 ****
              SEED = int(v)
          if k == '-l':
!             LIMIT = int(v)
  
      ham, spam = args
--- 147,153 ----
              SEED = int(v)
          if k == '-l':
!             CHARLIMIT = int(v)
!         if k == '-m':
!             MAXMSGS = int(v)
  
      ham, spam = args
***************
*** 96,102 ****
      nspam = len(list(mbox(spam)))
  
!     if LIMIT:
!         nham = min(nham, LIMIT)
!         nspam = min(nspam, LIMIT)
  
      print "ham", ham, nham
--- 158,164 ----
      nspam = len(list(mbox(spam)))
  
!     if MAXMSGS:
!         nham = min(nham, MAXMSGS)
!         nspam = min(nspam, MAXMSGS)
  
      print "ham", ham, nham
***************
*** 115,120 ****
              if (iham, ispam) == (ihtest, istest):
                  continue
!             driver.test(mbox(ham, ihtest), mbox(spam, istest))
!         driver.finish()
      driver.alldone()
  
--- 177,182 ----
              if (iham, ispam) == (ihtest, istest):
                  continue
!             driver.test(mbox(ham, ihtest), mbox(spam, istest), CHARLIMIT)
!         driver.finishtest()
      driver.alldone()