[Spambayes-checkins] spambayes mboxtest.py,1.1,1.2
Jeremy Hylton
jhylton@users.sourceforge.net
Sat, 07 Sep 2002 09:17:21 -0700
Update of /cvsroot/spambayes/spambayes
In directory usw-pr-cvs1:/tmp/cvs-serv18055
Modified Files:
mboxtest.py
Log Message:
A bunch of unrelated updates.
Add docstring.
Use tokenizer module.
Add MyTokenizer that knows less about how to deal with headers.
Add custom __str__() to MboxMsg to suppress boring headers.
Index: mboxtest.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/mboxtest.py,v
retrieving revision 1.1
retrieving revision 1.2
diff -C2 -d -r1.1 -r1.2
*** mboxtest.py 6 Sep 2002 19:26:34 -0000 1.1
--- mboxtest.py 7 Sep 2002 16:17:19 -0000 1.2
***************
*** 1,5 ****
#! /usr/bin/env python
! from timtoken import tokenize
from classifier import GrahamBayes
from Tester import Test
--- 1,26 ----
#! /usr/bin/env python
+ """mboxtest.py: A test driver for classifier.
! Usage: mboxtest.py [options] <ham> <spam>
!
! Options:
! -f FMT
! One of unix, mmdf, mh, or qmail. Specifies mailbox format for
! ham and spam files. Default is unix.
!
! -n NSETS
! Number of test sets to create for a single mailbox. Default is 5.
!
! -s SEED
! Seed for random number generator. Default is 101.
!
! -m MSGS
! Read no more than MSGS messages from mailbox.
!
! -l LIMIT
! Print no more than LIMIT characters of a message in test output.
! """
!
! from tokenizer import Tokenizer, subject_word_re, tokenize_word, tokenize
from classifier import GrahamBayes
from Tester import Test
***************
*** 18,21 ****
--- 39,58 ----
}
+ class MyTokenizer(Tokenizer):
+
+ skip = {'received': 1,
+ 'date': 1,
+ 'x-from_': 1,
+ }
+
+ def tokenize_headers(self, msg):
+ for k, v in msg.items():
+ k = k.lower()
+ if k in self.skip or k.startswith('x-vm'):
+ continue
+ for w in subject_word_re.findall(v):
+ for t in tokenize_word(w):
+ yield "%s:%s" % (k, t)
+
class MboxMsg(Msg):
***************
*** 24,27 ****
--- 61,86 ----
self.tag = "%s:%s %s" % (path, index, subject(self.guts))
+ def __str__(self):
+ lines = []
+ i = 0
+ for line in self.guts.split("\n"):
+ skip = False
+ for skip_prefix in 'X-', 'Received:', '\t',:
+ if line.startswith(skip_prefix):
+ skip = True
+ if skip:
+ continue
+ i += 1
+ if i > 100:
+ lines.append("... truncated")
+ break
+ lines.append(line)
+ return "\n".join(lines)
+
+ ## tokenize = MyTokenizer().tokenize
+
+ def __iter__(self):
+ return tokenize(self.guts)
+
class mbox(object):
***************
*** 77,82 ****
NSETS = 5
SEED = 101
! LIMIT = None
! opts, args = getopt.getopt(args, "f:n:s:l:")
for k, v in opts:
if k == '-f':
--- 136,142 ----
NSETS = 5
SEED = 101
! MAXMSGS = None
! CHARLIMIT = 1000
! opts, args = getopt.getopt(args, "f:n:s:l:m:")
for k, v in opts:
if k == '-f':
***************
*** 87,91 ****
SEED = int(v)
if k == '-l':
! LIMIT = int(v)
ham, spam = args
--- 147,153 ----
SEED = int(v)
if k == '-l':
! CHARLIMIT = int(v)
! if k == '-m':
! MAXMSGS = int(v)
ham, spam = args
***************
*** 96,102 ****
nspam = len(list(mbox(spam)))
! if LIMIT:
! nham = min(nham, LIMIT)
! nspam = min(nspam, LIMIT)
print "ham", ham, nham
--- 158,164 ----
nspam = len(list(mbox(spam)))
! if MAXMSGS:
! nham = min(nham, MAXMSGS)
! nspam = min(nspam, MAXMSGS)
print "ham", ham, nham
***************
*** 115,120 ****
if (iham, ispam) == (ihtest, istest):
continue
! driver.test(mbox(ham, ihtest), mbox(spam, istest))
! driver.finish()
driver.alldone()
--- 177,182 ----
if (iham, ispam) == (ihtest, istest):
continue
! driver.test(mbox(ham, ihtest), mbox(spam, istest), CHARLIMIT)
! driver.finishtest()
driver.alldone()