[Spambayes-checkins] spambayes Options.py,1.15,1.16
mboxtest.py,1.5,1.6
Jeremy Hylton
jhylton@users.sourceforge.net
Tue, 17 Sep 2002 08:29:48 -0700
- Previous message: [Spambayes-checkins] spambayes runtest.sh,NONE,1.1
README.txt,1.18,1.19
- Next message: [Spambayes-checkins]
spambayes Options.py,1.16,1.17 mboxtest.py,1.6,1.7 tokenizer.py,1.22,1.23
- Messages sorted by:
[ date ]
[ thread ]
[ subject ]
[ author ]
Update of /cvsroot/spambayes/spambayes
In directory usw-pr-cvs1:/tmp/cvs-serv29116
Modified Files:
Options.py mboxtest.py
Log Message:
Add three options for MboxTest.
Index: Options.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/Options.py,v
retrieving revision 1.15
retrieving revision 1.16
diff -C2 -d -r1.15 -r1.16
*** Options.py 15 Sep 2002 00:01:48 -0000 1.15
--- Options.py 17 Sep 2002 15:29:45 -0000 1.16
***************
*** 68,71 ****
--- 68,86 ----
mine_received_headers: False
+ [MboxTest]
+ # If tokenize_header_words is true, then the header values are
+ # tokenized using the default text tokenizer. The words are tagged
+ # with "header:" where header is the name of the header.
+ tokenize_header_words: False
+ # If tokenize_header_default is True, use the base header tokenization
+ # logic described in the Tokenizer section.
+ tokenize_header_default: True
+
+ # skip_headers is a set of regular expressions describing headers that
+ # should not be tokenized if tokenize_header_words is True.
+ skip_headers: received
+ date
+ x-.*
+
[TestDriver]
# These control various displays in class TestDriver.Driver.
***************
*** 158,161 ****
--- 173,180 ----
'adjust_probs_by_evidence_mass': boolean_cracker,
},
+ 'MboxTest': {'tokenize_header_words': boolean_cracker,
+ 'tokenize_header_default': boolean_cracker,
+ 'skip_headers': ('get', lambda s: Set(s.split())),
+ },
}
Index: mboxtest.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/mboxtest.py,v
retrieving revision 1.5
retrieving revision 1.6
diff -C2 -d -r1.5 -r1.6
*** mboxtest.py 14 Sep 2002 00:03:51 -0000 1.5
--- mboxtest.py 17 Sep 2002 15:29:45 -0000 1.6
***************
*** 22,25 ****
--- 22,26 ----
import mailbox
import random
+ import re
from sets import Set
import sys
***************
*** 28,31 ****
--- 29,33 ----
from TestDriver import Driver
from timtest import Msg
+ from Options import options
mbox_fmts = {"unix": mailbox.PortableUnixMailbox,
***************
*** 37,53 ****
class MyTokenizer(Tokenizer):
! skip = {'received': 1,
! 'date': 1,
! 'x-from_': 1,
! }
def tokenize_headers(self, msg):
! for k, v in msg.items():
! k = k.lower()
! if k in self.skip or k.startswith('x-vm'):
! continue
! for w in subject_word_re.findall(v):
! for t in tokenize_word(w):
! yield "%s:%s" % (k, t)
class MboxMsg(Msg):
--- 39,57 ----
class MyTokenizer(Tokenizer):
! skip = [re.compile(rx) for rx in options.skip_headers]
def tokenize_headers(self, msg):
! if options.tokenize_header_words:
! for k, v in msg.items():
! k = k.lower()
! for rx in self.skip:
! if rx.match(k):
! continue
! for w in subject_word_re.findall(v):
! for t in tokenize_word(w):
! yield "%s:%s" % (k, t)
! if options.tokenize_header_default:
! for tok in Tokenizer.tokenize_headers(self, msg):
! yield tok
class MboxMsg(Msg):
***************
*** 74,81 ****
return "\n".join(lines)
! ## tokenize = MyTokenizer().tokenize
def __iter__(self):
! return tokenize(self.guts)
class mbox(object):
--- 78,85 ----
return "\n".join(lines)
! tokenize = MyTokenizer().tokenize
def __iter__(self):
! return self.tokenize(self.guts)
class mbox(object):
***************
*** 130,134 ****
FMT = "unix"
! NSETS = 5
SEED = 101
MAXMSGS = None
--- 134,138 ----
FMT = "unix"
! NSETS = 10
SEED = 101
MAXMSGS = None
***************
*** 158,176 ****
print "spam", spam, nspam
! testsets = []
! for iham in randindices(nham, NSETS):
! for ispam in randindices(nspam, NSETS):
! testsets.append((sort(iham), sort(ispam)))
driver = Driver()
! for iham, ispam in testsets:
! driver.new_classifier()
! driver.train(mbox(ham, iham), mbox(spam, ispam))
! for ihtest, istest in testsets:
! if (iham, ispam) == (ihtest, istest):
! continue
! driver.test(mbox(ham, ihtest), mbox(spam, istest))
driver.finishtest()
driver.alldone()
--- 162,188 ----
print "spam", spam, nspam
! ihams = map(tuple, randindices(nham, NSETS))
! ispams = map(tuple, randindices(nspam, NSETS))
driver = Driver()
! for i in range(1, NSETS):
! driver.train(mbox(ham, ihams[i]), mbox(spam, ispams[i]))
!
! i = 0
! for iham, ispam in zip(ihams, ispams):
! hams = mbox(ham, iham)
! spams = mbox(spam, ispam)
!
! if i > 0:
! driver.untrain(hams, spams)
!
! driver.test(hams, spams)
driver.finishtest()
+
+ if i < NSETS - 1:
+ driver.train(hams, spams)
+
+ i += 1
driver.alldone()
- Previous message: [Spambayes-checkins] spambayes runtest.sh,NONE,1.1
README.txt,1.18,1.19
- Next message: [Spambayes-checkins]
spambayes Options.py,1.16,1.17 mboxtest.py,1.6,1.7 tokenizer.py,1.22,1.23
- Messages sorted by:
[ date ]
[ thread ]
[ subject ]
[ author ]