[Spambayes-checkins] spambayes FileCorpus.py,1.2,1.2.2.1
Tim Stone
timstone4@users.sourceforge.net
Fri Nov 22 00:31:21 2002
Update of /cvsroot/spambayes/spambayes
In directory sc8-pr-cvs1:/tmp/cvs-serv23466
Modified Files:
Tag: hammie-playground
FileCorpus.py
Log Message:
Replaced direct references to .substance with calls to
.getSubstance() and .setSubstance()
Added tests for the header and body convenience methods that were
added to Message
Index: FileCorpus.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/FileCorpus.py,v
retrieving revision 1.2
retrieving revision 1.2.2.1
diff -C2 -d -r1.2 -r1.2.2.1
*** FileCorpus.py 16 Nov 2002 19:06:27 -0000 1.2
--- FileCorpus.py 22 Nov 2002 00:31:19 -0000 1.2.2.1
***************
*** 86,90 ****
import Corpus
! import Bayes
import sys, os, gzip, fnmatch, getopt, errno, time, stat
--- 86,90 ----
import Corpus
! import Persistent
import sys, os, gzip, fnmatch, getopt, errno, time, stat
***************
*** 192,195 ****
--- 192,196 ----
'''Constructor(message file name, corpus directory name)'''
+ Corpus.Message.__init__(self)
self.file_name = file_name
self.directory = directory
***************
*** 214,218 ****
raise
else:
! self.substance = fp.read()
fp.close()
--- 215,219 ----
raise
else:
! self.setSubstance(fp.read())
fp.close()
***************
*** 225,229 ****
pn = self.pathname()
fp = open(pn, 'wb')
! fp.write(self.substance)
fp.close()
--- 226,230 ----
pn = self.pathname()
fp = open(pn, 'wb')
! fp.write(self.getSubstance())
fp.close()
***************
*** 248,260 ****
elip = ''
! sub = self.substance
!
if Corpus.Verbose:
! sub = self.substance
else:
! if len(self.substance) > 20:
! sub = self.substance[:20]
! if len(self.substance) > 40:
! sub += '...' + self.substance[-20:]
pn = os.path.join(self.directory, self.file_name)
--- 249,261 ----
elip = ''
! sub = self.getSubstance()
!
if Corpus.Verbose:
! sub = self.getSubstance()
else:
! if len(sub) > 20:
! sub = sub[:20]
! if len(sub) > 40:
! sub += '...' + sub[-20:]
pn = os.path.join(self.directory, self.file_name)
***************
*** 304,308 ****
raise
else:
! self.substance = fp.read()
fp.close()
--- 305,309 ----
raise
else:
! self.setSubstance(fp.read())
fp.close()
***************
*** 316,320 ****
pn = self.pathname()
gz = gzip.open(pn, 'wb')
! gz.write(self.substance)
gz.flush()
gz.close()
--- 317,321 ----
pn = self.pathname()
gz = gzip.open(pn, 'wb')
! gz.write(self.getSubstance())
gz.flush()
gz.close()
***************
*** 342,354 ****
print 'Executing with uncompressed files'
! print '\n\nCreating two Bayes databases'
! miscbayes = Bayes.PickledBayes('fctestmisc.bayes')
! classbayes = Bayes.DBDictBayes('fctestclass.bayes')
print '\n\nSetting up spam corpus'
spamcorpus = FileCorpus(fmFact, 'fctestspamcorpus')
! spamtrainer = Bayes.SpamTrainer(miscbayes)
spamcorpus.addObserver(spamtrainer)
! anotherspamtrainer = Bayes.SpamTrainer(classbayes, Bayes.UPDATEPROBS)
spamcorpus.addObserver(anotherspamtrainer)
--- 343,355 ----
print 'Executing with uncompressed files'
! print '\n\nCreating two Classifier databases'
! miscbayes = Persistent.PickledClassifier('fctestmisc.bayes')
! classbayes = Persistent.DBDictClassifier('fctestclass.bayes')
print '\n\nSetting up spam corpus'
spamcorpus = FileCorpus(fmFact, 'fctestspamcorpus')
! spamtrainer = Persistent.SpamTrainer(miscbayes)
spamcorpus.addObserver(spamtrainer)
! anotherspamtrainer = Persistent.SpamTrainer(classbayes, Persistent.UPDATEPROBS)
spamcorpus.addObserver(anotherspamtrainer)
***************
*** 365,374 ****
'fctesthamcorpus', \
'MSG*')
! hamtrainer = Bayes.HamTrainer(miscbayes)
hamcorpus.addObserver(hamtrainer)
hamtrainer.trainAll(hamcorpus)
!
! print '\n\nAdd a message to hamcorpus that does not match the filter'
if useGzip:
fmClass = GzipFileMessage
--- 366,374 ----
'fctesthamcorpus', \
'MSG*')
! hamtrainer = Persistent.HamTrainer(miscbayes)
hamcorpus.addObserver(hamtrainer)
hamtrainer.trainAll(hamcorpus)
! print '\n\nA couple of message related tests'
if useGzip:
fmClass = GzipFileMessage
***************
*** 377,380 ****
--- 377,383 ----
m1 = fmClass('XMG00001', 'fctestspamcorpus')
+ m1.setSubstance(testmsg2())
+
+ print '\n\nAdd a message to hamcorpus that does not match the filter'
try:
***************
*** 417,421 ****
print '\n\nTrain with an individual message'
! anotherhamtrainer = Bayes.HamTrainer(classbayes)
anotherhamtrainer.train(unsurecorpus['MSG00005'])
--- 420,424 ----
print '\n\nTrain with an individual message'
! anotherhamtrainer = Persistent.HamTrainer(classbayes)
anotherhamtrainer.train(unsurecorpus['MSG00005'])
***************
*** 428,431 ****
--- 431,443 ----
msg = spamcorpus['MSG00001']
print msg
+ print '\n\nThis is some vital information in the message'
+ print 'Date header is',msg.getDate()
+ print 'Subject header is',msg.getSubject()
+ print 'From header is',msg.getFrom()
+
+ print 'Header text is:',msg.getHeaders()
+ print 'Headers are:',msg.getHeadersList()
+ print 'Body is:',msg.getBody()
+
***************
*** 526,538 ****
m1 = fmClass('MSG00001', 'fctestspamcorpus')
! m1.substance = tm1
m1.store()
m2 = fmClass('MSG00002', 'fctestspamcorpus')
! m2.substance = tm2
m2.store()
m3 = fmClass('MSG00003', 'fctestunsurecorpus')
! m3.substance = tm1
m3.store()
--- 538,550 ----
m1 = fmClass('MSG00001', 'fctestspamcorpus')
! m1.setSubstance(tm1)
m1.store()
m2 = fmClass('MSG00002', 'fctestspamcorpus')
! m2.setSubstance(tm2)
m2.store()
m3 = fmClass('MSG00003', 'fctestunsurecorpus')
! m3.setSubstance(tm1)
m3.store()
***************
*** 546,558 ****
m4 = fmClass('MSG00004', 'fctestunsurecorpus')
! m4.substance = tm1
m4.store()
m5 = fmClass('MSG00005', 'fctestunsurecorpus')
! m5.substance = tm2
m5.store()
m6 = fmClass('MSG00006', 'fctestunsurecorpus')
! m6.substance = tm2
m6.store()
--- 558,570 ----
m4 = fmClass('MSG00004', 'fctestunsurecorpus')
! m4.setSubstance(tm1)
m4.store()
m5 = fmClass('MSG00005', 'fctestunsurecorpus')
! m5.setSubstance(tm2)
m5.store()
m6 = fmClass('MSG00006', 'fctestunsurecorpus')
! m6.setSubstance(tm2)
m6.store()
***************
*** 583,587 ****
Content-Type:text/plain; charset=us-ascii
Content- Transfer- Encoding:7bit
-
Message-ID:<15814.42238.882013.702030@montanaro.dyndns.org>
Date:Mon, 4 Nov 2002 10:49:02 -0600
--- 595,598 ----
***************
*** 644,648 ****
Content-Type:text/plain; charset=us-ascii
Content- Transfer- Encoding:7bit
-
X-Hammie- Disposition:Unsure
--- 655,658 ----
More information about the Spambayes-checkins mailing list