[Spambayes-checkins] spambayes hammie.py,1.2,1.3
Guido van Rossum
gvanrossum@users.sourceforge.net
Fri, 06 Sep 2002 13:23:18 -0700
Update of /cvsroot/spambayes/spambayes
In directory usw-pr-cvs1:/tmp/cvs-serv5367
Modified Files:
hammie.py
Log Message:
Add a hack to train directly on a mailbox full of .txt files, like
Bruce Guenter's spam archive at http://www.em.ca/~bruceg/spam/.
Index: hammie.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/hammie.py,v
retrieving revision 1.2
retrieving revision 1.3
diff -C2 -d -r1.2 -r1.3
*** hammie.py 6 Sep 2002 20:12:05 -0000 1.2
--- hammie.py 6 Sep 2002 20:23:16 -0000 1.3
***************
*** 25,32 ****
--- 25,35 ----
"""
+ from __future__ import generators
+
import sys
import os
import getopt
import mailbox
+ import glob
import email
import classifier
***************
*** 158,161 ****
--- 161,182 ----
+ class DirOfTxtFileMailbox:
+
+     """Mailbox directory consisting of .txt files."""
+
+     def __init__(self, dirname, factory):
+         self.names = glob.glob(os.path.join(dirname, "*.txt"))
+         self.factory = factory
+
+     def __iter__(self):
+         for name in self.names:
+             try:
+                 f = open(name)
+             except IOError:
+                 continue
+             yield self.factory(f)
+             f.close()
+
+
  def train(bayes, msgs, is_spam):
      """Train bayes with a message"""
***************
*** 167,171 ****
      if os.path.isdir(msgs):
!         mbox = mailbox.MHMailbox(msgs, _factory)
      else:
          fp = open(msgs)
--- 188,197 ----
      if os.path.isdir(msgs):
!         # XXX This is bogus: use an MHMailbox if the pathname contains /Mail/
!         # XXX Should really use '+foo' MH folder styles. Later.
!         if msgs.find("/Mail/") >= 0:
!             mbox = mailbox.MHMailbox(msgs, _factory)
!         else:
!             mbox = DirOfTxtFileMailbox(msgs, _factory)
      else:
          fp = open(msgs)