[Spambayes-checkins] spambayes msgs.py,NONE,1.1 README.txt,1.22,1.23
Tim Peters
tim_one@users.sourceforge.net
Mon, 23 Sep 2002 13:03:09 -0700
Update of /cvsroot/spambayes/spambayes
In directory usw-pr-cvs1:/tmp/cvs-serv12319
Modified Files:
README.txt
Added Files:
msgs.py
Log Message:
Preparing to refactor my test drivers.
--- NEW FILE: msgs.py ---
import os
import random
# Caps that HamStream / SpamStream pass to MsgStream as `keep`.  None means
# "no cap": every file in each directory is used.
HAMKEEP = SPAMKEEP = None

# Mixed into the per-directory shuffle seed in MsgStream.produce().  Chosen
# at random on import; setparms() can pin it to a known value.
SEED = random.randrange(2 * 10 ** 9)
class Msg(object):
    """A raw message loaded from a file, identified by the path it came from.

    Attributes:
        tag:  the path the message was read from (dir + "/" + name).
        guts: the complete contents of the file, read in binary mode.

    Two Msg objects compare equal (and hash alike) when their tags match,
    which is what makes them usable as set members.
    """

    __slots__ = 'tag', 'guts'

    def __init__(self, dir, name):
        """Read the file `name` inside directory `dir` into memory.

        Any error raised by open() or read() propagates to the caller.
        """
        path = dir + "/" + name
        self.tag = path
        f = open(path, 'rb')
        # try/finally so the handle is released even when read() fails.
        try:
            self.guts = f.read()
        finally:
            f.close()

    def __iter__(self):
        """Return whatever tokenize() produces for this message's contents."""
        # NOTE(review): `tokenize` is neither defined nor imported in the
        # visible part of this module -- confirm where it should come from.
        return tokenize(self.guts)

    # Compare msgs by their paths; this is appropriate for sets of msgs.
    def __hash__(self):
        """Hash on the path so equal messages land in the same bucket."""
        return hash(self.tag)

    def __eq__(self, other):
        """Return True when `other` carries the same tag.

        Objects without a `tag` attribute yield NotImplemented, so the
        comparison falls back to the interpreter default instead of raising
        AttributeError.
        """
        try:
            return self.tag == other.tag
        except AttributeError:
            return NotImplemented

    def __ne__(self, other):
        """Return the inverse of __eq__, keeping != consistent with ==."""
        same = self.__eq__(other)
        if same is NotImplemented:
            return NotImplemented
        return not same

    def __str__(self):
        """Return the raw message contents."""
        return self.guts
# The iterator yields a stream of Msg objects, taken from a list of directories.
class MsgStream(object):
    """An iterable of Msg objects drawn from a list of directories.

    Attributes:
        tag:         label for the stream; also its str() value.
        directories: directories whose entries are wrapped as Msg objects.
        keep:        None to use every entry, otherwise the maximum number
                     of entries taken from each directory.
    """

    __slots__ = 'tag', 'directories', 'keep'

    def __init__(self, tag, directories, keep=None):
        """Store the label, the directory list and the per-directory cap."""
        self.tag = tag
        self.directories = directories
        self.keep = keep

    def __str__(self):
        """Return the stream's tag."""
        return self.tag

    def produce(self):
        """Generate a Msg for each selected file in each directory.

        With keep=None every entry is yielded in os.listdir() order.
        Otherwise a subset of at most `keep` entries per directory is
        chosen and yielded in sorted order.
        """
        if self.keep is None:
            for directory in self.directories:
                for fname in os.listdir(directory):
                    yield Msg(directory, fname)
            return

        # We only want part of the msgs.  Shuffle each directory list, but
        # in such a way that we'll get the same result each time this is
        # called on the same directory list.
        for directory in self.directories:
            fnames = os.listdir(directory)
            if not fnames:
                # Nothing to pick from; max() below would raise ValueError.
                continue
            # A private generator gives the same shuffle as reseeding the
            # module-level one, without disturbing other users of `random`.
            rng = random.Random(hash(max(fnames)) ^ SEED)
            rng.shuffle(fnames)
            del fnames[self.keep:]
            fnames.sort()  # seems to speed access on Win98!
            for fname in fnames:
                yield Msg(directory, fname)

    def __iter__(self):
        """Start a fresh pass over the stream."""
        return self.produce()
class HamStream(MsgStream):
    """MsgStream whose per-directory cap is the module-level HAMKEEP."""

    def __init__(self, tag, directories):
        # HAMKEEP is looked up when the stream is constructed, so a later
        # setparms() call does not affect streams that already exist.
        MsgStream.__init__(self, tag, directories, keep=HAMKEEP)
class SpamStream(MsgStream):
    """MsgStream whose per-directory cap is the module-level SPAMKEEP."""

    def __init__(self, tag, directories):
        # SPAMKEEP is looked up when the stream is constructed, so a later
        # setparms() call does not affect streams that already exist.
        MsgStream.__init__(self, tag, directories, keep=SPAMKEEP)
def setparms(hamkeep, spamkeep, seed=None):
    """Rebind the module-level HAMKEEP and SPAMKEEP.

    SEED is rebound as well, but only when `seed` is not None; a seed of 0
    therefore counts as an explicit value.
    """
    global HAMKEEP, SPAMKEEP, SEED
    HAMKEEP = hamkeep
    SPAMKEEP = spamkeep
    if seed is None:
        return
    SEED = seed
Index: README.txt
===================================================================
RCS file: /cvsroot/spambayes/spambayes/README.txt,v
retrieving revision 1.22
retrieving revision 1.23
diff -C2 -d -r1.22 -r1.23
*** README.txt 22 Sep 2002 04:59:54 -0000 1.22
--- README.txt 23 Sep 2002 20:03:06 -0000 1.23
***************
*** 60,63 ****
--- 60,67 ----
cmp.py below.
+ msgs.py
+ Some simple classes to wrap raw msgs, and to produce streams of
+ msgs. The test drivers use these.
+
Apps