[Spambayes-checkins] spambayes msgs.py,NONE,1.1 README.txt,1.22,1.23

Tim Peters tim_one@users.sourceforge.net
Mon, 23 Sep 2002 13:03:09 -0700


Update of /cvsroot/spambayes/spambayes
In directory usw-pr-cvs1:/tmp/cvs-serv12319

Modified Files:
	README.txt 
Added Files:
	msgs.py 
Log Message:
Preparing to refactor my test drivers.


--- NEW FILE: msgs.py ---
import os
import random

# Module-wide sampling parameters; setparms() below rebinds them.
# HAMKEEP / SPAMKEEP are handed to MsgStream as its `keep` argument by
# HamStream / SpamStream at construction time.  None means "keep every msg";
# otherwise the value is used as a slice bound on each directory listing.
HAMKEEP  = None
SPAMKEEP = None
# SEED is xor'ed into the per-directory shuffle seed in MsgStream.produce().
# It is drawn at random at import time unless setparms() replaces it.
SEED = random.randrange(2000000000)

class Msg(object):
    """A raw message loaded from a file and identified by its path.

    Attributes (declared via __slots__):
        tag  -- the path the message was read from (dir + "/" + name)
        guts -- the complete file contents, read in binary mode
    """
    __slots__ = 'tag', 'guts'

    def __init__(self, dir, name):
        """Read the file `name` inside directory `dir` into memory.

        Raises whatever open() / read() raise if the file is unreadable.
        """
        path = dir + "/" + name
        self.tag = path
        f = open(path, 'rb')
        # try/finally guarantees the handle is released even when read()
        # fails part-way through.
        try:
            self.guts = f.read()
        finally:
            f.close()

    def __iter__(self):
        """Return the token stream produced by tokenize() for this msg."""
        # NOTE(review): `tokenize` is neither defined nor imported in the
        # visible part of this module -- confirm where it is supposed to
        # come from before relying on iteration.
        return tokenize(self.guts)

    # Compare msgs by their paths; this is appropriate for sets of msgs.
    def __hash__(self):
        return hash(self.tag)

    def __eq__(self, other):
        """Two msgs are equal when their tags (paths) are equal.

        Objects without a `tag` attribute are reported as "not comparable"
        (NotImplemented) instead of raising AttributeError, so that
        `msg == something_else` evaluates to False.
        """
        try:
            other_tag = other.tag
        except AttributeError:
            return NotImplemented
        return self.tag == other_tag

    def __ne__(self, other):
        """Inverse of __eq__ (not derived automatically on older Pythons)."""
        result = self.__eq__(other)
        if result is NotImplemented:
            return result
        return not result

    def __str__(self):
        """Return the raw contents of the message."""
        return self.guts

# The iterator yields a stream of Msg objects, taken from a list of directories.
class MsgStream(object):
    """An iterable of Msg objects drawn from a list of directories.

    Attributes (declared via __slots__):
        tag         -- label for the stream; returned by str()
        directories -- sequence of directory paths to read msgs from
        keep        -- None to produce every file, otherwise the number of
                       files to keep from each directory (used as a slice
                       bound on the shuffled listing)
    """
    __slots__ = 'tag', 'directories', 'keep'

    def __init__(self, tag, directories, keep=None):
        self.tag = tag
        self.directories = directories
        self.keep = keep

    def __str__(self):
        return self.tag

    def produce(self):
        """Generate a Msg for each selected file in each directory.

        With keep None every file is produced, in os.listdir() order.
        Otherwise a pseudo-random subset of at most `keep` files per
        directory is produced, in sorted order.
        """
        if self.keep is None:
            for directory in self.directories:
                for fname in os.listdir(directory):
                    yield Msg(directory, fname)
            return
        # We only want part of the msgs.  Shuffle each directory list, but
        # in such a way that we'll get the same result each time this is
        # called on the same directory list.
        for directory in self.directories:
            fnames = os.listdir(directory)
            if not fnames:
                # Nothing to sample; seeding from the max of an empty
                # listing would raise ValueError.
                continue
            # os.listdir() returns names in arbitrary order.  Sort first so
            # the shuffle always starts from the same arrangement and the
            # selection really is reproducible across calls.
            fnames.sort()
            # After sorting, the last name is the maximum.
            random.seed(hash(fnames[-1]) ^ SEED)
            random.shuffle(fnames)
            del fnames[self.keep:]
            fnames.sort()  # seems to speed access on Win98!
            for fname in fnames:
                yield Msg(directory, fname)

    def __iter__(self):
        return self.produce()

class HamStream(MsgStream):
    """A MsgStream whose `keep` limit is the module-level HAMKEEP."""

    def __init__(self, tag, directories):
        # HAMKEEP is looked up when the stream is constructed.
        MsgStream.__init__(self, tag, directories, keep=HAMKEEP)

class SpamStream(MsgStream):
    """A MsgStream whose `keep` limit is the module-level SPAMKEEP."""

    def __init__(self, tag, directories):
        # SPAMKEEP is looked up when the stream is constructed.
        MsgStream.__init__(self, tag, directories, keep=SPAMKEEP)

def setparms(hamkeep, spamkeep, seed=None):
    """Rebind the module-level HAMKEEP and SPAMKEEP.

    SEED is rebound as well, but only when `seed` is not None; passing
    None (the default) leaves the current SEED untouched.
    """
    global HAMKEEP, SPAMKEEP, SEED

    HAMKEEP = hamkeep
    SPAMKEEP = spamkeep
    if seed is None:
        return
    SEED = seed
Index: README.txt
===================================================================
RCS file: /cvsroot/spambayes/spambayes/README.txt,v
retrieving revision 1.22
retrieving revision 1.23
diff -C2 -d -r1.22 -r1.23
*** README.txt	22 Sep 2002 04:59:54 -0000	1.22
--- README.txt	23 Sep 2002 20:03:06 -0000	1.23
***************
*** 60,63 ****
--- 60,67 ----
      cmp.py below.
  
+ msgs.py
+     Some simple classes to wrap raw msgs, and to produce streams of
+     msgs.  The test drivers use these.
+ 
  
  Apps