[Spambayes-checkins] spambayes/Outlook2000 msgstore.py,NONE,1.1 addin.py,1.13,1.14 classify.py,1.8,1.9 filter.py,1.9,1.10 manager.py,1.17,1.18 rule.py,1.4,1.5 train.py,1.5,1.6

Mark Hammond mhammond@users.sourceforge.net
Thu, 24 Oct 2002 06:06:41 -0700


Update of /cvsroot/spambayes/spambayes/Outlook2000
In directory usw-pr-cvs1:/tmp/cvs-serv19862

Modified Files:
	addin.py classify.py filter.py manager.py rule.py train.py 
Added Files:
	msgstore.py 
Log Message:
Had a nice log message written - 3rd go at checking in.

Use extended MAPI.  Much faster.  Nearly got rid of MAPI.Session.
Must sleep <snore>


--- NEW FILE: msgstore.py ---
from __future__ import generators

import sys, os


# Abstract definition - can be moved out when we have more than one sub-class <wink>
# External interface to this module is almost exclusively via a "folder ID"

class MsgStoreException(Exception):
    """Root of the exception hierarchy used by message store classes."""


class NotFoundException(MsgStoreException):
    """A MsgStoreException subclass.

    Presumably meant for lookups that find nothing; confirm against the
    code that raises it.
    """

class MsgStore:
    """Abstract message store.

    Every operation below raises NotImplementedError; a concrete store
    is expected to override them.
    """

    # Stash exceptions in the class for ease of use by consumers.
    MsgStoreException = MsgStoreException
    NotFoundException = NotFoundException

    def __init__(self):
        pass

    def Close(self):
        """Close this object and free everything."""
        raise NotImplementedError

    def GetFolderGenerator(self, folder_ids, include_sub):
        """Return a generator of MsgStoreFolder objects."""
        raise NotImplementedError

    def GetFolder(self, folder_id):
        """Return a single folder given the ID."""
        raise NotImplementedError

    def GetMessage(self, message_id):
        """Return a single message given the ID."""
        raise NotImplementedError

class MsgStoreFolder:
    """Abstract folder in a message store.

    Attributes set by the constructor:
      name  -- display name, "<folder>" until a subclass sets a real one.
      count -- number of items, 0 until a subclass sets a real one.
    """
    def __init__(self):
        self.name = "<folder>"
        self.count = 0

    def GetMessageGenerator(self, folder = None):
        """Return a generator of MsgStoreMsg objects for the folder.

        The folder is this object itself.  The 'folder' argument is
        accepted only for backwards compatibility: the concrete
        implementation in this module and its callers use the
        zero-argument form, so it is now optional.
        """
        raise NotImplementedError

class MsgStoreMsg:
    """Abstract message in a message store.

    Every operation raises NotImplementedError until overridden.
    """

    def __init__(self):
        self.unread = False

    def GetEmailPackageObject(self):
        """Return a "read-only" Python email package object.

        "read-only" in that changes will never be reflected to the
        real store.
        """
        raise NotImplementedError

    def SetField(self, name, value):
        """Abstractly set a user field name/id to a field value.

        User field is for the user to see - status/internal fields
        should get their own methods.
        """
        raise NotImplementedError

    def Save(self):
        """Save changes after field changes."""
        raise NotImplementedError

    def MoveTo(self, folder_id):
        """Move the message to a folder."""
        raise NotImplementedError

    def CopyTo(self, folder_id):
        """Copy the message to a folder."""
        raise NotImplementedError

    # And some status ones we may hopefully use.

    def BeenFiltered(self):
        """Say whether the message has ever been filtered by us before."""
        raise NotImplementedError

    def GetTrainedCorpaName(self):
        """Return None, "ham" or "spam"."""
        raise NotImplementedError

        
# Our MAPI implementation
import warnings
warnings.filterwarnings("ignore", category=FutureWarning, append=1)

from win32com.client import Dispatch, constants
from win32com.mapi import mapi
from win32com.mapi.mapitags import *

# Flag passed to CopyMessages() to request a move rather than a copy.
MESSAGE_MOVE = 0x1 # from MAPIdefs.h
# Property tags used to fetch a message's HTML body.  They are defined here
# by hand rather than taken from mapitags ("MY" prefix).
MYPR_BODY_HTML_A = 0x1013001e # magic <wink>
MYPR_BODY_HTML_W = 0x1013001f # ditto

# OR-ed into every OpenEntry/OpenMsgStore call in this module.
USE_DEFERRED_ERRORS = mapi.MAPI_DEFERRED_ERRORS # or set to zero to see what changes <wink>

class MAPIMsgStore(MsgStore):
    """Extended-MAPI implementation of MsgStore.

    Folder and message IDs crossing the public interface are hex strings;
    internally they are converted to binary entry IDs with
    mapi.BinFromHex / mapi.HexFromBin.
    """
    def __init__(self, outlook = None):
        """Log on to the default MAPI profile and open its default store.

        outlook -- optional Outlook.Application object; if omitted one is
                   created on demand by GetOutlookObjectFromID().
        """
        self.outlook = outlook
        self.session = None
        self.mapi_msgstore = None
        # The working directory is restored after logon (presumably MAPI
        # can change it).  try/finally makes sure that happens even when
        # initialisation or logon fails.
        cwd = os.getcwd()
        try:
            mapi.MAPIInitialize(None)
            logonFlags = mapi.MAPI_NO_MAIL | mapi.MAPI_EXTENDED | mapi.MAPI_USE_DEFAULT
            self.session = mapi.MAPILogonEx(0, None, None, logonFlags)
            self._FindDefaultMessageStore()
        finally:
            os.chdir(cwd)

    def Close(self):
        """Release the store, log off and uninitialize MAPI.

        Calling Close() a second time is a no-op.
        """
        self.mapi_msgstore = None
        if self.session is not None:
            self.session.Logoff(0,0,0)
            self.session = None
            mapi.MAPIUninitialize()

    def _FindDefaultMessageStore(self):
        """Open the store flagged PR_DEFAULT_STORE into self.mapi_msgstore.

        Raises NotFoundException if the session has no default store.
        """
        tab = self.session.GetMsgStoresTable(0)
        # restriction for the table.
        restriction = mapi.RES_PROPERTY, (mapi.RELOP_EQ, PR_DEFAULT_STORE, (PR_DEFAULT_STORE, True))
        rows = mapi.HrQueryAllRows(tab, (PR_ENTRYID,), restriction, None, 0)
        if not rows:
            raise NotFoundException("No default message store found in the MAPI session")
        # get first entry
        row = rows[0]
        eid_tag, eid = row[0]
        # Open the store.
        self.mapi_msgstore = self.session.OpenMsgStore(0, eid, None, mapi.MDB_WRITE | mapi.MDB_NO_MAIL | USE_DEFERRED_ERRORS )

    def _OpenEntry(self, eid):
        # Open a store entry (folder or message) for modification.
        return self.mapi_msgstore.OpenEntry(eid, None, mapi.MAPI_MODIFY | USE_DEFERRED_ERRORS)

    def _BuildFolder(self, eid, mapi_folder, name = None):
        # Wrap an open MAPI folder in a MAPIMsgStoreFolder, looking up the
        # display name unless the caller already has it.
        if name is None:
            rc, props = mapi_folder.GetProps( (PR_DISPLAY_NAME_A,), 0)
            name = props[0][1]
        table = mapi_folder.GetContentsTable(0)
        return MAPIMsgStoreFolder(self, eid, name, table.GetRowCount(0))

    def _GetSubFolderIter(self, folder):
        """Yield every descendant of the open MAPI folder, depth first."""
        table = folder.GetHierarchyTable(0)
        rows = mapi.HrQueryAllRows(table, (PR_ENTRYID,PR_DISPLAY_NAME_A), None, None, 0)
        for (eid_tag, eid),(name_tag, name) in rows:
            # One OpenEntry serves both the item count and the recursion.
            sub = self._OpenEntry(eid)
            yield self._BuildFolder(eid, sub, name)
            for store_folder in self._GetSubFolderIter(sub):
                yield store_folder

    def GetFolderGenerator(self, folder_ids, include_sub):
        """Yield a MAPIMsgStoreFolder for each hex folder ID.

        When include_sub is true, each folder is followed by all of its
        sub-folders.
        """
        for folder_id in folder_ids:
            folder_id = mapi.BinFromHex(folder_id)
            folder = self._OpenEntry(folder_id)
            yield self._BuildFolder(folder_id, folder)
            if include_sub:
                for f in self._GetSubFolderIter(folder):
                    yield f

    def GetFolder(self, folder_id):
        """Return a single MAPIMsgStoreFolder given its hex ID."""
        folder_id = mapi.BinFromHex(folder_id)
        return self._BuildFolder(folder_id, self._OpenEntry(folder_id))

    def GetMessage(self, message_id):
        """Return a single MAPIMsgStoreMsg given its hex ID.

        The message's folder object is a placeholder built from the
        parent entry ID, with a dummy name and a count of -1.
        """
        message_id = mapi.BinFromHex(message_id)
        # NOTE(review): PR_CONTENT_UNREAD looks like a folder-level count
        # rather than a per-message flag - confirm that this yields a usable
        # read/unread value for a message object.
        prop_ids = PR_PARENT_ENTRYID, PR_CONTENT_UNREAD
        mapi_object = self._OpenEntry(message_id)
        hr, data = mapi_object.GetProps(prop_ids,0)
        folder_eid = data[0][1]
        unread = data[1][1]
        folder = MAPIMsgStoreFolder(self, folder_eid, "Unknown - temp message", -1)
        return MAPIMsgStoreMsg(self, folder, message_id, unread)

    def GetOutlookObjectFromID(self, eid):
        """Return the Outlook object model item for a binary entry ID.

        An Outlook.Application object is created on first use if none was
        passed to the constructor.
        """
        if self.outlook is None:
            # Dispatch is imported at module level.
            self.outlook = Dispatch("Outlook.Application")
        return self.outlook.Session.GetItemFromID(mapi.HexFromBin(eid))


# Maps the Python type of a value to the MAPI property type SetField()
# uses when it builds a tag for a named property.
# The boolean entry must come *before* the integer entry.  On interpreters
# where a comparison result is a plain int, type(1==1) is type(0); the two
# keys then collide and the last one listed wins.  Listing int last keeps
# integers mapped to PT_I4 there.  Where bool is a distinct type the
# order makes no difference.
_MapiTypeMap = {
    type(1==1): PT_BOOLEAN,
    type(0): PT_I4,
    type(0.0): PT_DOUBLE,
    type(''): PT_STRING8,
    type(u''): PT_UNICODE,
}

class MAPIMsgStoreFolder(MsgStoreFolder):
    """A folder in a MAPIMsgStore.

    Derives from MsgStoreFolder; the original derived from MsgStoreMsg,
    which gave folders the message interface by mistake.

    Attributes:
      msgstore -- the owning MAPIMsgStore.
      id       -- binary MAPI entry ID of the folder.
      name     -- display name.
      count    -- item count as supplied by the creator.
    """
    def __init__(self, msgstore, id, name, count):
        self.msgstore = msgstore
        self.id = id
        self.name = name
        self.count = count

    def __repr__(self):
        return "<%s '%s' (%d items), id=%s>" % (self.__class__.__name__, self.name, self.count, mapi.HexFromBin(self.id))

    def GetOutlookEntryID(self):
        """Return the folder's entry ID as a hex string."""
        return mapi.HexFromBin(self.id)

    def GetMessageGenerator(self):
        """Yield a MAPIMsgStoreMsg for every row of the contents table."""
        folder = self.msgstore.mapi_msgstore.OpenEntry(self.id, None, mapi.MAPI_MODIFY | USE_DEFERRED_ERRORS)
        table = folder.GetContentsTable(0)
        prop_ids = PR_ENTRYID, PR_CONTENT_UNREAD
        table.SetColumns(prop_ids, 0)
        while 1:
            # Getting 70 at a time was the random number that gave best perf for me ;)
            rows = table.QueryRows(70, 0)
            if not rows:
                # No rows left in the table.
                break
            for row in rows:
                # Columns follow prop_ids: row[0] is the entry ID,
                # row[1] the unread value.
                yield MAPIMsgStoreMsg(self.msgstore, self, row[0][1], row[1][1])


class MAPIMsgStoreMsg(MsgStoreMsg):
    """A message in a MAPIMsgStore.

    The underlying MAPI object is opened lazily by _EnsureObject().

    Attributes:
      folder      -- MAPIMsgStoreFolder the message lives in.
      msgstore    -- the owning MAPIMsgStore.
      mapi_object -- the open MAPI message, or None until first needed.
      id          -- binary MAPI entry ID.
      unread      -- true if the message is unread.
      dirty       -- true when SetField() changes await Save().
    """
    def __init__(self, msgstore, folder, entryid, unread):
        self.folder = folder
        self.msgstore = msgstore
        self.mapi_object = None
        self.id = entryid
        self.unread = unread
        self.dirty = False

    def __repr__(self):
        # Test truthiness rather than indexing a two-element list:
        # 'unread' comes straight from a MAPI property and may be any int.
        if self.unread:
            urs = "unread"
        else:
            urs = "read"
        return "<%s, (%s) id=%s>" % (self.__class__.__name__, urs, mapi.HexFromBin(self.id))

    def GetOutlookEntryID(self):
        """Return the message's entry ID as a hex string."""
        return mapi.HexFromBin(self.id)

    def _GetMessageText(self):
        """Return headers, HTML body and plain body joined by newlines."""
        self._EnsureObject()
        prop_ids = PR_TRANSPORT_MESSAGE_HEADERS_A, PR_BODY, MYPR_BODY_HTML_A
        hr, data = self.mapi_object.GetProps(prop_ids,0)
        headers = data[0][1]
        if type(headers) != type(''): headers = '' # If no field will be an int error (the tag([0]) would tell us, but this is easier)
        body = data[1][1]
        if type(body) != type(''): body= '' # If no field will be an int error (the tag([0]) would tell us, but this is easier)
        # Messages with "text/html" and "multipart/*" give grief.
        # In some cases, the HTML body appears *only* accessible via Outlook :(  Outlook is slow, so try and avoid
        # Tried using the "_W" props, and indeed tried dumping every prop - these HTML messages are hidden from Mapi!
        if PROP_TYPE(data[2][0])==PT_ERROR:
            # No HTML body - see if one of our problem children.
            html = ""
            lo_headers = headers.lower()
            if lo_headers.find("content-type: text/html")>=0 or lo_headers.find("content-type: multipart/")>=0:
                outlook_msg = self.msgstore.GetOutlookObjectFromID(self.id)
                html = outlook_msg.HTMLBody.encode("ascii", "replace")
        else:
            html = data[2][1]
        return headers + "\n" + html + "\n" + body

    def _EnsureObject(self):
        # Open the MAPI message on first use and cache it.
        if self.mapi_object is None:
            self.mapi_object = self.msgstore.mapi_msgstore.OpenEntry(self.id, None, mapi.MAPI_MODIFY | USE_DEFERRED_ERRORS)

    def GetEmailPackageObject(self):
        """Return an email package message built from the MAPI text.

        Any Content-Type and Content-Transfer-Encoding headers are
        removed.  A parse failure is reported on stdout and re-raised.
        """
        import email
        # XXX If this was originally a MIME msg, we're hosed at this point --
        # the boundary tag in the headers doesn't exist in the body, and
        # the msg is simply ill-formed.  The miserable hack here simply
        # squashes the text part (if any) and the HTML part (if any) together,
        # and strips MIME info from the original headers.
        text = self._GetMessageText()
        try:
            msg = email.message_from_string(text)
        except:
            # Single-argument form prints the same on old and new
            # interpreters; the two spaces reproduce the old output.
            print("FAILED to create email.message from:  %r" % (text,))
            raise
        for header in ('content-type', 'content-transfer-encoding'):
            if header in msg:
                del msg[header]
        return msg

    def SetField(self, prop, val):
        """Set (or, for val None, delete) a property on the message.

        prop -- an integer property tag, or a name that is mapped to a
                named property in PS_PUBLIC_STRINGS (created if needed).
        val  -- the value; for a named property its Python type selects
                the MAPI type via _MapiTypeMap.

        Raises ValueError for a named property whose value type is not in
        _MapiTypeMap.  Marks the message dirty; call Save() to commit.
        """
        self._EnsureObject()
        if type(prop)!=type(0):
            props = ( (mapi.PS_PUBLIC_STRINGS, prop), )
            propIds = self.mapi_object.GetIDsFromNames(props, mapi.MAPI_CREATE)
            # NOTE(review): type(None) is not in _MapiTypeMap, so a named
            # property cannot currently be deleted by passing val=None.
            type_tag = _MapiTypeMap.get(type(val))
            if type_tag is None:
                raise ValueError("Dont know what to do with '%r' ('%s')" % (val, type(val)))
            prop = PROP_TAG( type_tag, PROP_ID(propIds[0]))
        if val is None:
            # Delete the property
            self.mapi_object.DeleteProps((prop,))
        else:
            self.mapi_object.SetProps(((prop,val),))
        self.dirty = True

    def Save(self):
        """Commit pending SetField() changes, keeping the object open."""
        assert self.dirty, "asking me to save a clean message!"
        self.mapi_object.SaveChanges(mapi.KEEP_OPEN_READWRITE)
        self.dirty = False

    def _DoCopyMode(self, folder, isMove):
        """Copy (or, if isMove, move) this message into 'folder'.

        folder -- a folder object with a binary '.id' entry ID.
        """
##        self.mapi_object = None # release the COM pointer
        assert not self.dirty, "asking me to move a dirty message - later saves will fail!"
        dest_folder = self.msgstore.mapi_msgstore.OpenEntry(folder.id, None, mapi.MAPI_MODIFY | USE_DEFERRED_ERRORS)
        source_folder = self.msgstore.mapi_msgstore.OpenEntry(self.folder.id, None, mapi.MAPI_MODIFY | USE_DEFERRED_ERRORS)
        flags = 0
        if isMove:
            flags |= MESSAGE_MOVE
        source_folder.CopyMessages( (self.id,), None, dest_folder, 0, None, flags)
        if isMove:
            # Only a move changes where this message lives; after a copy
            # it is still in its original folder.
            # NOTE(review): self.id is left unchanged by the move - confirm
            # the entry ID is still valid afterwards.
            self.folder = self.msgstore.GetFolder(mapi.HexFromBin(folder.id))

    def MoveTo(self, folder):
        """Move the message to 'folder'."""
        self._DoCopyMode(folder, True)

    def CopyTo(self, folder):
        """Copy the message to 'folder', leaving the original in place."""
        # The original passed True here, which turned every copy into a move.
        self._DoCopyMode(folder, False)

def test():
    """Smoke test: print the Inbox, its sub-folders and every message.

    Requires a running Outlook / MAPI profile.  The store is closed even
    if iteration fails.
    """
    from win32com.client import Dispatch
    outlook = Dispatch("Outlook.Application")
    eid = outlook.Session.GetDefaultFolder(constants.olFolderInbox).EntryID

    store = MAPIMsgStore()
    try:
        for folder in store.GetFolderGenerator([eid,], True):
            print(folder)
            for msg in folder.GetMessageGenerator():
                print(msg)
    finally:
        store.Close()
    

# Run the smoke test when this module is executed as a script.
if __name__=='__main__':
    test()
    
Index: addin.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/Outlook2000/addin.py,v
retrieving revision 1.13
retrieving revision 1.14
diff -C2 -d -r1.13 -r1.14
*** addin.py	24 Oct 2002 04:22:19 -0000	1.13
--- addin.py	24 Oct 2002 13:06:39 -0000	1.14
***************
*** 1,3 ****
! # Mark's Outlook addin
  
  import sys
--- 1,3 ----
! # SpamBayes Outlook Addin
  
  import sys
***************
*** 117,123 ****
      def OnItemAdd(self, item):
          if self.manager.config.filter.enabled:
!             mapi_message = self.manager.mapi.GetMessage(item.EntryID)
              import filter
!             num_rules = filter.filter_message(mapi_message, self.manager)
              print "%d Spam rules fired for message '%s'" \
                    % (num_rules, item.Subject.encode("ascii", "replace"))
--- 117,123 ----
      def OnItemAdd(self, item):
          if self.manager.config.filter.enabled:
!             msgstore_message = self.manager.message_store.GetMessage(item.EntryID)
              import filter
!             num_rules = filter.filter_message(msgstore_message, self.manager)
              print "%d Spam rules fired for message '%s'" \
                    % (num_rules, item.Subject.encode("ascii", "replace"))
***************
*** 142,148 ****
          return
  
!     mapi_message = mgr.mapi.GetMessage(item.EntryID)
!     stream = mgr.GetBayesStreamForMessage(mapi_message)
!     prob, clues = mgr.score(stream, evidence=True)
      new_msg = app.CreateItem(0)
      body = ["<h2>Spam Score: %g</h2><br>" % prob]
--- 142,147 ----
          return
  
!     msgstore_message = mgr.message_store.GetMessage(item.EntryID)
!     prob, clues = mgr.score(msgstore_message, evidence=True)
      new_msg = app.CreateItem(0)
      body = ["<h2>Spam Score: %g</h2><br>" % prob]
***************
*** 155,158 ****
--- 154,164 ----
          push(' %g\n' % prob)
      push("</PRE>\n")
+     # Now the raw text of the message, as best we can
+     push("<h2>Message Stream:</h2><br>")
+     push("<PRE>\n")
+     txt = msgstore_message.GetEmailPackageObject().as_string(unixfrom=1)
+     import cgi
+     push(cgi.escape(txt, True))
+     push("</PRE>\n")
      body = ''.join(body)
  
***************
*** 184,188 ****
          # Create our bayes manager
          import manager
!         self.manager = manager.GetManager()
          assert self.manager.addin is None, "Should not already have an addin"
          self.manager.addin = self
--- 190,194 ----
          # Create our bayes manager
          import manager
!         self.manager = manager.GetManager(application)
          assert self.manager.addin is None, "Should not already have an addin"
          self.manager.addin = self
***************
*** 229,239 ****
      def UpdateFolderHooks(self):
          new_hooks = {}
!         for mapi_folder in self.manager.BuildFolderList(
                      self.manager.config.filter.folder_ids,
                      self.manager.config.filter.include_sub):
!             eid = mapi_folder.ID
              existing = self.folder_hooks.get(eid)
              if existing is None:
!                 folder = self.application.GetNamespace("MAPI").GetFolderFromID(eid)
                  try:
                      new_hook = DispatchWithEvents(folder.Items, FolderItemsEvent)
--- 235,245 ----
      def UpdateFolderHooks(self):
          new_hooks = {}
!         for msgstore_folder in self.manager.message_store.GetFolderGenerator(
                      self.manager.config.filter.folder_ids,
                      self.manager.config.filter.include_sub):
!             eid = msgstore_folder.GetOutlookEntryID()
              existing = self.folder_hooks.get(eid)
              if existing is None:
!                 folder = self.application.Session.GetFolderFromID(eid)
                  try:
                      new_hook = DispatchWithEvents(folder.Items, FolderItemsEvent)

Index: classify.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/Outlook2000/classify.py,v
retrieving revision 1.8
retrieving revision 1.9
diff -C2 -d -r1.8 -r1.9
*** classify.py	24 Oct 2002 04:22:19 -0000	1.8
--- classify.py	24 Oct 2002 13:06:39 -0000	1.9
***************
*** 1,4 ****
! # Train a classifier from Outlook Mail folders
! # Author: Sean D. True, WebReply.Com
  # October, 2002
  # Copyright PSF, license under the PSF license
--- 1,4 ----
! # Classify a folder with a field
! # Authors: Sean D. True, WebReply.Com, Mark Hammond.
  # October, 2002
  # Copyright PSF, license under the PSF license
***************
*** 10,52 ****
  
  def classify_folder( f, mgr, config, progress):
!     messages = f.Messages
!     pythoncom.CoInitialize() # We are called on a different thread.
!     # We must get outlook in this thread - can't use the main thread :(
!     outlook_ns = mgr.GetOutlookForCurrentThread().GetNamespace("MAPI")
! 
!     if not messages:
!         progress.warning("Can't find messages in folder '%s'" % (f.Name,))
!         return
!     message = messages.GetFirst()
!     while not progress.stop_requested() and message:
!         try:
!             progress.tick()
!             stream = mgr.GetBayesStreamForMessage(message)
!             prob = mgr.score(stream)
!             added_prop = False
!             try:
!                 if outlook_ns is not None:
!                     outlookItem = outlook_ns.GetItemFromID(message.ID)
!                     format = 4 # 4=2 decimal, 3=1 decimal - index in "field chooser" combo when type=Number.
!                     prop = outlookItem.UserProperties.Add(config.field_name, constants.olNumber, True, format)
!                     prop.Value = prob
!                     outlookItem.Save()
!                     added_prop = True
!             except "foo": # pythoncom.com_error, d:
!                 # Hrm - don't seem able to use outlook - use MAPI - but this
!                 # means the field doesn't automatically appear in the outlook "Field Chooser"
!                 # Tried explicity adding the field to the folder but still no go.
!                 added_prop = False
!             if not  added_prop:
!                 message.Fields.Add(config.field_name, 5, prob)
! 
!             message.Update()
!         except pythoncom.com_error, d:
!             progress.warning("Failed to get a message: %s" % (str(d),) )
!         message = messages.GetNext()
  
  # Called back from the dialog to do the actual training.
  def classifier(mgr, progress):
-     session = mgr.mapi
      config = mgr.config.classify
      if not config.folder_ids:
--- 10,23 ----
  
  def classify_folder( f, mgr, config, progress):
!     for message in f.GetMessageGenerator():
!         if progress.stop_requested():
!             break
!         progress.tick()
!         prob = mgr.score(message)
!         message.SetField(config.field_name, prob)
!         message.Save()
  
  # Called back from the dialog to do the actual training.
  def classifier(mgr, progress):
      config = mgr.config.classify
      if not config.folder_ids:
***************
*** 54,68 ****
          return
      progress.set_status("Counting messages")
-     folders = mgr.BuildFolderList(config.folder_ids, config.include_sub)
      num_msgs = 0
!     for f in folders:
!         num_msgs += f.Messages.Count + 1
      progress.set_max_ticks(num_msgs+3)
  
!     for f in folders:
!         progress.set_status("Processing folder '%s'" % (f.Name.encode("ascii", "replace"),))
          classify_folder(f, mgr, config, progress)
          if progress.stop_requested():
              return
  
  
--- 25,39 ----
          return
      progress.set_status("Counting messages")
      num_msgs = 0
!     for f in mgr.message_store.GetFolderGenerator(config.folder_ids, config.include_sub):
!         num_msgs += f.count
      progress.set_max_ticks(num_msgs+3)
  
!     for f in mgr.message_store.GetFolderGenerator(config.folder_ids, config.include_sub):
!         progress.set_status("Processing folder '%s'" % (f.name,))
          classify_folder(f, mgr, config, progress)
          if progress.stop_requested():
              return
+     progress.set_status("Classified %d messages." % (num_msgs,))
  
  

Index: filter.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/Outlook2000/filter.py,v
retrieving revision 1.9
retrieving revision 1.10
diff -C2 -d -r1.9 -r1.10
*** filter.py	24 Oct 2002 04:22:19 -0000	1.9
--- filter.py	24 Oct 2002 13:06:39 -0000	1.10
***************
*** 5,20 ****
  
  import sys, os
- from win32com.client import Dispatch, constants
- import pythoncom
  import rule
  
  def filter_message(message, mgr):
!     try:
!         stream = mgr.GetBayesStreamForMessage(message)
!     except pythoncom.com_error, d:
!         print "Failed to get a message: %s" % (d,)
!         return
! 
!     prob = mgr.score(stream)
      num_rules = 0
      for rule in mgr.config.rules:
--- 5,12 ----
  
  import sys, os
  import rule
  
  def filter_message(message, mgr):
!     prob = mgr.score(message)
      num_rules = 0
      for rule in mgr.config.rules:
***************
*** 32,40 ****
      only_unread = filter.only_unread
      num_messages = 0
!     for message in mgr.YieldMessageList(f):
          if progress.stop_requested():
              break
          progress.tick()
!         if only_unread and not message.Unread:
              continue
          if filter_message(message, mgr):
--- 24,32 ----
      only_unread = filter.only_unread
      num_messages = 0
!     for message in f.GetMessageGenerator():
          if progress.stop_requested():
              break
          progress.tick()
!         if only_unread and not message.unread:
              continue
          if filter_message(message, mgr):
***************
*** 50,61 ****
  
      progress.set_status("Counting messages")
-     folders = mgr.BuildFolderList(filter.folder_ids, filter.include_sub)
      num_msgs = 0
!     for f in folders:
!         num_msgs += f.Messages.Count + 1
      progress.set_max_ticks(num_msgs+3)
      num = 0
!     for f in folders:
!         progress.set_status("Filtering folder '%s'" % (f.Name.encode("ascii", "replace"),))
          num += filter_folder(f, mgr, progress, filter)
          if progress.stop_requested():
--- 42,52 ----
  
      progress.set_status("Counting messages")
      num_msgs = 0
!     for f in mgr.message_store.GetFolderGenerator(filter.folder_ids, filter.include_sub):
!         num_msgs += f.count
      progress.set_max_ticks(num_msgs+3)
      num = 0
!     for f in mgr.message_store.GetFolderGenerator(filter.folder_ids, filter.include_sub):
!         progress.set_status("Filtering folder '%s'" % (f.name))
          num += filter_folder(f, mgr, progress, filter)
          if progress.stop_requested():

Index: manager.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/Outlook2000/manager.py,v
retrieving revision 1.17
retrieving revision 1.18
diff -C2 -d -r1.17 -r1.18
*** manager.py	24 Oct 2002 04:58:52 -0000	1.17
--- manager.py	24 Oct 2002 13:06:39 -0000	1.18
***************
*** 11,14 ****
--- 11,15 ----
  
  import config
+ import msgstore
  
  try:
***************
*** 64,68 ****
          self.mapi = win32com.client.Dispatch("MAPI.Session")
          self.mapi.Logon(None, None, False, False)
-         self._tls = {thread.get_ident(): {"outlook": outlook} }
          self.outlook = outlook
          os.chdir(cwd)
--- 65,68 ----
***************
*** 70,119 ****
          import_core_spambayes_stuff(self.ini_filename)
          self.LoadBayes()
  
      # Outlook gives us thread grief :(
      def WorkerThreadStarting(self):
          pythoncom.CoInitialize()
-         self._tls[thread.get_ident()] = {}
  
      def WorkerThreadEnding(self):
-         assert self._tls.has_key(thread.get_ident()), \
-                "WorkerThreadStarting hasn't been called for this thread"
-         del self._tls[thread.get_ident()]
          pythoncom.CoUninitialize()
  
-     def GetOutlookForCurrentThread(self):
-         assert self._tls.has_key(thread.get_ident()), \
-                "WorkerThreadStarting hasn't been called for this thread"
-         existing = self._tls[thread.get_ident()].get("outlook")
-         if not existing:
-             existing = win32com.client.Dispatch("Outlook.Application")
-             self._tls[thread.get_ident()]["outlook"] = existing
-         return existing
- 
-     def GetBayesStreamForMessage(self, message):
-         # Note - caller must catch COM error
-         import email
- 
-         headers = message.Fields[0x7D001E].Value
-         headers = headers.encode('ascii', 'replace')
-         try:
-             body = message.Fields[0x1013001E].Value # HTMLBody field
-             body = body.encode("ascii", "replace") + "\n"
-         except pythoncom.error:
-             body = ""
-         body += message.Text.encode("ascii", "replace")
- 
-         # XXX If this was originally a MIME msg, we're hosed at this point --
-         # the boundary tag in the headers doesn't exist in the body, and
-         # the msg is simply ill-formed.  The miserable hack here simply
-         # squashes the text part (if any) and the HTML part (if any) together,
-         # and strips MIME info from the original headers.
-         msg = email.message_from_string(headers + '\n' + body)
-         if msg.has_key('content-type'):
-             del msg['content-type']
-         if msg.has_key('content-transfer-encoding'):
-             del msg['content-transfer-encoding']
-         return msg
- 
      def LoadBayes(self):
          if not os.path.exists(self.ini_filename):
--- 70,82 ----
          import_core_spambayes_stuff(self.ini_filename)
          self.LoadBayes()
+         self.message_store = msgstore.MAPIMsgStore(outlook)
  
      # Outlook gives us thread grief :(
      def WorkerThreadStarting(self):
          pythoncom.CoInitialize()
  
      def WorkerThreadEnding(self):
          pythoncom.CoUninitialize()
  
      def LoadBayes(self):
          if not os.path.exists(self.ini_filename):
***************
*** 123,126 ****
--- 86,92 ----
          bayes = None
          try:
+             # Ooops - Tim did it another way - checking this in before I get more conficts!
+ ##            from Options import options
+ ##            options.mergefiles([self.ini_filename])
              bayes = cPickle.load(open(self.bayes_filename, 'rb'))
              print "Loaded bayes database from '%s'" % (self.bayes_filename,)
***************
*** 169,172 ****
--- 135,141 ----
  
      def InitNewBayes(self):
+             # Ooops - Tim did it another way - checking this in before I get more conficts!
+ ##        from Options import options
+ ##        options.mergefiles([self.ini_filename])
          self.bayes = bayes_classifier.Bayes()
          self.bayes_dirty = True
***************
*** 203,246 ****
          self.bayes = None
          self.config = None
!         self._tls = None
! 
!     def BuildFolderList(self, folder_ids, include_sub):
!         ret = {}
!         for id in folder_ids:
!             subs = []
!             try:
!                 f = self.mapi.GetFolder(id)
!                 if include_sub:
!                     sub_ids = []
!                     subs = f.Folders
!                     for i in range(1, subs.Count):
!                         sub_ids.append(subs.Item(i).ID)
!                     subs = self.BuildFolderList(sub_ids, True)
!             except pythoncom.error:
!                 continue
!             ret[id] = f
!             for sub in subs:
!                 ret[sub.ID] = sub
!         return ret.values()
! 
!     def YieldMessageList(self, folder):
!         messages = folder.Messages
!         if not messages:
!             print "Can't find messages in folder '%s'" % (folder.Name,)
!             return
!         message = messages.GetFirst()
!         while message is not None:
!             yield message
!             message = messages.GetNext()
  
      def score(self, msg, evidence=False):
          return self.bayes.spamprob(bayes_tokenize(msg), evidence)
  
  _mgr = None
  
! def GetManager(verbose=1):
      global _mgr
      if _mgr is None:
!         _mgr = BayesManager(verbose=verbose)
      # If requesting greater verbosity, honour it
      if verbose > _mgr.verbose:
--- 172,192 ----
          self.bayes = None
          self.config = None
!         if self.message_store is not None:
!             self.message_store.Close()
!             self.message_store = None
  
      def score(self, msg, evidence=False):
+         email = msg.GetEmailPackageObject()
+         # As Tim suggested in email, score should move to range(100)
+         # This is probably a good place to do it - anyone who wants the real
+         # float value can look at the "clues"
          return self.bayes.spamprob(bayes_tokenize(msg), evidence)
  
  _mgr = None
  
! def GetManager(outlook = None, verbose=1):
      global _mgr
      if _mgr is None:
!         _mgr = BayesManager(outlook=outlook, verbose=verbose)
      # If requesting greater verbosity, honour it
      if verbose > _mgr.verbose:

Index: rule.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/Outlook2000/rule.py,v
retrieving revision 1.4
retrieving revision 1.5
diff -C2 -d -r1.4 -r1.5
*** rule.py	20 Oct 2002 18:53:06 -0000	1.4
--- rule.py	24 Oct 2002 13:06:39 -0000	1.5
***************
*** 3,8 ****
  import time
  
- MAPI_E_NOT_FOUND = -2147221233
- 
  class Rule:
      def __init__(self):
--- 3,6 ----
***************
*** 14,18 ****
          self.flag_message = True
          self.write_field = True
!         self.write_field_name = "SpamProb"
          self.folder_id = ""
  
--- 12,16 ----
          self.flag_message = True
          self.write_field = True
!         self.write_field_name = "SpamScore"
          self.folder_id = ""
  
***************
*** 34,79 ****
              return "You must specify the field name to create"
  
-     def _GetFolder(self, mgr):
-         try:
-             return mgr.mapi.GetFolder(self.folder_id)
-         except pythoncom.com_error:
-             return None
- 
      def Act(self, mgr, msg, prob):
          if mgr.verbose > 1:
!             print "Rule '%s': %.2f->%.2f (%.2f) (%s)" % (self.name, self.min, self.max, prob, msg.Subject[:20].encode("ascii", "replace"))
          if prob < self.min or prob > self.max:
              return False
-         # Do mods before we move.
-         dirty = False
-         outlook_ns = mgr.GetOutlookForCurrentThread().GetNamespace("MAPI")
-         try:
-             outlook_message = outlook_ns.GetItemFromID(msg.ID)
-         except pythoncom.com_error, (hr, desc, exc, arg):
-             if not exc or exc[5] != MAPI_E_NOT_FOUND:
-                 raise
-             print "Warning: Can't open the message - it has probably been moved"
-             return False
  
!         if self.flag_message:
!             outlook_message.FlagRequest = "Check Spam"
!             outlook_message.FlagStatus = constants.olFlagMarked
!             dirty = True
          if self.write_field:
!             format = 4 # 4=2 decimal, 3=1 decimal - index in "field chooser" combo when type=Number.
!             prop = outlook_message.UserProperties.Add(self.write_field_name, constants.olNumber, True, format)
!             prop.Value = prob
!             dirty = True
!         if dirty:
!             outlook_message.Save()
  
          if self.action == "None":
              pass
          elif self.action == "Copy":
!             outlook_message.Copy(outlook_ns.GetFolderFromID(self.folder_id))
          elif self.action == "Move":
!             outlook_message.Move(outlook_ns.GetFolderFromID(self.folder_id))
          else:
!             print "Eeek - bad action", self.action
  
          return True
--- 32,60 ----
              return "You must specify the field name to create"
  
      def Act(self, mgr, msg, prob):
          if mgr.verbose > 1:
!             print "Rule '%s': %.2f->%.2f (%.2f) (%s)" % (self.name, self.min, self.max, prob, repr(msg))
          if prob < self.min or prob > self.max:
              return False
  
! ##        if self.flag_message:
! ##            outlook_message.FlagRequest = "Check Spam"
! ##            outlook_message.FlagStatus = constants.olFlagMarked
! ##            dirty = True
! 
          if self.write_field:
!             msg.SetField(self.write_field_name, prob)
!             msg.Save()
  
          if self.action == "None":
              pass
          elif self.action == "Copy":
!             dest_folder = mgr.message_store.GetFolder(self.folder_id)
!             msg.CopyTo(dest_folder)
          elif self.action == "Move":
!             dest_folder = mgr.message_store.GetFolder(self.folder_id)
!             msg.MoveTo(dest_folder)
          else:
!             assert 0, "Eeek - bad action '%r'" % (self.action,)
  
          return True

Index: train.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/Outlook2000/train.py,v
retrieving revision 1.5
retrieving revision 1.6
diff -C2 -d -r1.5 -r1.6
*** train.py	20 Oct 2002 23:51:04 -0000	1.5
--- train.py	24 Oct 2002 13:06:39 -0000	1.6
***************
*** 1,38 ****
  # Train a classifier from Outlook Mail folders
! # Author: Sean D. True, WebReply.Com, Mark Hammond
  # October, 2002
  # Copyright PSF, license under the PSF license
  
! import sys, os, os.path, getopt, cPickle, string
! import win32com.client
! import pythoncom
! import win32con
  
  def train_folder( f, isspam, mgr, progress):
      from tokenizer import tokenize
      num = 0
!     for message in mgr.YieldMessageList(f):
          if progress.stop_requested():
              break
          progress.tick()
!         try:
!             # work with MAPI until we work out how to get headers from outlook
!             message = mgr.mapi.GetMessage(message.ID)
!             stream = mgr.GetBayesStreamForMessage(message)
!         except pythoncom.com_error, d:
!             progress.warning("failed to get a message")
!             print "Failed to get a message", d
!             continue
          mgr.bayes.learn(tokenize(stream), isspam, False)
          num += 1
!     print "Trained over", num, "in folder", f.Name
  
  # Called back from the dialog to do the actual training.
  def trainer(mgr, progress):
-     pythoncom.CoInitialize()
      config = mgr.config
      mgr.InitNewBayes()
      bayes = mgr.bayes
-     session = mgr.mapi
  
      if not config.training.ham_folder_ids or not config.training.spam_folder_ids:
--- 1,26 ----
  # Train a classifier from Outlook Mail folders
! # Authors: Sean D. True, WebReply.Com, Mark Hammond
  # October, 2002
  # Copyright PSF, license under the PSF license
  
! import sys, os
  
  def train_folder( f, isspam, mgr, progress):
      from tokenizer import tokenize
      num = 0
!     for message in f.GetMessageGenerator():
          if progress.stop_requested():
              break
          progress.tick()
!         stream = message.GetEmailPackageObject()
          mgr.bayes.learn(tokenize(stream), isspam, False)
          num += 1
!     print "Trained over", num, "in folder", f.name
  
  # Called back from the dialog to do the actual training.
  def trainer(mgr, progress):
      config = mgr.config
      mgr.InitNewBayes()
      bayes = mgr.bayes
  
      if not config.training.ham_folder_ids or not config.training.spam_folder_ids:
***************
*** 40,58 ****
          return
      progress.set_status("Counting messages")
!     ham_folders = mgr.BuildFolderList(config.training.ham_folder_ids, config.training.ham_include_sub)
!     spam_folders = mgr.BuildFolderList(config.training.spam_folder_ids, config.training.ham_include_sub)
      num_msgs = 0
!     for f in ham_folders + spam_folders:
!         num_msgs += f.Messages.Count + 1
      progress.set_max_ticks(num_msgs+3)
  
!     for f in ham_folders:
!         progress.set_status("Processing good folder '%s'" % (f.Name.encode("ascii", "replace"),))
          train_folder(f, 0, mgr, progress)
          if progress.stop_requested():
              return
  
!     for f in spam_folders:
!         progress.set_status("Processing spam folder '%s'" % (f.Name.encode("ascii", "replace"),))
          train_folder(f, 1, mgr, progress)
          if progress.stop_requested():
--- 28,48 ----
          return
      progress.set_status("Counting messages")
! 
      num_msgs = 0
!     for f in mgr.message_store.GetFolderGenerator(config.training.ham_folder_ids, config.training.ham_include_sub):
!         num_msgs += f.count
!     for f in mgr.message_store.GetFolderGenerator(config.training.spam_folder_ids, config.training.spam_include_sub):
!         num_msgs += f.count
!         
      progress.set_max_ticks(num_msgs+3)
  
!     for f in mgr.message_store.GetFolderGenerator(config.training.ham_folder_ids, config.training.ham_include_sub):
!         progress.set_status("Processing good folder '%s'" % (f.name,))
          train_folder(f, 0, mgr, progress)
          if progress.stop_requested():
              return
  
!     for f in mgr.message_store.GetFolderGenerator(config.training.spam_folder_ids, config.training.spam_include_sub):
!         progress.set_status("Processing spam folder '%s'" % (f.name,))
          train_folder(f, 1, mgr, progress)
          if progress.stop_requested():