[Spambayes-checkins]
spambayes/Outlook2000 msgstore.py,NONE,1.1 addin.py,1.13,1.14
classify.py,1.8,1.9 filter.py,1.9,1.10 manager.py,1.17,1.18
rule.py,1.4,1.5 train.py,1.5,1.6
Mark Hammond
mhammond@users.sourceforge.net
Thu, 24 Oct 2002 06:06:41 -0700
Update of /cvsroot/spambayes/spambayes/Outlook2000
In directory usw-pr-cvs1:/tmp/cvs-serv19862
Modified Files:
addin.py classify.py filter.py manager.py rule.py train.py
Added Files:
msgstore.py
Log Message:
Had a nice log message written - 3rd go at checking in.
Use extended MAPI. Much faster. Nearly got rid of MAPI.Session.
Must sleep <snore>
--- NEW FILE: msgstore.py ---
from __future__ import generators
import sys, os
# Abstract definition - can be moved out when we have more than one sub-class <wink>
# External interface to this module is almost exclusively via a "folder ID"
class MsgStoreException(Exception):
    """Root of the exception hierarchy defined by this module."""
class NotFoundException(MsgStoreException):
    """A MsgStoreException subclass; presumably for lookups of items that do not exist."""
class MsgStore:
    """Abstract interface to a message store.

    Every operation below raises NotImplementedError; a concrete store is
    expected to subclass this and override them.
    """

    # The exception classes are also reachable as class attributes, so a
    # consumer holding a store object can use them without importing them.
    MsgStoreException = MsgStoreException
    NotFoundException = NotFoundException

    def __init__(self):
        pass

    def Close(self):
        """Close this object and free everything."""
        raise NotImplementedError

    def GetFolderGenerator(self, folder_ids, include_sub):
        """Return a generator of MsgStoreFolder objects."""
        raise NotImplementedError

    def GetFolder(self, folder_id):
        """Return a single folder given the ID."""
        raise NotImplementedError

    def GetMessage(self, message_id):
        """Return a single message given the ID."""
        raise NotImplementedError
class MsgStoreFolder:
    """Abstract description of one folder in a message store.

    Attributes:
    name  -- display name of the folder ("<folder>" until a subclass sets it)
    count -- number of messages in the folder (0 until a subclass sets it)
    """

    def __init__(self):
        self.name = "<folder>"
        self.count = 0

    def GetMessageGenerator(self, folder=None):
        """Return a generator of MsgStoreMsg objects for this folder.

        folder -- unused; accepted only so that existing callers which pass
                  it keep working.  The folder object already identifies
                  itself, and the MAPI implementation and every caller in
                  this package invoke the method with no argument.
        """
        raise NotImplementedError
class MsgStoreMsg:
    """Abstract description of one message in a message store.

    Every operation raises NotImplementedError here; concrete stores
    override them.
    """

    def __init__(self):
        self.unread = False

    def GetEmailPackageObject(self):
        """Return a "read-only" Python email package object.

        "Read-only" in that changes will never be reflected to the real
        store.
        """
        raise NotImplementedError

    def SetField(self, name, value):
        """Abstractly set a user field name/id to a field value.

        A user field is for the user to see - status/internal fields
        should get their own methods.
        """
        raise NotImplementedError

    def Save(self):
        """Save changes after field changes."""
        raise NotImplementedError

    def MoveTo(self, folder_id):
        """Move the message to a folder.

        NOTE: the MAPI implementation in this module is handed a folder
        object (it reads ``folder.id``) rather than a raw ID.
        """
        raise NotImplementedError

    def CopyTo(self, folder_id):
        """Copy the message to a folder.

        NOTE: the MAPI implementation in this module is handed a folder
        object (it reads ``folder.id``) rather than a raw ID.
        """
        raise NotImplementedError

    # And some status ones we may hopefully use.

    def BeenFiltered(self):
        """Report whether the message has ever been filtered by us before."""
        raise NotImplementedError

    def GetTrainedCorpaName(self):
        """Return None, "ham" or "spam"."""
        raise NotImplementedError
# Our MAPI implementation
import warnings
warnings.filterwarnings("ignore", category=FutureWarning, append=1)
from win32com.client import Dispatch, constants
from win32com.mapi import mapi
from win32com.mapi.mapitags import *
# Flag handed to CopyMessages() to request a move instead of a copy.
MESSAGE_MOVE = 0x1 # from MAPIdefs.h
# Raw property tags for a message's HTML body, spelled out numerically here
# rather than taken from mapitags.
# NOTE(review): presumably _A/_W are the 8-bit and wide-string flavours of the
# same property (the two tags differ only in the low word) - confirm.
MYPR_BODY_HTML_A = 0x1013001e # magic <wink>
MYPR_BODY_HTML_W = 0x1013001f # ditto
# OR'ed into the flags of every OpenEntry/OpenMsgStore call in this module.
USE_DEFERRED_ERRORS = mapi.MAPI_DEFERRED_ERRORS # or set to zero to see what changes <wink>
class MAPIMsgStore(MsgStore):
    """MsgStore implementation built on extended MAPI.

    Folder and message IDs cross the public interface as hex strings; they
    are converted with mapi.BinFromHex on the way in and the binary form is
    what the folder/message objects keep in their ``id`` attribute.
    """

    def __init__(self, outlook = None):
        """Initialise MAPI, log on and open the default message store.

        outlook -- optional Outlook.Application object.  When omitted, one
                   is created on first use by GetOutlookObjectFromID().
        """
        self.outlook = outlook
        self.session = None
        self.mapi_msgstore = None
        cwd = os.getcwd()
        try:
            mapi.MAPIInitialize(None)
            logonFlags = mapi.MAPI_NO_MAIL | mapi.MAPI_EXTENDED | mapi.MAPI_USE_DEFAULT
            self.session = mapi.MAPILogonEx(0, None, None, logonFlags)
            self._FindDefaultMessageStore()
        finally:
            # The working directory is saved and restored around the logon
            # (presumably because logging on can change it); do so even when
            # the logon fails so the caller is not left somewhere else.
            os.chdir(cwd)

    def Close(self):
        """Release the store, log off the session and uninitialise MAPI.

        Calling Close() on an already closed store does nothing.
        """
        self.mapi_msgstore = None
        if self.session is not None:
            self.session.Logoff(0,0,0)
            self.session = None
            mapi.MAPIUninitialize()

    def _FindDefaultMessageStore(self):
        """Open the session's default message store into self.mapi_msgstore.

        Raises NotFoundException when no store is flagged as the default.
        """
        tab = self.session.GetMsgStoresTable(0)
        # restriction for the table.
        restriction = mapi.RES_PROPERTY, (mapi.RELOP_EQ, PR_DEFAULT_STORE, (PR_DEFAULT_STORE, True))
        rows = mapi.HrQueryAllRows(tab, (PR_ENTRYID,), restriction, None, 0)
        if len(rows) == 0:
            raise NotFoundException("No default message store could be found")
        # get first entry
        eid_tag, eid = rows[0][0]
        # Open the store.
        self.mapi_msgstore = self.session.OpenMsgStore(0, eid, None, mapi.MDB_WRITE | mapi.MDB_NO_MAIL | USE_DEFERRED_ERRORS )

    def _OpenEntry(self, eid):
        # Open the object with the given binary entry ID for modification.
        return self.mapi_msgstore.OpenEntry(eid, None, mapi.MAPI_MODIFY | USE_DEFERRED_ERRORS)

    def _WrapFolder(self, eid, mapi_folder):
        # Build the MAPIMsgStoreFolder describing an already opened folder,
        # reading its display name and message count from MAPI.
        table = mapi_folder.GetContentsTable(0)
        rc, props = mapi_folder.GetProps( (PR_DISPLAY_NAME_A,), 0)
        return MAPIMsgStoreFolder(self, eid, props[0][1], table.GetRowCount(0))

    def _GetSubFolderIter(self, folder):
        """Yield a MAPIMsgStoreFolder for every descendant of ``folder``.

        ``folder`` is an opened MAPI folder object.  Each child is yielded
        before its own children (depth first).
        """
        table = folder.GetHierarchyTable(0)
        rows = mapi.HrQueryAllRows(table, (PR_ENTRYID,PR_DISPLAY_NAME_A), None, None, 0)
        for (eid_tag, eid),(name_tag, name) in rows:
            sub = self._OpenEntry(eid)
            contents = sub.GetContentsTable(0)
            yield MAPIMsgStoreFolder(self, eid, name, contents.GetRowCount(0))
            # Recurse using the object opened above; there is no need to
            # open the same entry a second time.
            for store_folder in self._GetSubFolderIter(sub):
                yield store_folder

    def GetFolderGenerator(self, folder_ids, include_sub):
        """Yield a MAPIMsgStoreFolder for each hex ID in ``folder_ids``.

        When ``include_sub`` is true, every folder is followed by all of
        its descendants.
        """
        for folder_id in folder_ids:
            folder_id = mapi.BinFromHex(folder_id)
            folder = self._OpenEntry(folder_id)
            yield self._WrapFolder(folder_id, folder)
            if include_sub:
                for f in self._GetSubFolderIter(folder):
                    yield f

    def GetFolder(self, folder_id):
        """Return a single MAPIMsgStoreFolder given its hex ID."""
        folder_id = mapi.BinFromHex(folder_id)
        return self._WrapFolder(folder_id, self._OpenEntry(folder_id))

    def GetMessage(self, message_id):
        """Return a single MAPIMsgStoreMsg given its hex ID.

        The message's ``folder`` is a placeholder MAPIMsgStoreFolder that
        carries the parent's entry ID but a dummy name and a count of -1.
        """
        message_id = mapi.BinFromHex(message_id)
        prop_ids = PR_PARENT_ENTRYID, PR_CONTENT_UNREAD
        mapi_object = self._OpenEntry(message_id)
        hr, data = mapi_object.GetProps(prop_ids,0)
        folder_eid = data[0][1]
        unread = data[1][1]
        folder = MAPIMsgStoreFolder(self, folder_eid, "Unknown - temp message", -1)
        return MAPIMsgStoreMsg(self, folder, message_id, unread)

    def GetOutlookObjectFromID(self, eid):
        """Return the Outlook object-model item for a binary entry ID.

        Creates (and caches) an Outlook.Application object if none was
        supplied to the constructor.
        """
        if self.outlook is None:
            from win32com.client import Dispatch
            self.outlook = Dispatch("Outlook.Application")
        return self.outlook.Session.GetItemFromID(mapi.HexFromBin(eid))
# Maps the Python type of a value to the MAPI property type used when
# MAPIMsgStoreMsg.SetField() builds a property tag for a named field.
# Types missing from this table make SetField() raise ValueError.
# NOTE(review): the types are obtained from sample values; type(1==1) is
# whatever type a comparison yields - presumably spelled this way so the
# table does not have to name a boolean type directly. Confirm.
_MapiTypeMap = {
type(0.0): PT_DOUBLE,
type(0): PT_I4,
type(''): PT_STRING8,
type(u''): PT_UNICODE,
type(1==1): PT_BOOLEAN,
}
class MAPIMsgStoreFolder(MsgStoreFolder):
    """A folder in a MAPIMsgStore.

    Derives from MsgStoreFolder (it previously named MsgStoreMsg as its
    base, which made folders look like messages to isinstance checks).
    """

    def __init__(self, msgstore, id, name, count):
        """Record the folder's identity.

        msgstore -- the owning MAPIMsgStore
        id       -- binary MAPI entry ID of the folder
        name     -- display name
        count    -- number of rows in the folder's contents table
        """
        self.msgstore = msgstore
        self.id = id
        self.name = name
        self.count = count

    def __repr__(self):
        return "<%s '%s' (%d items), id=%s>" % (self.__class__.__name__, self.name, self.count, mapi.HexFromBin(self.id))

    def GetOutlookEntryID(self):
        """Return the entry ID as the hex string Outlook's object model uses."""
        return mapi.HexFromBin(self.id)

    def GetMessageGenerator(self):
        """Yield a MAPIMsgStoreMsg for every row of the contents table."""
        folder = self.msgstore.mapi_msgstore.OpenEntry(self.id, None, mapi.MAPI_MODIFY | USE_DEFERRED_ERRORS)
        table = folder.GetContentsTable(0)
        prop_ids = PR_ENTRYID, PR_CONTENT_UNREAD
        table.SetColumns(prop_ids, 0)
        while 1:
            # Getting 70 at a time was the random number that gave best perf for me ;)
            rows = table.QueryRows(70, 0)
            if len(rows)==0:
                break
            for row in rows:
                # Columns follow prop_ids: row[0] is the entry ID, row[1] the unread value.
                yield MAPIMsgStoreMsg(self.msgstore, self, row[0][1], row[1][1])
class MAPIMsgStoreMsg(MsgStoreMsg):
    """A message in a MAPIMsgStore.

    The underlying MAPI object is opened lazily, the first time the text
    or a field of the message is needed.
    """

    def __init__(self, msgstore, folder, entryid, unread):
        """Record the message's identity.

        msgstore -- the owning MAPIMsgStore
        folder   -- MAPIMsgStoreFolder the message lives in
        entryid  -- binary MAPI entry ID of the message
        unread   -- unread indicator as read from MAPI
        """
        self.folder = folder
        self.msgstore = msgstore
        self.mapi_object = None
        self.id = entryid
        self.unread = unread
        self.dirty = False

    def __repr__(self):
        # Test truthiness rather than indexing a two-element list with the
        # raw value: anything other than 0/1 would raise IndexError, and
        # repr() of a message is used in diagnostic output.
        if self.unread:
            urs = "unread"
        else:
            urs = "read"
        return "<%s, (%s) id=%s>" % (self.__class__.__name__, urs, mapi.HexFromBin(self.id))

    def GetOutlookEntryID(self):
        """Return the entry ID as the hex string Outlook's object model uses."""
        return mapi.HexFromBin(self.id)

    def _GetMessageText(self):
        """Return headers, HTML body and plain body joined by newlines."""
        self._EnsureObject()
        prop_ids = PR_TRANSPORT_MESSAGE_HEADERS_A, PR_BODY, MYPR_BODY_HTML_A
        hr, data = self.mapi_object.GetProps(prop_ids,0)
        # If a field is missing its value will be an int error (the tag([0])
        # would tell us, but this is easier)
        headers = data[0][1]
        if not isinstance(headers, str):
            headers = ''
        body = data[1][1]
        if not isinstance(body, str):
            body = ''
        # Messages with "text/html" and "multipart/*" give grief.
        # In some cases, the HTML body appears *only* accessible via Outlook :( Outlook is slow, so try and avoid
        # Tried using the "_W" props, and indeed tried dumping every prop - these HTML messages are hidden from Mapi!
        if PROP_TYPE(data[2][0])==PT_ERROR:
            # No HTML body - see if one of our problem children.
            html = ""
            lo_headers = headers.lower()
            if lo_headers.find("content-type: text/html")>=0 or lo_headers.find("content-type: multipart/")>=0:
                outlook_msg = self.msgstore.GetOutlookObjectFromID(self.id)
                html = outlook_msg.HTMLBody.encode("ascii", "replace")
        else:
            html = data[2][1]
        return headers + "\n" + html + "\n" + body

    def _EnsureObject(self):
        # Open the MAPI message object on first use.
        if self.mapi_object is None:
            self.mapi_object = self.msgstore.mapi_msgstore.OpenEntry(self.id, None, mapi.MAPI_MODIFY | USE_DEFERRED_ERRORS)

    def GetEmailPackageObject(self):
        """Return an email package message built from the MAPI text.

        The content-type and content-transfer-encoding headers are removed.
        Any failure to parse is reported on stdout and re-raised.
        """
        import email
        # XXX If this was originally a MIME msg, we're hosed at this point --
        # the boundary tag in the headers doesn't exist in the body, and
        # the msg is simply ill-formed.  The miserable hack here simply
        # squashes the text part (if any) and the HTML part (if any) together,
        # and strips MIME info from the original headers.
        text = self._GetMessageText()
        try:
            msg = email.message_from_string(text)
        except:
            # Deliberately broad: the text is dumped for diagnosis and the
            # original exception is re-raised unchanged.
            print("FAILED to create email.message from:  " + repr(text))
            raise
        if msg.has_key('content-type'):
            del msg['content-type']
        if msg.has_key('content-transfer-encoding'):
            del msg['content-transfer-encoding']
        return msg

    def SetField(self, prop, val):
        """Set (or, when ``val`` is None, delete) a property on the message.

        prop -- either an integer property tag, used as is, or a field name
                that is resolved (and created if necessary) as a named
                property in PS_PUBLIC_STRINGS.
        val  -- the value; for named fields its Python type selects the MAPI
                property type via _MapiTypeMap.

        Raises ValueError for a named field whose value type is not in
        _MapiTypeMap.  The message is marked dirty; call Save() afterwards.
        """
        self._EnsureObject()
        if type(prop)!=type(0):
            props = ( (mapi.PS_PUBLIC_STRINGS, prop), )
            propIds = self.mapi_object.GetIDsFromNames(props, mapi.MAPI_CREATE)
            type_tag = _MapiTypeMap.get(type(val))
            # NOTE(review): val=None has no entry in _MapiTypeMap, so for a
            # *named* field the delete branch below is never reached - only
            # integer tags can currently be deleted.
            if type_tag is None:
                raise ValueError("Dont know what to do with '%r' ('%s')" % (val, type(val)))
            prop = PROP_TAG( type_tag, PROP_ID(propIds[0]))
        if val is None:
            # Delete the property
            self.mapi_object.DeleteProps((prop,))
        else:
            self.mapi_object.SetProps(((prop,val),))
        self.dirty = True

    def Save(self):
        """Write pending field changes back to the store."""
        assert self.dirty, "asking me to save a clean message!"
        self.mapi_object.SaveChanges(mapi.KEEP_OPEN_READWRITE)
        self.dirty = False

    def _DoCopyMode(self, folder, isMove):
        """Copy or move this message into ``folder`` (a folder object).

        isMove -- true to move, false to leave the original in place.
        """
        ## self.mapi_object = None # release the COM pointer
        assert not self.dirty, "asking me to move a dirty message - later saves will fail!"
        dest_folder = self.msgstore.mapi_msgstore.OpenEntry(folder.id, None, mapi.MAPI_MODIFY | USE_DEFERRED_ERRORS)
        source_folder = self.msgstore.mapi_msgstore.OpenEntry(self.folder.id, None, mapi.MAPI_MODIFY | USE_DEFERRED_ERRORS)
        flags = 0
        if isMove:
            flags |= MESSAGE_MOVE
        source_folder.CopyMessages( (self.id,), None, dest_folder, 0, None, flags)
        if isMove:
            # Only a move changes where this message lives; after a copy
            # the object still describes the original in its old folder.
            self.folder = self.msgstore.GetFolder(mapi.HexFromBin(folder.id))

    def MoveTo(self, folder):
        """Move the message into ``folder`` (a folder object)."""
        self._DoCopyMode(folder, True)

    def CopyTo(self, folder):
        """Copy the message into ``folder``, leaving the original in place."""
        self._DoCopyMode(folder, False)
def test():
    """Smoke test: print the Inbox, its sub-folders and every message in them.

    Needs a running Outlook and a MAPI profile.  The store is closed even
    when enumeration fails part way through.
    """
    from win32com.client import Dispatch
    outlook = Dispatch("Outlook.Application")
    eid = outlook.Session.GetDefaultFolder(constants.olFolderInbox).EntryID
    store = MAPIMsgStore()
    try:
        for folder in store.GetFolderGenerator([eid,], True):
            print(folder)
            for msg in folder.GetMessageGenerator():
                print(msg)
    finally:
        store.Close()
if __name__ == "__main__":
    # Run the smoke test only when this module is executed as a script.
    test()
Index: addin.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/Outlook2000/addin.py,v
retrieving revision 1.13
retrieving revision 1.14
diff -C2 -d -r1.13 -r1.14
*** addin.py 24 Oct 2002 04:22:19 -0000 1.13
--- addin.py 24 Oct 2002 13:06:39 -0000 1.14
***************
*** 1,3 ****
! # Mark's Outlook addin
import sys
--- 1,3 ----
! # SpamBayes Outlook Addin
import sys
***************
*** 117,123 ****
def OnItemAdd(self, item):
if self.manager.config.filter.enabled:
! mapi_message = self.manager.mapi.GetMessage(item.EntryID)
import filter
! num_rules = filter.filter_message(mapi_message, self.manager)
print "%d Spam rules fired for message '%s'" \
% (num_rules, item.Subject.encode("ascii", "replace"))
--- 117,123 ----
def OnItemAdd(self, item):
if self.manager.config.filter.enabled:
! msgstore_message = self.manager.message_store.GetMessage(item.EntryID)
import filter
! num_rules = filter.filter_message(msgstore_message, self.manager)
print "%d Spam rules fired for message '%s'" \
% (num_rules, item.Subject.encode("ascii", "replace"))
***************
*** 142,148 ****
return
! mapi_message = mgr.mapi.GetMessage(item.EntryID)
! stream = mgr.GetBayesStreamForMessage(mapi_message)
! prob, clues = mgr.score(stream, evidence=True)
new_msg = app.CreateItem(0)
body = ["<h2>Spam Score: %g</h2><br>" % prob]
--- 142,147 ----
return
! msgstore_message = mgr.message_store.GetMessage(item.EntryID)
! prob, clues = mgr.score(msgstore_message, evidence=True)
new_msg = app.CreateItem(0)
body = ["<h2>Spam Score: %g</h2><br>" % prob]
***************
*** 155,158 ****
--- 154,164 ----
push(' %g\n' % prob)
push("</PRE>\n")
+ # Now the raw text of the message, as best we can
+ push("<h2>Message Stream:</h2><br>")
+ push("<PRE>\n")
+ txt = msgstore_message.GetEmailPackageObject().as_string(unixfrom=1)
+ import cgi
+ push(cgi.escape(txt, True))
+ push("</PRE>\n")
body = ''.join(body)
***************
*** 184,188 ****
# Create our bayes manager
import manager
! self.manager = manager.GetManager()
assert self.manager.addin is None, "Should not already have an addin"
self.manager.addin = self
--- 190,194 ----
# Create our bayes manager
import manager
! self.manager = manager.GetManager(application)
assert self.manager.addin is None, "Should not already have an addin"
self.manager.addin = self
***************
*** 229,239 ****
def UpdateFolderHooks(self):
new_hooks = {}
! for mapi_folder in self.manager.BuildFolderList(
self.manager.config.filter.folder_ids,
self.manager.config.filter.include_sub):
! eid = mapi_folder.ID
existing = self.folder_hooks.get(eid)
if existing is None:
! folder = self.application.GetNamespace("MAPI").GetFolderFromID(eid)
try:
new_hook = DispatchWithEvents(folder.Items, FolderItemsEvent)
--- 235,245 ----
def UpdateFolderHooks(self):
new_hooks = {}
! for msgstore_folder in self.manager.message_store.GetFolderGenerator(
self.manager.config.filter.folder_ids,
self.manager.config.filter.include_sub):
! eid = msgstore_folder.GetOutlookEntryID()
existing = self.folder_hooks.get(eid)
if existing is None:
! folder = self.application.Session.GetFolderFromID(eid)
try:
new_hook = DispatchWithEvents(folder.Items, FolderItemsEvent)
Index: classify.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/Outlook2000/classify.py,v
retrieving revision 1.8
retrieving revision 1.9
diff -C2 -d -r1.8 -r1.9
*** classify.py 24 Oct 2002 04:22:19 -0000 1.8
--- classify.py 24 Oct 2002 13:06:39 -0000 1.9
***************
*** 1,4 ****
! # Train a classifier from Outlook Mail folders
! # Author: Sean D. True, WebReply.Com
# October, 2002
# Copyright PSF, license under the PSF license
--- 1,4 ----
! # Classify a folder with a field
! # Authors: Sean D. True, WebReply.Com, Mark Hammond.
# October, 2002
# Copyright PSF, license under the PSF license
***************
*** 10,52 ****
def classify_folder( f, mgr, config, progress):
! messages = f.Messages
! pythoncom.CoInitialize() # We are called on a different thread.
! # We must get outlook in this thread - can't use the main thread :(
! outlook_ns = mgr.GetOutlookForCurrentThread().GetNamespace("MAPI")
!
! if not messages:
! progress.warning("Can't find messages in folder '%s'" % (f.Name,))
! return
! message = messages.GetFirst()
! while not progress.stop_requested() and message:
! try:
! progress.tick()
! stream = mgr.GetBayesStreamForMessage(message)
! prob = mgr.score(stream)
! added_prop = False
! try:
! if outlook_ns is not None:
! outlookItem = outlook_ns.GetItemFromID(message.ID)
! format = 4 # 4=2 decimal, 3=1 decimal - index in "field chooser" combo when type=Number.
! prop = outlookItem.UserProperties.Add(config.field_name, constants.olNumber, True, format)
! prop.Value = prob
! outlookItem.Save()
! added_prop = True
! except "foo": # pythoncom.com_error, d:
! # Hrm - don't seem able to use outlook - use MAPI - but this
! # means the field doesn't automatically appear in the outlook "Field Chooser"
! # Tried explicity adding the field to the folder but still no go.
! added_prop = False
! if not added_prop:
! message.Fields.Add(config.field_name, 5, prob)
!
! message.Update()
! except pythoncom.com_error, d:
! progress.warning("Failed to get a message: %s" % (str(d),) )
! message = messages.GetNext()
# Called back from the dialog to do the actual training.
def classifier(mgr, progress):
- session = mgr.mapi
config = mgr.config.classify
if not config.folder_ids:
--- 10,23 ----
def classify_folder( f, mgr, config, progress):
! for message in f.GetMessageGenerator():
! if progress.stop_requested():
! break
! progress.tick()
! prob = mgr.score(message)
! message.SetField(config.field_name, prob)
! message.Save()
# Called back from the dialog to do the actual training.
def classifier(mgr, progress):
config = mgr.config.classify
if not config.folder_ids:
***************
*** 54,68 ****
return
progress.set_status("Counting messages")
- folders = mgr.BuildFolderList(config.folder_ids, config.include_sub)
num_msgs = 0
! for f in folders:
! num_msgs += f.Messages.Count + 1
progress.set_max_ticks(num_msgs+3)
! for f in folders:
! progress.set_status("Processing folder '%s'" % (f.Name.encode("ascii", "replace"),))
classify_folder(f, mgr, config, progress)
if progress.stop_requested():
return
--- 25,39 ----
return
progress.set_status("Counting messages")
num_msgs = 0
! for f in mgr.message_store.GetFolderGenerator(config.folder_ids, config.include_sub):
! num_msgs += f.count
progress.set_max_ticks(num_msgs+3)
! for f in mgr.message_store.GetFolderGenerator(config.folder_ids, config.include_sub):
! progress.set_status("Processing folder '%s'" % (f.name,))
classify_folder(f, mgr, config, progress)
if progress.stop_requested():
return
+ progress.set_status("Classified %d messages." % (num_msgs,))
Index: filter.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/Outlook2000/filter.py,v
retrieving revision 1.9
retrieving revision 1.10
diff -C2 -d -r1.9 -r1.10
*** filter.py 24 Oct 2002 04:22:19 -0000 1.9
--- filter.py 24 Oct 2002 13:06:39 -0000 1.10
***************
*** 5,20 ****
import sys, os
- from win32com.client import Dispatch, constants
- import pythoncom
import rule
def filter_message(message, mgr):
! try:
! stream = mgr.GetBayesStreamForMessage(message)
! except pythoncom.com_error, d:
! print "Failed to get a message: %s" % (d,)
! return
!
! prob = mgr.score(stream)
num_rules = 0
for rule in mgr.config.rules:
--- 5,12 ----
import sys, os
import rule
def filter_message(message, mgr):
! prob = mgr.score(message)
num_rules = 0
for rule in mgr.config.rules:
***************
*** 32,40 ****
only_unread = filter.only_unread
num_messages = 0
! for message in mgr.YieldMessageList(f):
if progress.stop_requested():
break
progress.tick()
! if only_unread and not message.Unread:
continue
if filter_message(message, mgr):
--- 24,32 ----
only_unread = filter.only_unread
num_messages = 0
! for message in f.GetMessageGenerator():
if progress.stop_requested():
break
progress.tick()
! if only_unread and not message.unread:
continue
if filter_message(message, mgr):
***************
*** 50,61 ****
progress.set_status("Counting messages")
- folders = mgr.BuildFolderList(filter.folder_ids, filter.include_sub)
num_msgs = 0
! for f in folders:
! num_msgs += f.Messages.Count + 1
progress.set_max_ticks(num_msgs+3)
num = 0
! for f in folders:
! progress.set_status("Filtering folder '%s'" % (f.Name.encode("ascii", "replace"),))
num += filter_folder(f, mgr, progress, filter)
if progress.stop_requested():
--- 42,52 ----
progress.set_status("Counting messages")
num_msgs = 0
! for f in mgr.message_store.GetFolderGenerator(filter.folder_ids, filter.include_sub):
! num_msgs += f.count
progress.set_max_ticks(num_msgs+3)
num = 0
! for f in mgr.message_store.GetFolderGenerator(filter.folder_ids, filter.include_sub):
! progress.set_status("Filtering folder '%s'" % (f.name))
num += filter_folder(f, mgr, progress, filter)
if progress.stop_requested():
Index: manager.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/Outlook2000/manager.py,v
retrieving revision 1.17
retrieving revision 1.18
diff -C2 -d -r1.17 -r1.18
*** manager.py 24 Oct 2002 04:58:52 -0000 1.17
--- manager.py 24 Oct 2002 13:06:39 -0000 1.18
***************
*** 11,14 ****
--- 11,15 ----
import config
+ import msgstore
try:
***************
*** 64,68 ****
self.mapi = win32com.client.Dispatch("MAPI.Session")
self.mapi.Logon(None, None, False, False)
- self._tls = {thread.get_ident(): {"outlook": outlook} }
self.outlook = outlook
os.chdir(cwd)
--- 65,68 ----
***************
*** 70,119 ****
import_core_spambayes_stuff(self.ini_filename)
self.LoadBayes()
# Outlook gives us thread grief :(
def WorkerThreadStarting(self):
pythoncom.CoInitialize()
- self._tls[thread.get_ident()] = {}
def WorkerThreadEnding(self):
- assert self._tls.has_key(thread.get_ident()), \
- "WorkerThreadStarting hasn't been called for this thread"
- del self._tls[thread.get_ident()]
pythoncom.CoUninitialize()
- def GetOutlookForCurrentThread(self):
- assert self._tls.has_key(thread.get_ident()), \
- "WorkerThreadStarting hasn't been called for this thread"
- existing = self._tls[thread.get_ident()].get("outlook")
- if not existing:
- existing = win32com.client.Dispatch("Outlook.Application")
- self._tls[thread.get_ident()]["outlook"] = existing
- return existing
-
- def GetBayesStreamForMessage(self, message):
- # Note - caller must catch COM error
- import email
-
- headers = message.Fields[0x7D001E].Value
- headers = headers.encode('ascii', 'replace')
- try:
- body = message.Fields[0x1013001E].Value # HTMLBody field
- body = body.encode("ascii", "replace") + "\n"
- except pythoncom.error:
- body = ""
- body += message.Text.encode("ascii", "replace")
-
- # XXX If this was originally a MIME msg, we're hosed at this point --
- # the boundary tag in the headers doesn't exist in the body, and
- # the msg is simply ill-formed. The miserable hack here simply
- # squashes the text part (if any) and the HTML part (if any) together,
- # and strips MIME info from the original headers.
- msg = email.message_from_string(headers + '\n' + body)
- if msg.has_key('content-type'):
- del msg['content-type']
- if msg.has_key('content-transfer-encoding'):
- del msg['content-transfer-encoding']
- return msg
-
def LoadBayes(self):
if not os.path.exists(self.ini_filename):
--- 70,82 ----
import_core_spambayes_stuff(self.ini_filename)
self.LoadBayes()
+ self.message_store = msgstore.MAPIMsgStore(outlook)
# Outlook gives us thread grief :(
def WorkerThreadStarting(self):
pythoncom.CoInitialize()
def WorkerThreadEnding(self):
pythoncom.CoUninitialize()
def LoadBayes(self):
if not os.path.exists(self.ini_filename):
***************
*** 123,126 ****
--- 86,92 ----
bayes = None
try:
+ # Ooops - Tim did it another way - checking this in before I get more conficts!
+ ## from Options import options
+ ## options.mergefiles([self.ini_filename])
bayes = cPickle.load(open(self.bayes_filename, 'rb'))
print "Loaded bayes database from '%s'" % (self.bayes_filename,)
***************
*** 169,172 ****
--- 135,141 ----
def InitNewBayes(self):
+ # Ooops - Tim did it another way - checking this in before I get more conficts!
+ ## from Options import options
+ ## options.mergefiles([self.ini_filename])
self.bayes = bayes_classifier.Bayes()
self.bayes_dirty = True
***************
*** 203,246 ****
self.bayes = None
self.config = None
! self._tls = None
!
! def BuildFolderList(self, folder_ids, include_sub):
! ret = {}
! for id in folder_ids:
! subs = []
! try:
! f = self.mapi.GetFolder(id)
! if include_sub:
! sub_ids = []
! subs = f.Folders
! for i in range(1, subs.Count):
! sub_ids.append(subs.Item(i).ID)
! subs = self.BuildFolderList(sub_ids, True)
! except pythoncom.error:
! continue
! ret[id] = f
! for sub in subs:
! ret[sub.ID] = sub
! return ret.values()
!
! def YieldMessageList(self, folder):
! messages = folder.Messages
! if not messages:
! print "Can't find messages in folder '%s'" % (folder.Name,)
! return
! message = messages.GetFirst()
! while message is not None:
! yield message
! message = messages.GetNext()
def score(self, msg, evidence=False):
return self.bayes.spamprob(bayes_tokenize(msg), evidence)
_mgr = None
! def GetManager(verbose=1):
global _mgr
if _mgr is None:
! _mgr = BayesManager(verbose=verbose)
# If requesting greater verbosity, honour it
if verbose > _mgr.verbose:
--- 172,192 ----
self.bayes = None
self.config = None
! if self.message_store is not None:
! self.message_store.Close()
! self.message_store = None
def score(self, msg, evidence=False):
+ email = msg.GetEmailPackageObject()
+ # As Tim suggested in email, score should move to range(100)
+ # This is probably a good place to do it - anyone who wants the real
+ # float value can look at the "clues"
return self.bayes.spamprob(bayes_tokenize(msg), evidence)
_mgr = None
! def GetManager(outlook = None, verbose=1):
global _mgr
if _mgr is None:
! _mgr = BayesManager(outlook=outlook, verbose=verbose)
# If requesting greater verbosity, honour it
if verbose > _mgr.verbose:
Index: rule.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/Outlook2000/rule.py,v
retrieving revision 1.4
retrieving revision 1.5
diff -C2 -d -r1.4 -r1.5
*** rule.py 20 Oct 2002 18:53:06 -0000 1.4
--- rule.py 24 Oct 2002 13:06:39 -0000 1.5
***************
*** 3,8 ****
import time
- MAPI_E_NOT_FOUND = -2147221233
-
class Rule:
def __init__(self):
--- 3,6 ----
***************
*** 14,18 ****
self.flag_message = True
self.write_field = True
! self.write_field_name = "SpamProb"
self.folder_id = ""
--- 12,16 ----
self.flag_message = True
self.write_field = True
! self.write_field_name = "SpamScore"
self.folder_id = ""
***************
*** 34,79 ****
return "You must specify the field name to create"
- def _GetFolder(self, mgr):
- try:
- return mgr.mapi.GetFolder(self.folder_id)
- except pythoncom.com_error:
- return None
-
def Act(self, mgr, msg, prob):
if mgr.verbose > 1:
! print "Rule '%s': %.2f->%.2f (%.2f) (%s)" % (self.name, self.min, self.max, prob, msg.Subject[:20].encode("ascii", "replace"))
if prob < self.min or prob > self.max:
return False
- # Do mods before we move.
- dirty = False
- outlook_ns = mgr.GetOutlookForCurrentThread().GetNamespace("MAPI")
- try:
- outlook_message = outlook_ns.GetItemFromID(msg.ID)
- except pythoncom.com_error, (hr, desc, exc, arg):
- if not exc or exc[5] != MAPI_E_NOT_FOUND:
- raise
- print "Warning: Can't open the message - it has probably been moved"
- return False
! if self.flag_message:
! outlook_message.FlagRequest = "Check Spam"
! outlook_message.FlagStatus = constants.olFlagMarked
! dirty = True
if self.write_field:
! format = 4 # 4=2 decimal, 3=1 decimal - index in "field chooser" combo when type=Number.
! prop = outlook_message.UserProperties.Add(self.write_field_name, constants.olNumber, True, format)
! prop.Value = prob
! dirty = True
! if dirty:
! outlook_message.Save()
if self.action == "None":
pass
elif self.action == "Copy":
! outlook_message.Copy(outlook_ns.GetFolderFromID(self.folder_id))
elif self.action == "Move":
! outlook_message.Move(outlook_ns.GetFolderFromID(self.folder_id))
else:
! print "Eeek - bad action", self.action
return True
--- 32,60 ----
return "You must specify the field name to create"
def Act(self, mgr, msg, prob):
if mgr.verbose > 1:
! print "Rule '%s': %.2f->%.2f (%.2f) (%s)" % (self.name, self.min, self.max, prob, repr(msg))
if prob < self.min or prob > self.max:
return False
! ## if self.flag_message:
! ## outlook_message.FlagRequest = "Check Spam"
! ## outlook_message.FlagStatus = constants.olFlagMarked
! ## dirty = True
!
if self.write_field:
! msg.SetField(self.write_field_name, prob)
! msg.Save()
if self.action == "None":
pass
elif self.action == "Copy":
! dest_folder = mgr.message_store.GetFolder(self.folder_id)
! msg.CopyTo(dest_folder)
elif self.action == "Move":
! dest_folder = mgr.message_store.GetFolder(self.folder_id)
! msg.MoveTo(dest_folder)
else:
! assert 0, "Eeek - bad action '%r'" % (self.action,)
return True
Index: train.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/Outlook2000/train.py,v
retrieving revision 1.5
retrieving revision 1.6
diff -C2 -d -r1.5 -r1.6
*** train.py 20 Oct 2002 23:51:04 -0000 1.5
--- train.py 24 Oct 2002 13:06:39 -0000 1.6
***************
*** 1,38 ****
# Train a classifier from Outlook Mail folders
! # Author: Sean D. True, WebReply.Com, Mark Hammond
# October, 2002
# Copyright PSF, license under the PSF license
! import sys, os, os.path, getopt, cPickle, string
! import win32com.client
! import pythoncom
! import win32con
def train_folder( f, isspam, mgr, progress):
from tokenizer import tokenize
num = 0
! for message in mgr.YieldMessageList(f):
if progress.stop_requested():
break
progress.tick()
! try:
! # work with MAPI until we work out how to get headers from outlook
! message = mgr.mapi.GetMessage(message.ID)
! stream = mgr.GetBayesStreamForMessage(message)
! except pythoncom.com_error, d:
! progress.warning("failed to get a message")
! print "Failed to get a message", d
! continue
mgr.bayes.learn(tokenize(stream), isspam, False)
num += 1
! print "Trained over", num, "in folder", f.Name
# Called back from the dialog to do the actual training.
def trainer(mgr, progress):
- pythoncom.CoInitialize()
config = mgr.config
mgr.InitNewBayes()
bayes = mgr.bayes
- session = mgr.mapi
if not config.training.ham_folder_ids or not config.training.spam_folder_ids:
--- 1,26 ----
# Train a classifier from Outlook Mail folders
! # Authors: Sean D. True, WebReply.Com, Mark Hammond
# October, 2002
# Copyright PSF, license under the PSF license
! import sys, os
def train_folder( f, isspam, mgr, progress):
from tokenizer import tokenize
num = 0
! for message in f.GetMessageGenerator():
if progress.stop_requested():
break
progress.tick()
! stream = message.GetEmailPackageObject()
mgr.bayes.learn(tokenize(stream), isspam, False)
num += 1
! print "Trained over", num, "in folder", f.name
# Called back from the dialog to do the actual training.
def trainer(mgr, progress):
config = mgr.config
mgr.InitNewBayes()
bayes = mgr.bayes
if not config.training.ham_folder_ids or not config.training.spam_folder_ids:
***************
*** 40,58 ****
return
progress.set_status("Counting messages")
! ham_folders = mgr.BuildFolderList(config.training.ham_folder_ids, config.training.ham_include_sub)
! spam_folders = mgr.BuildFolderList(config.training.spam_folder_ids, config.training.ham_include_sub)
num_msgs = 0
! for f in ham_folders + spam_folders:
! num_msgs += f.Messages.Count + 1
progress.set_max_ticks(num_msgs+3)
! for f in ham_folders:
! progress.set_status("Processing good folder '%s'" % (f.Name.encode("ascii", "replace"),))
train_folder(f, 0, mgr, progress)
if progress.stop_requested():
return
! for f in spam_folders:
! progress.set_status("Processing spam folder '%s'" % (f.Name.encode("ascii", "replace"),))
train_folder(f, 1, mgr, progress)
if progress.stop_requested():
--- 28,48 ----
return
progress.set_status("Counting messages")
!
num_msgs = 0
! for f in mgr.message_store.GetFolderGenerator(config.training.ham_folder_ids, config.training.ham_include_sub):
! num_msgs += f.count
! for f in mgr.message_store.GetFolderGenerator(config.training.spam_folder_ids, config.training.spam_include_sub):
! num_msgs += f.count
!
progress.set_max_ticks(num_msgs+3)
! for f in mgr.message_store.GetFolderGenerator(config.training.ham_folder_ids, config.training.ham_include_sub):
! progress.set_status("Processing good folder '%s'" % (f.name,))
train_folder(f, 0, mgr, progress)
if progress.stop_requested():
return
! for f in mgr.message_store.GetFolderGenerator(config.training.spam_folder_ids, config.training.spam_include_sub):
! progress.set_status("Processing spam folder '%s'" % (f.name,))
train_folder(f, 1, mgr, progress)
if progress.stop_requested():