[Spambayes-checkins] spambayes/Outlook2000/sandbox
find_dupe_props.py, NONE, 1.1
Mark Hammond
mhammond at users.sourceforge.net
Sun Jul 27 21:24:19 EDT 2003
Update of /cvsroot/spambayes/spambayes/Outlook2000/sandbox
In directory sc8-pr-cvs1:/tmp/cvs-serv14316
Added Files:
find_dupe_props.py
Log Message:
Little tool to find messages with duplicate property values. Useful
for finding messages that SpamBayes will consider duplicates.
For example, to find messages with the same PR_SEARCH_KEY property in 2 folders:
% find_dupe_props.py -f "\Good spam-looing mail" -f Inbox PR_SEARCH_KEY
Folder '... mail': 48 items with the property and 0 items without it
Folder 'Inbox': 1699 items with the property and 0 items without it
Found 2 items with property value '...fK\xb0\x95\xb9\xa7s\xce\x93\xcd'
Courtesy notification - 50% of your current month's allowance used
Courtesy notification - 50% of your current month's allowance used
...
This output shows the folders scanned, and the subjects for all messages
with an identical PR_SEARCH_KEY property (which is the property used by
SpamBayes to track training data). This shows that 2 items with identical
subjects have the same ID - ie, these are copies of the same message.
--- NEW FILE: find_dupe_props.py ---
from __future__ import generators
# Find MAPI items that share the same value for a given property
import pythoncom
import os, sys
from win32com.mapi import mapi, mapiutil
from win32com.mapi.mapitags import *
import mapi_driver
def FindDupeProps(driver, mapi_folder, prop_tag, dupe_dict):
    """Collect the value of one property for every item in a folder.

    Args:
        driver: object whose GetAllItems(folder) yields the items to scan.
        mapi_folder: the folder to scan; its display name is used in the
            summary line and it is used to resolve named properties.
        prop_tag: the property to collect.  May be an integer tag, a
            decimal string, the name of a PR_* constant from mapitags, or
            the name of a property in the PS_PUBLIC_STRINGS set.
        dupe_dict: dict updated in place.  Each property value seen maps
            to a list of (entry_id, subject) tuples, one per item.

    Returns:
        1 if prop_tag could not be resolved to a property, otherwise None.
    """
    hr, data = mapi_folder.GetProps((PR_DISPLAY_NAME_A,), 0)
    name = data[0][1]
    try:
        prop_tag = int(prop_tag)
    except ValueError:
        # Not a number - see if it is a constant in mapitags.
        if prop_tag.startswith("PR_") and prop_tag in globals():
            prop_tag = globals()[prop_tag]
        else:
            # Fall back to looking it up as a named property.
            props = ((mapi.PS_PUBLIC_STRINGS, prop_tag),)
            ids = mapi_folder.GetIDsFromNames(props, 0)
            if PROP_ID(ids[0]) == 0:
                print("Could not resolve property '%s'" % prop_tag)
                return 1
            prop_tag = PROP_TAG(PT_UNSPECIFIED, PROP_ID(ids[0]))

    num_with_prop = num_without_prop = 0
    for item in driver.GetAllItems(mapi_folder):
        hr, data = item.GetProps((prop_tag, PR_SUBJECT_A, PR_ENTRYID), 0)
        # Each entry of data is a (tag, value) pair.  hr only reports on
        # the call as a whole: it is non-zero as soon as *any* requested
        # property is unavailable, so testing it would discard items that
        # merely lack a subject.  A property that could not be fetched is
        # flagged by a PT_ERROR type in its returned tag instead.
        value_tag, value = data[0]
        subject_tag, subject = data[1]
        eid_tag, eid = data[2]
        if PROP_TYPE(value_tag) == PT_ERROR:
            num_without_prop += 1
            continue
        if PROP_TYPE(subject_tag) == PT_ERROR:
            # The value slot holds an error code here, not a subject.
            subject = "<no subject>"
        dupe_dict.setdefault(value, []).append((eid, subject))
        num_with_prop += 1
    print("Folder '%s': %d items with the property and %d items without it"
          % (name, num_with_prop, num_without_prop))
def DumpDupes(dupe_dict):
    """Print each property value that is shared by more than one item.

    dupe_dict maps a property value to a list of (entry_id, subject)
    tuples.  For every value with at least two entries a header line is
    printed, followed by the subject of each entry on its own line.
    """
    for prop_value, entries in dupe_dict.items():
        count = len(entries)
        if count <= 1:
            # A value seen only once is not a duplicate.
            continue
        print("Found %d items with property value %r" % (count, prop_value))
        for _entry_id, subject_text in entries:
            print(" %s" % (subject_text,))
def usage(driver):
    """Print the command-line help for this script.

    Args:
        driver: object whose GetFolderNameDoc() returns text that is
            embedded in the help message.
    """
    folder_doc = driver.GetFolderNameDoc()
    msg = """\
Usage: %s [-f foldername] [-f ...] property_name_or_tag
  -f - Search for messages in the specified folders (default = Inbox)
  -n - Show top-level folder names and exit

Finds messages that share the same value for the specified property and
prints the subject of every message in each group of duplicates.  The
property may be given as an integer tag, the name of a PR_* constant, or
the name of a named property.

%s

Use the -n option to see all top-level folder names from all stores.""" \
        % (os.path.basename(sys.argv[0]), folder_doc)
    print(msg)
def main():
    """Parse the command line, scan the folders and report duplicates.

    Options:
        -f foldername  folder to scan; may be repeated (default: Inbox)
        -n             list the top-level folder names and exit

    Exactly one positional argument - the property tag or name - is
    required.  Exits with status 1 on a bad command line, an unknown
    folder, or a property that cannot be resolved.
    """
    driver = mapi_driver.MAPIDriver()
    import getopt
    try:
        opts, args = getopt.getopt(sys.argv[1:], "f:n")
    except getopt.error:
        # Fetch the exception via sys.exc_info() rather than binding it in
        # the except clause, so the syntax is valid on any Python version.
        print(sys.exc_info()[1])
        print("")
        usage(driver)
        sys.exit(1)

    folder_names = []
    for opt, opt_val in opts:
        if opt == "-f":
            folder_names.append(opt_val)
        elif opt == "-n":
            driver.DumpTopLevelFolders()
            sys.exit(1)
        else:
            print("Invalid arg")
            return

    if not folder_names:
        folder_names = ["Inbox"]  # Assume this exists!
    if len(args) != 1:
        print("You must specify a property tag/name")
        print("")
        usage(driver)
        sys.exit(1)

    dupe_dict = {}
    for folder_name in folder_names:
        try:
            folder = driver.FindFolder(folder_name)
        except ValueError:
            print(sys.exc_info()[1])
            sys.exit(1)
        # FindDupeProps returns a true value when the property cannot be
        # resolved; it has already printed the reason, so stop here rather
        # than repeating the failure for every remaining folder.
        if FindDupeProps(driver, folder, args[0], dupe_dict):
            sys.exit(1)
    DumpDupes(dupe_dict)
# Run the tool only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
More information about the Spambayes-checkins
mailing list