[Spambayes-checkins] spambayes imapfilter.py,1.2,1.3

Tony Meyer anadelonbrin at users.sourceforge.net
Tue Apr 8 01:37:31 EDT 2003


Update of /cvsroot/spambayes/spambayes
In directory sc8-pr-cvs1:/tmp/cvs-serv4387

Modified Files:
	imapfilter.py 
Log Message:
Introduces an IMAPMessage class based on the
spambayes Message class.
Introduces an iterable IMAPFolder class.
Changes the code to use all of this.
Changed to allow multiple folders to filter.
Training seems to work, although filtering isn't.  I'll get to it.

Index: imapfilter.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/imapfilter.py,v
retrieving revision 1.2
retrieving revision 1.3
diff -C2 -d -r1.2 -r1.3
*** imapfilter.py	8 Apr 2003 01:24:17 -0000	1.2
--- imapfilter.py	8 Apr 2003 07:37:28 -0000	1.3
***************
*** 1,4 ****
--- 1,6 ----
  #!/usr/bin/env python
  
+ from __future__ import generators
+ 
  """An IMAP filter.  An IMAP message box is scanned and all non-scored
  messages are scored and (where necessary) filtered.
***************
*** 18,28 ****
  
  __author__ = "Tony Meyer <ta-meyer at ihug.co.nz>"
! __credits__ = "All the Spambayes folk."
  
! # This code will benefit immensely from
! # (a) The new message class, which can hold information such as
! #     whether a message has been seen before
! # (b) The new header stuff, which will abstract out adding all
! #     the headers
  
  try:
--- 20,31 ----
  
  __author__ = "Tony Meyer <ta-meyer at ihug.co.nz>"
! __credits__ = "Tim Stone, All the Spambayes folk."
  
! # Tony thinks it would be nice if there was a web ui to
! # this for the initial setup (i.e. like pop3proxy), which offered
! # a list of folders to filter/train/etc.  It could then record a
! # uid for the folder rather than a name, and it avoids the problems
! # with different imap servers having different naming styles
! # a list is retrieved via imap.list()
  
  try:
***************
*** 37,47 ****
  import re
  import time
  
  from spambayes.Options import options
! from spambayes import tokenizer, storage
  
  class IMAPFilter(object):
      def __init__(self):
!         self.imap = imaplib.IMAP4(options.imap_server, options.imap_port)
          if options.verbose:
              print "Loading database...",
--- 40,138 ----
  import re
  import time
+ import sys
  
  from spambayes.Options import options
! from spambayes import tokenizer, storage, message
! 
! # global IMAPlib object
! imap = None
! 
! class IMAPMessage(message.Message):
!     # response checking is necessary throughout this class
!     def __init__(self, folder_id, folder_name, message_id):
!         message.Message.__init__(self)
!         self.setId(message_id)
!         self.folder_id = folder_id
!         self.folder_name = folder_name
! 
!     def extractTime(self):
!         # When we create a new copy of a message, we need to specify
!         # a timestamp for the message.  Ideally, this would be the
!         # timestamp from the message itself, but for the moment, we
!         # just use the current time.
!         return imaplib.Time2Internaldate(time.time())
! 
!     def Update(self):
!         # we can't actually update the message with IMAP
!         # so what we do is create a new message and delete the old one
!         response = imap.append(self.folder_name, None,
!                                self.extractTime(), self.get_payload())
!         response = imap.select(self.folder_name, False)
!         response = imap.uid("STORE", self.getId(), "+FLAGS.SILENT",
!                                  "(\\Deleted)")
!         # we need to update the uid, as it will have changed
!         response = imap.uid("SEARCH", "(TEXT)", self.get_payload())
!         self.changeId(response[1][0])
! 
! 
! class IMAPFolder(object):
!     # response checking is necessary throughout this class
!     def __init__(self, folder_name, readOnly=True):
!         self.name = folder_name
!         # Convert folder name to a uid
!         self.uid = None
!         response = imap.select(self.name, readOnly)
!         responses = imap.response("OK")[1]
!         for response in responses:
!             if response[:13] == "[UIDVALIDITY ":
!                 r = re.compile(r"(?P<uid>\d+)")
!                 self.uid = r.search(response[13:]).group('uid')
!         # We really want to use RFC822.PEEK here, as that doesn't effect
!         # the status of the message.  Unfortunately, it appears that not
!         # all IMAP servers support this, even though it is in RFC1730
!         self.rfc822_command = "(RFC822.PEEK)"
!         response = imap.fetch("1:1", self.rfc822_command)
!         if response[0] != "OK":
!             self.rfc822_command = "(RFC822)"
!         
!     def __iter__(self):
!         '''IMAPFolder is iterable'''
!         for key in self.keys():
!             try:
!                 yield self[key]
!             except KeyError:
!                 pass
! 
!     def keys(self):
!         '''Returns uids for all the messages in the folder'''
!         # request message range
!         response = imap.select(self.name, True)
!         total_messages = response[1][0]
!         if total_messages == '0':
!             return []
!         response = imap.fetch("1:" + total_messages, "UID")
!         r = re.compile(r"[0-9]+ \(UID ([0-9]+)\)")
!         uids = []
!         for i in response[1]:
!             mo = r.match(i)
!             if mo is not None:
!                 uids.append(mo.group(1))
!         return uids
! 
!     def __getitem__(self, key):
!         '''Return message matching the given uid'''
!         response = imap.uid("FETCH", key, self.rfc822_command)
!         messageText = response[1][0][1]
!         # we return an instance of *our* message class, not the
!         # raw rfc822 message
!         msg = IMAPMessage(self.uid, self.name, key)
!         msg.setPayload(messageText)
!         return msg
!        
  
  class IMAPFilter(object):
      def __init__(self):
!         global imap
!         imap = imaplib.IMAP4(options.imap_server, options.imap_port)
          if options.verbose:
              print "Loading database...",
***************
*** 54,64 ****
          if options.verbose:
              print "Done."
-         # Unique names for cached messages - see getNewMessageName() below.
-         self.lastBaseMessageName = ''
-         self.uniquifier = 2
- 
-     def Login(self):
-         lgn = self.imap.login(options.imap_username, options.imap_password)
-         self._check(lgn, 'login')
  
      def _check(self, response, command):
--- 145,148 ----
***************
*** 67,139 ****
              sys.exit(-1)
  
-     def _getUIDs(self, low, high):
-         # Retreive a list of uids corresponding to the given range
-         if high < low: return []
-         # request message range
-         range = str(low) + ":" + str(high)
-         res = self.imap.fetch(range, "UID")
-         self._check(res, 'fetch')
-         r = re.compile(r"[0-9]+ \(UID ([0-9]+)\)")
-         res2 = []
-         for i in res[1]:
-             mo = r.match(i)
-             if mo is not None:
-                 res2.append(mo.group(1))
-         return res2
- 
-     def getNewMessageName(self):
-         # The message name is the time it arrived, with a uniquifier
-         # appended if two arrive within one clock tick of each other.
-         # (This is completely taken from the same function in pop3proxy's
-         # State class.)
-         messageName = "%10.10d" % long(time.time())
-         if messageName == self.lastBaseMessageName:
-             messageName = "%s-%d" % (messageName, self.uniquifier)
-             self.uniquifier += 1
-         else:
-             self.lastBaseMessageName = messageName
-             self.uniquifier = 2
-         return messageName
- 
      def _selectFolder(self, name, read_only):
!         folder = self.imap.select(name, read_only)
          self._check(folder, 'select')
          return folder
  
!     def RetrieveMessage(self, uid):
!         response = self.imap.uid("FETCH", uid, "(RFC822.PEEK)")
!         self._check(response, 'uid fetch')
!         try:
!             messageText = response[1][0][1]
!         except:
!             print "Could not retrieve message (id %s)" % uid
!             messageText = ""
! 
!         msg = spambayes.message.Message()
!         msg.setPayload(messageText)
!         msg.setId(uid)
! 
!         msg.delSBHeaders()  # never include sb headers in a train
!                 
!         return msg
  
      def TrainFolder(self, folder_name, isSpam):
!         response = self._selectFolder(folder_name, True)
!         uids = self._getUIDs(1, int(response[1][0]))
!         for uid in uids:
!             msg = self.RetrieveMessage(uid)
! 
              if msg.isTrained():
!                 if isSpam and msg.isTrndHam():
!                     bayes.unlearn(msg.asTokens(), False)  # untrain the ham
!                 elif not isSpam and msg.isTrndSpam():
!                     bayes.unlearn(msg.asTokens(), True)
!                 
!             bayes.learn(msg.asTokens(), isSpam) # train as spam
! 
!             if isSpam:
!                 msg.trndAsSpam()
!             else:
!                 msg.trndAsHam()
  
      def Train(self):
--- 151,175 ----
              sys.exit(-1)
  
      def _selectFolder(self, name, read_only):
!         folder = imap.select(name, read_only)
          self._check(folder, 'select')
          return folder
  
!     def Login(self):
!         lgn = imap.login(options.imap_username, options.imap_password)
!         self._check(lgn, 'login')
  
      def TrainFolder(self, folder_name, isSpam):
!         folder = IMAPFolder(folder_name)
!         for msg in folder:
              if msg.isTrained():
!                 if msg.isTrndAs(isSpam):
!                     # already trained, nothing for us to do here
!                     # (we don't want to train the same message twice)
!                     continue
!                 if msg.isTrained():
!                     self.classifier.unlearn(msg.asTokens(), not isSpam)
!             self.classifier.learn(msg.asTokens(), isSpam)
!             msg.trndAs(isSpam)
  
      def Train(self):
***************
*** 155,169 ****
          if options.verbose:
              t = time.time()
!         inbox = self._selectFolder(options.imap_inbox, False)
!         # the number of messages are returned
!         # get all the corresponding UIDs
!         uids = self._getUIDs(1, int(inbox[1][0]))
!         
!         for uid in uids:
!             msg = self.RetrieveMessage(uid)
!             (prob, clues) = self.classifier.spamprob(msg.asTokens(), evidence=True)
!             msg.addSBHeaders(prob, clues) # adds headers and remembers classification
!             self._updateMessage(msg)
!             self._filterMessage(msg)
          if options.verbose:
              print "Filtering took", time.time() - t, "seconds."
--- 191,204 ----
          if options.verbose:
              t = time.time()
!         for filter_folder in options.imap_filter_folders.split():
!             folder = IMAPFolder(filter_folder, False)
!             for msg in folder:
!                 (prob, clues) = self.classifier.spamprob(msg.asTokens(),
!                                                          evidence=True)
!                 # add headers and remember classification
!                 msg.addSBHeaders(prob, clues)
!                 # XXX updating is disabled for the moment
!                 # msg.Update()
!                 self._filterMessage(msg)
          if options.verbose:
              print "Filtering took", time.time() - t, "seconds."
***************
*** 172,193 ****
          # sign off
          if options.imap_expunge:
!             self.imap.expunge()
!         self.imap.logout()
! 
!     def _updateMessage(self, msg):
!         # we can't actually update the message with IMAP
!         # XXX (someone tell me if this is wrong!)
!         # so what we do is create a new message and delete the old one
!         # we return the new uid, which we obtain by searching for the
!         # spambayes id
!         res = self.imap.append(options.imap_inbox, None,
!                                self._extractTimeFromMessage(msg),
!                                msg.payload())
!         self._check(res, "append")
!         res = self.imap.uid("STORE", msg.getId(), "+FLAGS.SILENT", "(\\Deleted)")
!         self._check(res, "uid store")
!         res = self.imap.uid("SEARCH", "(TEXT)", msg.payload())
!         self._check(res, "uid search")
!         return res[1][0]
  
      def _extractTimeFromMessage(self, msg):
--- 207,212 ----
          # sign off
          if options.imap_expunge:
!             imap.expunge()
!         imap.logout()
  
      def _extractTimeFromMessage(self, msg):
***************
*** 198,221 ****
          return imaplib.Time2Internaldate(time.time())
  
!     def _moveMessage(self, msg, dest):
          # The IMAP copy command makes an alias, not a whole new
          # copy, so what we need to do (sigh) is create a new message
          # in the correct folder, and delete the old one
!         # XXX (someone tell me if this is wrong, too!)
!         response = self.imap.uid("FETCH", msg.getId(), "(RFC822.PEEK)")
          self._check(response, 'uid fetch')
! 
!         msg = spambayes.message.Message()
          msg.setPayload(response[1][0][1])
!         msg.setId(_extractTimeFromMessage(msg))
  
!         response = self.imap.append(dest, None, msg.getId(), msg.payload())
          self._check(response, "append")
!         res = self.imap.uid("STORE", msg.getId(), "+FLAGS.SILENT", "(\\Deleted)")
          self._check(response, "uid store")
  
!     def _filterMessage(self, msg, prob):
          if msg.isClsfdHam():
              # we leave ham alone
              pass
          elif msg.isClsfdSpam():
--- 217,246 ----
          return imaplib.Time2Internaldate(time.time())
  
!     def _moveMessage(self, old_msg, dest):
          # The IMAP copy command makes an alias, not a whole new
          # copy, so what we need to do (sigh) is create a new message
          # in the correct folder, and delete the old one
!         # XXX (someone tell me if this is wrong)
!         response = imap.uid("FETCH", old_msg.getId(), "(RFC822)")
          self._check(response, 'uid fetch')
!         msg = message.Message()
          msg.setPayload(response[1][0][1])
!         #response = imap.uid("SEARCH", "(TEXT)", msg.get_payload())
!         #self._check(response, "search")
!         #self.changeId(response[1][0])
  
!         response = imap.append(dest, None,
!                                self._extractTimeFromMessage(msg),
!                                msg.get_payload())
          self._check(response, "append")
!         self._selectFolder(old_msg.folder_name, False)
!         response = imap.uid("STORE", old_msg.getId(), "+FLAGS.SILENT",
!                             "(\\Deleted)")
          self._check(response, "uid store")
  
!     def _filterMessage(self, msg):
          if msg.isClsfdHam():
              # we leave ham alone
+             print "untouched"
              pass
          elif msg.isClsfdSpam():
***************
*** 227,230 ****
--- 252,256 ----
      options.verbose = True
      imap_filter = IMAPFilter()
+ #    imap_filter.imap.debug = 10
      imap_filter.Login()
      imap_filter.Train()





More information about the Spambayes-checkins mailing list