[Python-checkins] python/nondist/sandbox/spambayes timtest.py,1.2,1.3

tim_one@users.sourceforge.net tim_one@users.sourceforge.net
Sat, 31 Aug 2002 11:56:30 -0700


Update of /cvsroot/python/python/nondist/sandbox/spambayes
In directory usw-pr-cvs1:/tmp/cvs-serv31583

Modified Files:
	timtest.py 
Log Message:
textparts():  This was failing to weed out redundant HTML in cases like
this:

    multipart/alternative
        text/plain
        multipart/related
            text/html

The tokenizer here also transforms everything to lowercase, but that's
incidental:  it's simply what I happen to be testing right now.  Can't say
for sure until the test runs finish, but so far lowercasing looks like a
bad idea for the false positive rate.


Index: timtest.py
===================================================================
RCS file: /cvsroot/python/python/nondist/sandbox/spambayes/timtest.py,v
retrieving revision 1.2
retrieving revision 1.3
diff -C2 -d -r1.2 -r1.3
*** timtest.py	31 Aug 2002 17:25:10 -0000	1.2
--- timtest.py	31 Aug 2002 18:56:28 -0000	1.3
***************
*** 15,18 ****
--- 15,23 ----
  import classifier
  
+ # Find all the text components of the msg.  There's no point decoding
+ # binary blobs (like images).  If a multipart/alternative has both plain
+ # text and HTML versions of a msg, ignore the HTML part:  HTML decorations
+ # have monster-high spam probabilities, and innocent newbies often post
+ # using HTML.
  def textparts(msg):
      text = Set()
***************
*** 20,25 ****
      for part in msg.walk():
          if part.get_content_type() == 'multipart/alternative':
!             textpart = htmlpart = None
!             for subpart in part.get_payload():
                  ctype = subpart.get_content_type()
                  if ctype == 'text/plain':
--- 25,34 ----
      for part in msg.walk():
          if part.get_content_type() == 'multipart/alternative':
!             # Descend this part of the tree, adding any redundant HTML text
!             # part to redundant_html.
!             htmlpart = textpart = None
!             stack = part.get_payload()
!             while stack:
!                 subpart = stack.pop()
                  ctype = subpart.get_content_type()
                  if ctype == 'text/plain':
***************
*** 27,37 ****
                  elif ctype == 'text/html':
                      htmlpart = subpart
  
!             if textpart is not None:
!                 text.add(textpart)
!                 if htmlpart is not None:
!                     redundant_html.add(htmlpart)
!             elif htmlpart is not None:
!                 text.add(htmlpart)
  
          elif part.get_content_maintype() == 'text':
--- 36,46 ----
                  elif ctype == 'text/html':
                      htmlpart = subpart
+                 elif ctype == 'multipart/related':
+                     stack.extend(subpart.get_payload())
  
!             if textpart is not None and htmlpart is not None:
!                 redundant_html.add(htmlpart)
!             # If only textpart was found, the main walk() will eventually
!             # add it to text.
  
          elif part.get_content_maintype() == 'text':
***************
*** 60,64 ****
          yield 'control: MessageParseError'
          if nohead is not None:
!             for w in nohead.split():
                  if 3 <= len(w) <= 12:
                      yield w
--- 69,73 ----
          yield 'control: MessageParseError'
          if nohead is not None:
!             for w in nohead.lower().split():
                  if 3 <= len(w) <= 12:
                      yield w
***************
*** 74,78 ****
                  yield 'control: payload is None'
              else:
!                 for w in text.split():
                      if 3 <= len(w) <= 12:
                          yield w
--- 83,87 ----
                  yield 'control: payload is None'
              else:
!                 for w in text.lower().split():
                      if 3 <= len(w) <= 12:
                          yield w