[Spambayes-checkins] spambayes tokenizer.py,1.28,1.29
Tim Peters
tim_one@users.sourceforge.net
Thu, 19 Sep 2002 23:18:26 -0700
Update of /cvsroot/spambayes/spambayes
In directory usw-pr-cvs1:/tmp/cvs-serv29244
Modified Files:
tokenizer.py
Log Message:
get_message(): Added docstring. Reduced useless nesting. Moved
inappropriate code (the read() call on file-like arguments) out of the
try block. In case of a message parse error, the (probably malformed)
headers -- everything through the first blank line -- are now stripped
before the remaining text is wrapped in a bare Message object.
Index: tokenizer.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/tokenizer.py,v
retrieving revision 1.28
retrieving revision 1.29
diff -C2 -d -r1.28 -r1.29
*** tokenizer.py 20 Sep 2002 06:06:13 -0000 1.28
--- tokenizer.py 20 Sep 2002 06:18:24 -0000 1.29
***************
*** 832,848 ****
def get_message(self, obj):
if isinstance(obj, email.Message.Message):
return obj
! else:
! # Create an email Message object.
! try:
! if hasattr(obj, "read"):
! obj = obj.read()
! return email.message_from_string(obj)
! except email.Errors.MessageParseError:
! # XXX: This puts the headers in the payload...
! msg = email.Message.Message()
! msg.set_payload(obj)
! return msg
def tokenize(self, obj):
--- 832,864 ----
def get_message(self, obj):
+ """Return an email Message object.
+
+ The argument may be a Message object already, in which case it's
+ returned as-is.
+
+ If the argument is a string or file-like object (supports read()),
+ the email package is used to create a Message object from it. This
+ can fail if the message is malformed. In that case, the headers
+ (everything through the first blank line) are thrown out, and the
+ rest of the text is wrapped in a bare email.Message.Message.
+ """
+
if isinstance(obj, email.Message.Message):
return obj
! # Create an email Message object.
! if hasattr(obj, "read"):
! obj = obj.read()
! try:
! msg = email.message_from_string(obj)
! except email.Errors.MessageParseError:
! # Wrap the raw text in a bare Message object. Since the
! # headers are most likely damaged, we can't use the email
! # package to parse them, so just get rid of them first.
! i = obj.find('\n\n')
! if i >= 0:
! obj = obj[i+2:] # strip headers
! msg = email.Message.Message()
! msg.set_payload(obj)
! return msg
def tokenize(self, obj):