email parsing

ra9ftm ra9ftm2 at gmail.com
Wed Aug 27 12:45:02 EDT 2008


This is my first Python script. I don't know whether it uses the
modules correctly, but it works fine, especially with Russian code pages
and MIME-formatted messages, including quoted-printable and
base64-encoded ones.

I would be very grateful if anybody could post comments on this script.
Is it good or bad?


import email
import mailbox
from email.Header import decode_header
from email.Header import make_header
import string
import sys

# Encoding applied to header and body text before it is printed.
outEnc = "cp866"
# File to read: taken from the first command-line argument.
infile = sys.argv[1]

# Footer separators, tried in this order.  The message is cut at the LAST
# occurrence of the first separator that is present in the text.
subStrObrez = [
    "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~",
    """~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
To UNSUBSCRIBE from this forum, send an email to:""",
    "~~~~~~~~~~~~~~~~~~",
]


# Cut yahoo info at the end of message
def obrez(strMsg):
    """Return *strMsg* with the trailing footer removed.

    Each separator in ``subStrObrez`` is tried in list order; for the first
    one found, everything from its last occurrence onwards is dropped.  If
    none of the separators occurs, *strMsg* is returned unchanged.
    """
    for s in subStrObrez:
        # Use the string method rather than the deprecated string.rfind()
        # module function, which no longer exists in Python 3.
        n = strMsg.rfind(s)
        if n != -1:
            return strMsg[:n]
    return strMsg

# Convert message header
def my_get_header(str):
    """Decode a (possibly RFC 2047 encoded) header value to text.

    The header is split with ``decode_header``; every fragment is decoded
    with its declared charset and the fragments are concatenated, each one
    followed by a single space (so the result ends with a space).

    str -- raw header value, or None when the header is absent.  The
           parameter name shadows the builtin; it is kept so that existing
           callers are not affected.

    Returns the decoded text; an empty string when *str* is None.
    """
    if str is None:
        # A message without this header: nothing to decode.
        return ""
    pieces = []
    for val, encoding in decode_header(str):
        if isinstance(val, bytes):
            # Fragments without a charset are treated as ASCII; bytes that
            # cannot be decoded are replaced instead of aborting the run.
            try:
                val = val.decode(encoding or "ascii", "replace")
            except LookupError:
                # The header names a charset Python does not know.
                val = val.decode("ascii", "replace")
        pieces.append(val + " ")
    return "".join(pieces)

# Process the message
def _body_text(part, cut_footer):
    """Return the payload of *part* as a byte string ready for printing.

    The payload is fetched with ``get_payload(None, True)``, i.e. with the
    content-transfer-encoding already undone.  If the part declares a
    charset, the payload is re-coded from that charset to ``outEnc``;
    otherwise the raw bytes are passed through unchanged.  When
    *cut_footer* is true the text is passed through ``obrez`` first.
    """
    data = part.get_payload(None, True)
    charset = part.get_content_charset()
    if charset:
        try:
            data = data.decode(charset, "replace")
        except LookupError:
            # Unknown charset label: handle the part as if none was given.
            charset = None
    if cut_footer:
        data = obrez(data)
    if charset:
        # "replace" keeps one unmappable character from aborting the run.
        data = data.encode(outEnc, "replace")
    return data


def proc(msg):
    """Print the headers and the text body of *msg* (Python 2 code).

    Prints the From, To and Subject headers re-coded to ``outEnc``, a blank
    line, and then:
      * for a multipart message, every text/plain part with its footer cut;
      * for a single-part text/plain message, the body with its footer cut;
      * for a single-part text/html message, the body as is.
    Any other content type produces no body output.

    msg -- an ``email`` message object.
    """
    # Single-argument print(...) calls behave the same as the Python 2
    # print statement and also parse under Python 3.
    for label, name in (('From   : ', 'From'),
                        ('To     : ', 'To'),
                        ('Subject: ', 'Subject')):
        # msg[name] is None when the header is missing.
        value = my_get_header(msg[name] or "")
        print(label + value.encode(outEnc, "replace"))
    print("")

    if msg.is_multipart():
        for part in msg.walk():
            if part.get_content_type() == "text/plain":
                print(_body_text(part, True))
    elif msg.get_content_type() == "text/plain":
        print(_body_text(msg, True))
    elif msg.get_content_type() == "text/html":
        print(_body_text(msg, False))


####################################################################################
#  The main program
#
# Python 2 script body.  Reads the mailbox file named by `infile` and, for
# every message whose decoded Subject contains one of the tags in RubLst,
# prints an "SB <code>@FORUM < INET" line, the subject, the message itself
# (via proc) and the closing "powered by Python" and "/EX" lines.

# [subject tag, code used in the "SB" line]
RubLst = [
    ["[contestru]", "FOTSTR"],
    ["[russiandx]", "FORUDX"],
]

f = open(infile, "rb")
try:
    for msg in mailbox.UnixMailbox(f, email.message_from_file):
        # Decode the subject once per message; a missing Subject header
        # (None) is treated as empty.
        subject = my_get_header(msg['Subject'] or "")
        for rub in RubLst:
            if rub[0] in subject:
                print("SB " + rub[1] + "@FORUM < INET")
                print(subject.encode(outEnc))
                print("")
                proc(msg)
                print("")
                print("powered by Python")
                print("/EX")
finally:
    # Close the mailbox file even if a message fails to process.
    f.close()



More information about the Python-list mailing list