parsing incoming emails

Thu Jul 10 16:13:53 EDT 2008

Ahmed wrote...

> I am working on a project where I need to parse incoming emails
> (Microsoft outlook)

I'm not sure if you are able to bypass Outlook (and have Python fetch the
mail itself using poplib), but if you are, the following code might be
useful.  I use this to pry apart emails which might contain multiple MIME parts.

import logging
import poplib
import smtplib
from email.Parser import Parser
from rfc822 import parseaddr

# connection settings for the POP3 mailbox that is polled below
popserver = "pop.site.com"
# NOTE(review): "user at site.com" looks like an archive-obfuscated address
# (presumably user@site.com) -- confirm and replace with the real login
popuser = "user at site.com"
# placeholder credential; do not keep a real password in source
poppassword = "secret"

# split a message into a header part and a body part
def separate(msg):
    """Split a message at its first empty line.

    msg may be a single string (it is split on '\n') or an already split
    sequence of lines.  The separator is the first element that is exactly
    the empty string; it is not included in either part.

    Returns a (headers, body) pair of line sequences.  When there is no
    empty line at all (e.g. a message consisting only of headers), every
    line is treated as a header and the body is empty instead of raising
    ValueError.
    """
    if isinstance(msg, str):
        msg = msg.split('\n')
    try:
        emptyline = msg.index('')
    except ValueError:
        # no separator: everything is header, the slice below yields no body
        emptyline = len(msg)
    return msg[:emptyline], msg[emptyline + 1:]

# return the value of a certain header line from the headers
def headerline(header, tag="From: "):
    """Return the value of the first header line that starts with tag.

    header is an iterable of header lines; tag is the field name including
    its colon (e.g. "From:" or "Subject:"), optionally followed by a space.
    The field name is matched case-insensitively, since mail header names
    are not case-sensitive.

    The returned value is everything after the tag with surrounding
    whitespace removed, so it is the same whether or not the tag carries a
    trailing space.  Returns "" when no line matches.
    """
    wanted = tag.lower()
    for line in header:
        if line.lower().startswith(wanted):
            # slice by the tag length only; strip() removes the separating
            # space instead of blindly skipping one extra character
            return line[len(tag):].strip()
    return ""

# enumerate recursively the contents of a MIME message
# remember the first text/plain and text/html part(s) that is found
# also remember if any other parts were found (like attachments)
#
def enummimeparts(msg,extract,level=1,verbose=False):
    m=Parser().parsestr(msg)
    if m.is_multipart():
        if verbose: print '\t'*level,'multipart'
        for part in m.get_payload():
            enummimeparts(part.as_string(),extract,level+1,verbose)
    else:
        t=m.get_content_type()
        if verbose: print '\t'*level,t
        if t=="text/plain":
            if not "text/plain" in extract:
                headers,body=separate(m.as_string())
                extract["text/plain"]='\n'.join(body)
            else:
                extract["others"]=True
        elif t=="text/html":
            if not "text/html" in extract:
                headers,body=separate(m.as_string())
                extract["text/html"]='\n'.join(body)
            else:
                extract["others"]=True
        else:
            extract["others"]=True

# extract the first 'text/plain' and 'text/html' mime-parts from a message
def extracttext(msg):
    """Return (plain, html, others) for the message string msg.

    plain and html are the body text of the first text/plain and text/html
    parts found by enummimeparts, or None when the message has no such
    part.  others is True when any further part was found, otherwise False.
    """
    extract = {}
    enummimeparts(msg, extract)
    # the tuple must be part of the return statement itself; a bare
    # 'return' followed by the expression on the next line returns None
    return (
        extract.get("text/plain", None),
        extract.get("text/html", None),
        extract.get("others", False),
    )

def processmessage(msgnr):
    """Fetch message number msgnr from the POP server, process and delete it.

    Uses the module-level POP3 connection 'pop'.  The subject and sender
    address are logged, the message text is handed to processtext() and the
    message is then flagged for deletion on the server.  If any step raises,
    the exception propagates and the message is not flagged.

    NOTE(review): processtext() is not defined in the visible code; it has
    to be supplied by the surrounding program.
    """
    # get a message from the POP server, extract the parts
    _, lines, _ = pop.retr(msgnr)
    msg = '\n'.join(lines)
    headers, _ = separate(lines)
    _, fromaddress = parseaddr(headerline(headers, "From:"))
    subject = headerline(headers, "Subject:")
    logging.info(subject + " (" + fromaddress + ")")
    plain, html, _ = extracttext(msg)
    # prefer flat text; if not present in the message, fall back to HTML
    # content (if any), and to an empty string when neither exists
    texttoprocess = plain or html or ""
    # now do something useful with the text
    processtext(texttoprocess)
    # delete message from pop server after processing
    pop.dele(msgnr)

# connect to the pop server and process all messages
logging.info("Checking pop server '%s', user '%s'", popserver, popuser)
pop = poplib.POP3(popserver)
try:
    pop.user(popuser)
    pop.pass_(poppassword)
    # first element of the STAT result is the number of messages
    stat = pop.stat()
    # POP3 message numbers start at 1
    for n in range(1, stat[0] + 1):
        processmessage(n)
finally:
    # always end the session, even when a message fails: the connection is
    # released and, on most servers, deletions flagged so far only take
    # effect at QUIT
    pop.quit()

-- 
"The ability of the OSS process to collect and harness
the collective IQ of thousands of individuals across
the Internet is simply amazing." - Vinod Valloppillil
http://www.catb.org/~esr/halloween/halloween4.html