[Spambayes] Re: Mailbox class in the spambayes project & python2.2.1

Greg Ward gward@python.net
Thu, 26 Sep 2002 13:54:22 -0400


---------------------- multipart/mixed attachment
On 26 September 2002, Alexander Leidinger said:
> No, cleanarch damaged the files in the first run. After cleaning them up
> (removing the '>' in front of every line which begins with ">From " and
> has a date at the end) I now have to find those lines, which need the
> '>' now. I already found some in mboxes with upto 90 messages (all of
> them had at least one unparseable message) and fixed it.

The mbox format sucks.  All tools that parse mbox suck; Python's
mailbox.py, however, sucks slightly more than most.  formail is a tool
bundled with procmail; it does a pretty good job of splitting up an mbox
file.  The best use for it is to convert that mbox to a Maildir.  I'll
attach my scripts for converting an mbox to a Maildir.

(BTW, looking at the code, it looks like the attached addtomaildir
script does *not* convert the date in "From " to a Delivery-date header.
It probably should.)

(Oh, Alexander, I think you'll have problems because of the weird 
"From " lines that you showed in your mail.  You'll need to tweak the
regex used to parse "From " lines in addtomaildir if you want to use
it on that mbox file.  Good luck!)

        Greg
-- 
Greg Ward <gward@python.net>                         http://www.gerg.ca/
Eschew obfuscation!

---------------------- multipart/mixed attachment
#!/bin/sh

# Convert an mbox mail file to a maildir.
# Requires formail (from procmail) and my addtomaildir script.
# Idea stolen from
#   http://www.nb.net/~lbudney/linux/software/safecat/one-liners.html
#
# by Greg Ward, 2002/01/22
#
# Usage:
#   mb2md mbox maildir
# where mbox must exist, and maildir must not exist.

if [ "$#" -ne 2 ] ; then
  echo "usage: $0 mbox maildir" >&2
  exit 1
fi

mbox=$1
maildir=$2

if [ -e "$maildir" ] ; then
  echo "error: $maildir already exists" >&2
  exit 1
fi

if [ ! -e "$mbox" ] ; then
  echo "error: $mbox does not exist" >&2
  exit 1
fi

# Spell out the three subdirectories: brace expansion ({cur,new,tmp}) is
# a bash/ksh extension that a plain /bin/sh is not required to support.
# Quote the paths so names containing whitespace survive word splitting.
mkdir -p "$maildir/cur" "$maildir/new" "$maildir/tmp" || exit 1

# formail -s splits the mbox and runs addtomaildir once per message,
# feeding that message on its stdin.
formail -s addtomaildir "$maildir" < "$mbox"

---------------------- multipart/mixed attachment
#!/usr/bin/env python

"""addtomaildir

Reads an RFC 822 message (possibly with leading "From " line) on stdin
and adds it to a Maildir.  The exact details of where it lands and what
it's called in the Maildir depend on various header values in the input
message:

  * if "Status" is "O" or "RO", the message goes in "cur"; otherwise
    (header missing, empty, or any other value) it goes in "new"

  * if "Status" is "O" (old), the filename has no info field

  * if "Status" is "RO" (read old), the filename has ":2,S" appended
    as its info field

  * the mtime of the file will be the delivery time of the message,
    if we can figure out the delivery time.  Tries the "Delivery-date"
    header first, then the "From " line; if neither exists or can
    be parsed, leaves the mtime alone.
"""

import sys, os, re
import socket, errno
from time import time, mktime, strptime, ctime, sleep
from rfc822 import Message, parsedate_tz, mktime_tz



class Error (Exception):
    """Fatal problem that should end the program with an error message.

    main() catches this and hands str(err) to sys.exit().
    """


def warn (msg):
    """Write msg to stderr on one line, prefixed with "warning: "."""
    line = "warning: %s\n" % msg
    sys.stderr.write(line)


def maildir_open (maildir):
    # Assumes we're already chdir'd into maildir

    hostname = socket.gethostname()
    pid = os.getpid()
    num_tries = 0
    max_tries = 5
    while 1:
        name = "tmp/%.6f%05d.%s" % (time(), pid, hostname)
        ok = 0                          # assume the worst
        num_tries += 1
        try:
            os.stat(name)
        except OSError, err:
            # Good: file called 'name' doesn't already exist.
            if err.errno == errno.ENOENT:
                ok = 1

        if ok:
            break
        else:
            if num_tries > max_tries:
                raise Error("error: could not create temporary file in %s/tmp"
                            % maildir)
            sleep(2)               # and try again

    fd = os.open(name, os.O_WRONLY|os.O_EXCL|os.O_CREAT, 0600)
    return (name, fd)


def grok_status (msg):
    """Map the message's "Status" header to a maildir location.

    Returns a (dir, info) tuple: 'dir' is the maildir subdirectory
    ("cur" or "new") and 'info' is the suffix to append to the filename.
    """
    placement = {
        "O": ("cur", ""),               # seen by MUA, but not read by user
        "RO": ("cur", ":2,S"),          # read by user
    }
    # Header not there, empty, or unknown value: treat as a new message.
    return placement.get(msg.get("Status"), ("new", ""))

def get_delivery_time (msg):
    """Return the delivery time of msg in seconds since the epoch.

    Tries the "Delivery-date" header first; if it is missing or cannot
    be parsed, falls back to the date at the end of the "From " line
    (msg.unixfrom).  Returns None if neither yields a usable time, in
    which case a warning is written for each value that was present but
    unparseable.
    """
    if msg.has_key("Delivery-date"):
        # eg. "Thu, 12 Jul 2001 08:47:20 -0400" to 994942040 (seconds
        # since epoch in UTC)
        parsed = parsedate_tz(msg["Delivery-date"])
        if parsed is not None:
            return mktime_tz(parsed)
        # parsedate_tz() returns None for a date it can't handle, and
        # mktime_tz(None) would raise TypeError -- so warn and fall
        # through to the "From " line instead.
        warn("could not parse Delivery-date header: %s"
             % msg["Delivery-date"])

    if not msg.unixfrom:
        return None

    # Parse eg.
    #   "From python-dev-admin@python.org Thu Jul 12 08:47:20 2001"
    # -- this is the "From " line format used by Exim; hopefully other
    # MTAs do the same!
    m = re.match(r'^From (\S+) (\w{3} \w{3}\s+\d\d? \d\d:\d\d:\d\d \d{4})$',
                 msg.unixfrom)
    if not m:
        warn("could not parse \"From \" line: %s" % msg.unixfrom)
        return None

    dtime_str = m.group(2)
    # Eg. "Thu Jul 12 08:47:20 2001" -> 994945640 -- note that
    # this might be different from what we get parsing the same
    # date string above, because this one doesn't include the
    # timezone.  Sigh.
    try:
        dtime = mktime(strptime(dtime_str, "%c"))
    except ValueError:
        # The regex only checks the shape of the date; strptime()
        # rejects things like a bogus day or month name.
        warn("could not parse date in \"From \" line: %s" % dtime_str)
        return None

    # Attempt to detect and correct for DST differences.
    # (This works if we parsed a summer time during the winter;
    # what about the inverse?)
    dtime_str_curtz = ctime(dtime)
    if dtime_str_curtz != dtime_str:
        dtime_curtz = mktime(strptime(dtime_str_curtz, "%c"))
        diff = dtime_curtz - dtime
        dtime -= diff

    return dtime

def write_message (msg, msg_file, out_fd):
    # Write the headers to the temp file.
    headers = str(msg) + "\n"
    n = os.write(out_fd, headers)
    if n != len(headers):
        raise Error("failed to write headers (%d/%d bytes written)"
                    % (n, len(headers)))
    
    # Copy the body from msg_file to the temp file.
    chunk = 16*1024
    while 1:
        data = msg_file.read(chunk)
        if not data:
            break
        n = os.write(out_fd, data)
        if n != len(data):
            raise Error("failed to write chunk of body (%d/%d bytes written)"
                        % (n, len(data)))

    # Sync and close the temp file.
    try:
        os.fsync(out_fd)
        os.close(out_fd)
    except OSError, err:
        os.unlink(tmp_name)
        raise Error("unable to fsync() or close() temp file: %s" % err)


def finish_message (tmp_name, dir, info, dtime):
    """Hard-link the temp file into its final place and return that name.

    The destination is 'dir' (either "new" or "cur") joined with the
    base name of 'tmp_name' plus the 'info' suffix.  If 'dtime' is not
    None, the destination's modification time is set to it while the
    access time is kept as it was.  The temp name itself is left alone;
    removing it is up to the caller.
    """
    final_name = os.path.join(dir, os.path.basename(tmp_name) + info)
    os.link(tmp_name, final_name)

    # Delivery time unknown: leave the mtime as the filesystem set it.
    if dtime is None:
        return final_name

    last_access = os.stat(final_name).st_atime
    os.utime(final_name, (last_access, dtime))
    return final_name

def add (msg_file, maildir):
    # First reserve a place in the maildir (ie. open the file in tmp).
    start_dir = os.getcwd()
    os.chdir(maildir)
    (tmp_name, out_fd) = maildir_open(maildir)

    try:
        msg = Message(msg_file)
        (dir, info) = grok_status(msg)
        dtime = get_delivery_time(msg)
        write_message(msg, msg_file, out_fd)
        dst_name = finish_message(tmp_name, dir, info, dtime)
    finally:
        os.unlink(tmp_name)
        os.chdir(start_dir)

    print dst_name

# add ()


def main ():
    """Command-line entry point.

    Usage:
        addtomaildir maildir             (message read from stdin)
        addtomaildir msg_file maildir    (message read from msg_file)

    Exits with an error message if the arguments are wrong, if
    'maildir' lacks any of the tmp/cur/new subdirectories, or if
    adding the message raises Error.
    """
    prog = os.path.basename(sys.argv[0])
    args = sys.argv[1:]
    if len(args) == 1:
        maildir = args[0]
        msg_file = sys.stdin
    elif len(args) == 2:
        (msg_filename, maildir) = args
        msg_file = open(msg_filename)
    else:
        # Fill in the program name; without the "% (prog, prog)" the
        # user sees a literal "%s" in the usage text.
        sys.exit("usage: %s maildir\n"
                 "       %s msg_file maildir\n"
                 "\n"
                 "error: incorrect number of arguments\n"
                 % (prog, prog))

    if not (os.path.isdir(maildir) and
            os.path.isdir(os.path.join(maildir, "tmp")) and
            os.path.isdir(os.path.join(maildir, "cur")) and
            os.path.isdir(os.path.join(maildir, "new"))):
        sys.exit("error: not a maildir: %s" % maildir)

    try:
        add(msg_file, maildir)
    except Error, err:
        sys.exit(str(err))


# Only run when executed as a script, so the module can also be
# imported without side effects.
if __name__ == "__main__":
    main()

---------------------- multipart/mixed attachment--