Efficient scanning of mbox files

Mon Nov 11 16:40:05 EST 2002

Martin Franklin <mfranklin1 at gatwick.westerngeco.slb.com> writes:

> I ran the above example on my Python folder (7000+ messages...)
> it took 12 seconds to process.  Then I changed the 
> if FROM_RE.match(line):
>
> to
>
> if line.startswith("From "):
>
>
> And got a 2 second speed up....  
>
> Then I slurped the file into a cStringIO.StringIO object and got it down
> to 5 seconds.....

OK, I did some tests, based on your suggestions, on a 90MB file with
30,000 messages in (the full python-dev list archive).

Reading the file line by line, as I started with: 61 seconds
Read the whole lot into memory: 30 seconds

This is a significant speedup, and well worth having. The problem
is memory, but I suddenly thought of the mmap module. Using the
in-memory code on a mmapped file took 28 seconds, not enough of an
improvement in time to be significant, but presumably the memory usage
is much better. (Does anyone know a way of doing memory usage analysis
of Python code? I'm guessing that I'd have to do separate runs to see
the effects - running multiple tests in a loop ain't going to work)

I can't do tests on this big sample including the cost of creating
email.Message objects, as I got an error (which may be a malformed
message, or a problem in the code - no time to check right now). But
on a smaller test case, using email at least doubled the time. Maybe
I'll 

FWIW, here's my test harness:

import email, email.Parser
import re

# Message separator: a line that begins with "from " (any letter case) and
# ends in four digits (a year).  MULTILINE lets the same pattern locate such
# lines anywhere inside a larger buffer.
FROM_RE = re.compile(r"^from .*\d\d\d\d$", re.IGNORECASE | re.MULTILINE)

# Shared parser instance; the "headers" test methods call P.parsestr(text, 1)
# on it instead of email.message_from_string().
P = email.Parser.Parser()

class Tester:
    """Benchmark three ways of locating message boundaries in an mbox file.

    Strategies: read line by line (test1*), read the whole file into memory
    (test2*), and mmap the file (test3*).  The "e" variants additionally build
    a full email object per message; the "h" variants use the shared parser P
    with parsestr(text, 1).

    Each test method stores its result in self.groups[id] as
    (file, message_count, offsets), where offsets holds the start of every
    message followed by the total size of the file.

    NOTE: the docstrings of the test* methods are printed by the driver
    script, so they are kept verbatim.  Helper names deliberately do not start
    with "test" because the driver discovers benchmarks by that prefix.
    """

    def __init__(self):
        self.ids = {}
        self.groups = {}

    def _headers_only(self, text):
        """Parse text with the shared parser, passing 1 as the second argument."""
        return P.parsestr(text, 1)

    def _scan_buffer(self, data, parse=None):
        """Find every separator in an in-memory buffer.

        data may be a string or an mmap object.  If parse is given it is
        called once per message with that message's slice of data.
        Returns (message_count, offsets).
        """
        posns = [match.start() for match in FROM_RE.finditer(data)]
        n = len(posns)
        posns.append(len(data))
        if parse is not None:
            for i in range(n):
                parse(data[posns[i]:posns[i+1]])
        return n, posns

    def _scan_lines(self, id, file, parse=None):
        """Line-by-line scan; optionally hand each message's text to parse.

        Only text belonging to a message (from its separator line onwards) is
        collected and parsed, so exactly message_count parses happen -- the
        same amount of work as the slurp and mmap variants.
        """
        posns = []
        oldpos = 0
        n = 0
        lines = []
        fp = open(file, "rb")
        try:
            while 1:
                line = fp.readline()
                if not line:
                    break
                if FROM_RE.match(line):
                    # A new message starts here: flush the previous one.
                    if parse is not None and n:
                        parse("".join(lines))
                    n += 1
                    posns.append(oldpos)
                    lines = []
                oldpos = fp.tell()
                # Skip anything before the first separator; collect pieces in
                # a list and join once instead of repeated string +=.
                if parse is not None and n:
                    lines.append(line)
        finally:
            fp.close()
        if parse is not None and n:
            parse("".join(lines))
        posns.append(oldpos)
        self.groups[id] = (file, n, posns)

    def _scan_slurp(self, id, file, parse=None):
        """Read the whole file into memory, then scan it with the RE."""
        fp = open(file, "rb")
        try:
            data = fp.read()
        finally:
            fp.close()
        n, posns = self._scan_buffer(data, parse)
        self.groups[id] = (file, n, posns)

    def _scan_mmap(self, id, file, parse=None):
        """Memory-map the file, then scan the mapping with the RE."""
        import os, mmap
        # O_BINARY only exists on Windows; fall back to 0 elsewhere.  The
        # descriptor stays read-write because mmap.mmap(fd, 0) asks for a
        # writable mapping by default.
        fd = os.open(file, os.O_RDWR | getattr(os, "O_BINARY", 0))
        try:
            data = mmap.mmap(fd, 0)
            try:
                n, posns = self._scan_buffer(data, parse)
            finally:
                data.close()
        finally:
            os.close(fd)
        self.groups[id] = (file, n, posns)

    def test1(self, id, file):
        "Simple read through file with RE match"
        self._scan_lines(id, file)

    def test2(self, id, file):
        "Slurp and scan with RE"
        self._scan_slurp(id, file)

    def test3(self, id, file):
        "Mmap and scan with RE"
        self._scan_mmap(id, file)

    def test1e(self, id, file):
        "Simple read through file with RE match (create email object)"
        self._scan_lines(id, file, email.message_from_string)

    def test2e(self, id, file):
        "Slurp and scan with RE (create email object)"
        self._scan_slurp(id, file, email.message_from_string)

    def test3e(self, id, file):
        "Mmap and scan with RE (create email object)"
        self._scan_mmap(id, file, email.message_from_string)

    def test1h(self, id, file):
        "Simple read through file with RE match (create email object - headers)"
        self._scan_lines(id, file, self._headers_only)

    def test2h(self, id, file):
        "Slurp and scan with RE (create email object - headers)"
        self._scan_slurp(id, file, self._headers_only)

    def test3h(self, id, file):
        "Mmap and scan with RE (create email object - headers)"
        self._scan_mmap(id, file, self._headers_only)

test = Tester()

methods = [name for name in dir(test) if name.startswith("test")]

from time import clock

for name in methods:
    meth = getattr(test, name)
    start = clock()
    meth(name, "2002-April.txt")
    # meth(name, "python-dev.mbox")
    end = clock()
    print meth.__doc__
    print "Elapsed time for", test.groups[name][1], "messages =", end-start

# Test we didn't mess up the file locations...
posns = test.groups[methods[0]][2]
for name in methods[1:]:
    others = test.groups[name][2]
    if others != posns:
        print name, "differs"

And here are the results on a 9M mailbox (python-list for April 2002):

>mbox.py
Simple read through file with RE match
Elapsed time for 5970 messages = 6.39456009226
Simple read through file with RE match (create email object)
Elapsed time for 5970 messages = 12.6123382686
Simple read through file with RE match (create email object - headers)
Elapsed time for 5970 messages = 10.0656287138
Slurp and scan with RE
Elapsed time for 5970 messages = 2.32596033128
Slurp and scan with RE (create email object)
Elapsed time for 5970 messages = 7.06059008601
Slurp and scan with RE (create email object - headers)
Elapsed time for 5970 messages = 4.6319295799
Mmap and scan with RE
Elapsed time for 5970 messages = 2.12352013356
Mmap and scan with RE (create email object)
Elapsed time for 5970 messages = 6.85222707014
Mmap and scan with RE (create email object - headers)
Elapsed time for 5970 messages = 4.43201372465

I've got some pretty good speedups here. Mmap and scan getting email
headers is faster than a line-by-line scan with no parsing :-)

Thanks for the help.
Paul.

-- 
This signature intentionally left blank