Efficient scanning of mbox files
Paul Moore
gustav at morpheus.demon.co.uk
Mon Nov 11 16:40:05 EST 2002
Martin Franklin <mfranklin1 at gatwick.westerngeco.slb.com> writes:
> I ran the above example on my Python folder (7000+ messages...)
> it took 12 seconds to process. Then I changed the
> if FROM_RE.match(line):
>
> to
>
> if line.startswith("From "):
>
>
> And got a 2 second speed up....
>
> Then I slurped the file into a cStringIO.StringIO object and got it down
> to 5 seconds.....
OK, I did some tests, based on your suggestions, on a 90MB file with
30,000 messages in (the full python-dev list archive).
Reading the file line by line, as I started with: 61 seconds
Read the whole lot into memory: 30 seconds
This is a significant speedup, and well worth the savings. The problem
is memory, but I suddenly thought of the mmap module. Using the
in-memory code on a mmapped file took 28 seconds, not enough of an
improvement in time to be significant, but presumably the memory usage
is much better. (Does anyone know a way of doing memory usage analysis
of Python code? I'm guessing that I'd have to do separate runs to see
the effects - running multiple tests in a loop ain't going to work)
I can't do tests on this big sample including the cost of creating
email.Message objects, as I got an error (which may be a malformed
message, or a problem in the code - no time to check right now). But
on a smaller test case, using email at least doubled the time. Maybe
I'll
FWIW, here's my test harness:
import email, email.Parser
import re
# Matches an mbox separator line: begins with "from " (any letter case) and
# ends in four digits, taken to be the year.  MULTILINE makes ^ and $ anchor
# at every line boundary, so the same pattern works on a single line and
# when scanning the contents of a whole file at once.
FROM_RE = re.compile(r"^from .*\d\d\d\d$", re.IGNORECASE | re.MULTILINE)
# Parser instance shared by the test*h methods, which call
# P.parsestr(text, 1) on it (their docstrings describe this as "headers").
P = email.Parser.Parser()
class Tester:
    """Benchmark harness comparing ways of locating messages in an mbox file.

    Method naming scheme:
      test1*  read the file line by line, matching FROM_RE on each line
      test2*  read the whole file into memory, then scan it with FROM_RE
      test3*  mmap the file, then scan the mapping with FROM_RE
    Suffix "e" additionally builds an email object for every message via
    email.message_from_string(); suffix "h" does so via P.parsestr(text, 1).

    Every test stores (file, n, posns) in self.groups[id], where n is the
    number of separator lines found and posns holds the file offset of each
    separator followed by the total size of the file.

    The method docstrings are printed by the driver script, so they are
    deliberately short one-liners; details live in comments instead.
    The loops are intentionally repeated in each method rather than shared,
    so that every timing measures one self-contained strategy.
    """

    def __init__(self):
        self.ids = {}
        self.groups = {}

    def test1(self, id, file):
        "Simple read through file with RE match"
        fp = open(file, "rb")
        try:
            posns = []
            oldpos = 0      # offset at which the current line starts
            n = 0
            while 1:
                line = fp.readline()
                if not line:
                    break
                if FROM_RE.match(line):
                    n += 1
                    posns.append(oldpos)
                oldpos = fp.tell()
        finally:
            # Close the file even if reading or matching fails.
            fp.close()
        posns.append(oldpos)    # end-of-file offset
        self.groups[id] = (file, n, posns)

    def test2(self, id, file):
        "Slurp and scan with RE"
        fp = open(file, "rb")
        try:
            data = fp.read()
        finally:
            fp.close()
        posns = [match.start() for match in FROM_RE.finditer(data)]
        n = len(posns)
        posns.append(len(data))
        self.groups[id] = (file, n, posns)

    def test3(self, id, file):
        "Mmap and scan with RE"
        import os, mmap
        # os.O_BINARY only exists on Windows; fall back to 0 elsewhere.
        fd = os.open(file, os.O_RDWR | getattr(os, "O_BINARY", 0))
        try:
            data = mmap.mmap(fd, 0)
            try:
                posns = [match.start() for match in FROM_RE.finditer(data)]
                n = len(posns)
                posns.append(len(data))
            finally:
                data.close()
        finally:
            os.close(fd)
        self.groups[id] = (file, n, posns)

    def test1e(self, id, file):
        "Simple read through file with RE match (create email object)"
        fp = open(file, "rb")
        try:
            posns = []
            oldpos = 0
            n = 0
            text = ""       # lines of the message collected so far
            while 1:
                line = fp.readline()
                if not line:
                    break
                if FROM_RE.match(line):
                    # Nothing has been collected when the very first
                    # separator is seen; skip the parse so this test does
                    # the same number of parses as test2e/test3e.
                    if text:
                        email.message_from_string(text)
                    n += 1
                    posns.append(oldpos)
                    text = ""
                oldpos = fp.tell()
                text += line
        finally:
            fp.close()
        if text:
            # Parse the final message; the result is discarded because only
            # the cost of parsing is being measured.
            email.message_from_string(text)
        posns.append(oldpos)
        self.groups[id] = (file, n, posns)

    def test2e(self, id, file):
        "Slurp and scan with RE (create email object)"
        fp = open(file, "rb")
        try:
            data = fp.read()
        finally:
            fp.close()
        posns = [match.start() for match in FROM_RE.finditer(data)]
        n = len(posns)
        posns.append(len(data))
        # Consecutive offsets delimit one message each.
        for start, end in zip(posns, posns[1:]):
            email.message_from_string(data[start:end])
        self.groups[id] = (file, n, posns)

    def test3e(self, id, file):
        "Mmap and scan with RE (create email object)"
        import os, mmap
        # os.O_BINARY only exists on Windows; fall back to 0 elsewhere.
        fd = os.open(file, os.O_RDWR | getattr(os, "O_BINARY", 0))
        try:
            data = mmap.mmap(fd, 0)
            try:
                posns = [match.start() for match in FROM_RE.finditer(data)]
                n = len(posns)
                posns.append(len(data))
                # Consecutive offsets delimit one message each.
                for start, end in zip(posns, posns[1:]):
                    email.message_from_string(data[start:end])
            finally:
                data.close()
        finally:
            os.close(fd)
        self.groups[id] = (file, n, posns)

    def test1h(self, id, file):
        "Simple read through file with RE match (create email object - headers)"
        fp = open(file, "rb")
        try:
            posns = []
            oldpos = 0
            n = 0
            text = ""       # lines of the message collected so far
            while 1:
                line = fp.readline()
                if not line:
                    break
                if FROM_RE.match(line):
                    # Skip the empty parse at the first separator so this
                    # test does the same number of parses as test2h/test3h.
                    if text:
                        P.parsestr(text, 1)
                    n += 1
                    posns.append(oldpos)
                    text = ""
                oldpos = fp.tell()
                text += line
        finally:
            fp.close()
        if text:
            P.parsestr(text, 1)
        posns.append(oldpos)
        self.groups[id] = (file, n, posns)

    def test2h(self, id, file):
        "Slurp and scan with RE (create email object - headers)"
        fp = open(file, "rb")
        try:
            data = fp.read()
        finally:
            fp.close()
        posns = [match.start() for match in FROM_RE.finditer(data)]
        n = len(posns)
        posns.append(len(data))
        # Consecutive offsets delimit one message each.
        for start, end in zip(posns, posns[1:]):
            P.parsestr(data[start:end], 1)
        self.groups[id] = (file, n, posns)

    def test3h(self, id, file):
        "Mmap and scan with RE (create email object - headers)"
        import os, mmap
        # os.O_BINARY only exists on Windows; fall back to 0 elsewhere.
        fd = os.open(file, os.O_RDWR | getattr(os, "O_BINARY", 0))
        try:
            data = mmap.mmap(fd, 0)
            try:
                posns = [match.start() for match in FROM_RE.finditer(data)]
                n = len(posns)
                posns.append(len(data))
                # Consecutive offsets delimit one message each.
                for start, end in zip(posns, posns[1:]):
                    P.parsestr(data[start:end], 1)
            finally:
                data.close()
        finally:
            os.close(fd)
        self.groups[id] = (file, n, posns)
test = Tester()
methods = [name for name in dir(test) if name.startswith("test")]
from time import clock
for name in methods:
meth = getattr(test, name)
start = clock()
meth(name, "2002-April.txt")
# meth(name, "python-dev.mbox")
end = clock()
print meth.__doc__
print "Elapsed time for", test.groups[name][1], "messages =", end-start
# Test we didn't mess up the file locations...
posns = test.groups[methods[0]][2]
for name in methods[1:]:
others = test.groups[name][2]
if others != posns:
print name, "differs"
And here are the results on a 9M mailbox (python-list for April 2002):
>mbox.py
Simple read through file with RE match
Elapsed time for 5970 messages = 6.39456009226
Simple read through file with RE match (create email object)
Elapsed time for 5970 messages = 12.6123382686
Simple read through file with RE match (create email object - headers)
Elapsed time for 5970 messages = 10.0656287138
Slurp and scan with RE
Elapsed time for 5970 messages = 2.32596033128
Slurp and scan with RE (create email object)
Elapsed time for 5970 messages = 7.06059008601
Slurp and scan with RE (create email object - headers)
Elapsed time for 5970 messages = 4.6319295799
Mmap and scan with RE
Elapsed time for 5970 messages = 2.12352013356
Mmap and scan with RE (create email object)
Elapsed time for 5970 messages = 6.85222707014
Mmap and scan with RE (create email object - headers)
Elapsed time for 5970 messages = 4.43201372465
I've got some pretty good speedups here. Mmap and scan getting email
headers is faster than a line-by-line scan with no parsing :-)
Thanks for the help.
Paul.
--
This signature intentionally left blank
More information about the Python-list mailing list