[Mailman-Developers] re: Much cpu/memory load

Barry A. Warsaw barry@python.org
Mon Nov 4 17:36:17 2002


Here's a patch that eliminates the MailList object from the article
index pickle.  It also undoes the incomplete hack in MailList for not
pickling the lock object (that should have been a major clue).

I've done some moderate testing of this patch, and I think it's safe,
and probably fixes the majority of the problem.  Please give it a
shot.  It doesn't optimize the loading of the template files -- that
comes next.  But I'll still bet you find a huge speed up.

One outstanding issue: Depending on when you upgraded to 2.1b4 and
when/if you regen'd your archives from scratch, your existing article
indexes may have pickled MailList instances in them.  These should get
cleaned up the next time you archive an article, but only for the
month (or other division) that the article is being added to.  To
clean up your entire archive, you'll need to zap the existing one and
re-run bin/arch.

If that's not acceptable to people, I'll write a small repair script
for the existing *-article index files.

Oh, and you'll find that not including the MailList object in your
article indices can reduce their size by 10x or more.  Nice little
added benefit, eh?

The braver of you can give this a shot, and after a little more
testing, I'll check this into cvs.

-Barry

-------------------- snip snip --------------------
Index: Mailman/MailList.py
===================================================================
RCS file: /cvsroot/mailman/mailman/Mailman/MailList.py,v
retrieving revision 2.94
diff -u -r2.94 MailList.py
--- Mailman/MailList.py	25 Oct 2002 21:10:45 -0000	2.94
+++ Mailman/MailList.py	4 Nov 2002 17:30:22 -0000
@@ -113,23 +113,6 @@
                 if func:
                     func(self)
 
-    # Never pickle our lock object!  We need both of these because MailLists
-    # get pickled by the archiver and this can cause ArchRunner to unlock
-    # lists at the wrong time.  The real fix may be to not pickle MailList
-    # objects in the archiver, but that's too much work to verify at the
-    # moment.
-    LOCKATTR = '_MailList__lock'
-    
-    def __getstate__(self):
-        d = self.__dict__.copy()
-        del d[self.LOCKATTR]
-        return d
-
-    def __setstate__(self, d):
-        if d.has_key(self.LOCKATTR):
-            del d[self.LOCKATTR]
-        self.__dict__ = d
-
     def __getattr__(self, name):
         # Because we're using delegation, we want to be sure that attribute
         # access to a delegated member function gets passed to the
Index: Mailman/Archiver/HyperArch.py
===================================================================
RCS file: /cvsroot/mailman/mailman/Mailman/Archiver/HyperArch.py,v
retrieving revision 2.22
diff -u -r2.22 HyperArch.py
--- Mailman/Archiver/HyperArch.py	19 Oct 2002 20:59:27 -0000	2.22
+++ Mailman/Archiver/HyperArch.py	4 Nov 2002 17:30:23 -0000
@@ -33,11 +33,14 @@
 import types
 import HyperDatabase
 import pipermail
+import weakref
 
 from Mailman import mm_cfg
 from Mailman import Utils
 from Mailman import LockFile
+from Mailman import MailList
 from Mailman import EncWord
+from Mailman import Errors
 from Mailman import i18n
 from Mailman.Logging.Syslog import syslog
 from Mailman.Mailbox import ArchiverMailbox
@@ -234,13 +237,45 @@
         if self.charset and self.charset in mm_cfg.VERBATIM_ENCODING:
             self.quote = unicode_quote
 
+    # Mapping of listnames to MailList instances as a weak value dictionary.
+    _listcache = weakref.WeakValueDictionary()
+
+    def _open_list(self, listname):
+        # Cache the open list so that any use of the list within this process
+        # uses the same object.  We use a WeakValueDictionary so that when the
+        # list is no longer necessary, its memory is freed.
+        mlist = self._listcache.get(listname)
+        if not mlist:
+            try:
+                mlist = MailList.MailList(listname, lock=0)
+            except Errors.MMListError, e:
+                syslog('error', 'error opening list: %s\n%s', listname, e)
+                return None
+            else:
+                self._listcache[listname] = mlist
+        return mlist
+
+    def __getstate__(self):
+        d = self.__dict__.copy()
+        # We definitely don't want to pickle the MailList instance, so just
+        # pickle a reference to it.
+        del d['_mlist']
+        d['__listname'] = self._mlist.internal_name()
+        # Delete a few other things we don't want in the pickle
+        for attr in ('prev', 'next', 'body'):
+            del d[attr]
+        d['body'] = []
+        return d
+
     def __setstate__(self, d):
         # For loading older Articles via pickle.  All this stuff was added
         # when Simone Piunni and Tokio Kikuchi i18n'ified Pipermail.  See SF
         # patch #594771.
         self.__dict__ = d
-        if not d.has_key('_mlist'):
-            self._mlist = None
+        listname = d.get('__listname')
+        if listname:
+            del d['__listname']
+            d['_mlist'] = self._open_list(listname)
         if not d.has_key('_lang'):
             if self._mlist is None:
                 self._lang = mm_cfg.DEFAULT_SERVER_LANGUAGE
@@ -471,18 +506,6 @@
             if line.strip() == '<!--endarticle-->':
                 break
             self.body.append(line)
-
-    def __getstate__(self):
-        d={}
-        for each in self.__dict__.keys():
-            if each == "quote":
-                continue
-            if each in ['maillist','prev','next','body']:
-                d[each] = None
-            else:
-                d[each] = self.__dict__[each]
-        d['body']=[]
-        return d
 
 
 
Index: Mailman/Archiver/HyperDatabase.py
===================================================================
RCS file: /cvsroot/mailman/mailman/Mailman/Archiver/HyperDatabase.py,v
retrieving revision 2.4
diff -u -r2.4 HyperDatabase.py
--- Mailman/Archiver/HyperDatabase.py	16 Mar 2002 06:57:36 -0000	2.4
+++ Mailman/Archiver/HyperDatabase.py	4 Nov 2002 17:30:23 -0000
@@ -19,8 +19,6 @@
 #
 import os
 import marshal
-import string
-import sys
 import time
 import errno
 
@@ -51,11 +49,11 @@
     objects.  The object itself is stored using marshal.  It would be
     much simpler, and probably faster, to store the actual objects in
     the DumbBTree and pickle it.
-    
+
     TBD: Also needs a more sensible name, like IteratableDictionary or
     SortedDictionary.
     """
-    
+
     def __init__(self, path):
         self.current_index = 0
         self.path = path
@@ -65,7 +63,7 @@
         self.dict = {}
         self.sorted = []
         self.load()
-        
+
     def __repr__(self):
         return "DumbBTree(%s)" % self.path
 
@@ -74,7 +72,7 @@
             self.sorted = self.dict.keys()
             self.sorted.sort()
             self.__dirty = 0
-        
+
     def lock(self):
         self.lockfile.lock()
 
@@ -83,28 +81,28 @@
             self.lockfile.unlock()
         except LockFile.NotLockedError:
             pass
-        
+
     def __delitem__(self, item):
         # if first hasn't been called, we can skip the sort
         if self.current_index == 0:
             del self.dict[item]
             self.__dirty = 1
             return
-	try:
-	    ci = self.sorted[self.current_index]
-	except IndexError:
-	    ci = None
-	if ci == item:
-	    try:
-		ci = self.sorted[self.current_index + 1]
-	    except IndexError:
-		ci = None
-	del self.dict[item]
+        try:
+            ci = self.sorted[self.current_index]
+        except IndexError:
+            ci = None
+        if ci == item:
+            try:
+                ci = self.sorted[self.current_index + 1]
+            except IndexError:
+                ci = None
+        del self.dict[item]
         self.__sort(dirty=1)
-	if ci is not None:
-	    self.current_index = self.sorted.index(ci)
-	else:
-	    self.current_index = self.current_index + 1
+        if ci is not None:
+            self.current_index = self.sorted.index(ci)
+        else:
+            self.current_index = self.current_index + 1
 
     def clear(self):
         # bulk clearing much faster than deleting each item, esp. with the
@@ -118,22 +116,22 @@
         else:
             key = self.sorted[0]
             self.current_index = 1
-	    return key, self.dict[key]
+            return key, self.dict[key]
 
     def last(self):
         if not self.sorted:
             raise KeyError
         else:
             key = self.sorted[-1]
-	    self.current_index = len(self.sorted) - 1
+            self.current_index = len(self.sorted) - 1
             return key, self.dict[key]
-	
+
     def next(self):
         try:
             key = self.sorted[self.current_index]
         except IndexError:
             raise KeyError
-	self.current_index = self.current_index + 1
+        self.current_index = self.current_index + 1
         return key, self.dict[key]
 
     def has_key(self, key):
@@ -154,10 +152,10 @@
             self.dict[item] = val
             self.__dirty = 1
             return
-	try:
-	    current_item = self.sorted[self.current_index]
-	except IndexError:
-	    current_item = item
+        try:
+            current_item = self.sorted[self.current_index]
+        except IndexError:
+            current_item = item
         self.dict[item] = val
         self.__sort(dirty=1)
         self.current_index = self.sorted.index(current_item)
@@ -198,48 +196,48 @@
 #
 class HyperDatabase(pipermail.Database):
     __super_addArticle = pipermail.Database.addArticle
-    
+
     def __init__(self, basedir):
         self.__cache = {}
-	self.__currentOpenArchive = None   # The currently open indices
-	self.basedir = os.path.expanduser(basedir)
+        self.__currentOpenArchive = None   # The currently open indices
+        self.basedir = os.path.expanduser(basedir)
         # Recently added articles, indexed only by message ID
         self.changed={}
 
     def firstdate(self, archive):
-	self.__openIndices(archive)
-	date = 'None'
-	try:
-	    datekey, msgid = self.dateIndex.first()
-	    date = time.asctime(time.localtime(string.atof(datekey[0])))
-	except KeyError:
+        self.__openIndices(archive)
+        date = 'None'
+        try:
+            datekey, msgid = self.dateIndex.first()
+            date = time.asctime(time.localtime(float(datekey[0])))
+        except KeyError:
             pass
-	return date
+        return date
 
     def lastdate(self, archive):
-	self.__openIndices(archive)
-	date = 'None'
-	try:
-	    datekey, msgid = self.dateIndex.last()
-	    date = time.asctime(time.localtime(string.atof(datekey[0])))
-	except KeyError:
+        self.__openIndices(archive)
+        date = 'None'
+        try:
+            datekey, msgid = self.dateIndex.last()
+            date = time.asctime(time.localtime(float(datekey[0])))
+        except KeyError:
             pass
-	return date
+        return date
 
     def numArticles(self, archive):
-	self.__openIndices(archive)
-	return len(self.dateIndex)    
+        self.__openIndices(archive)
+        return len(self.dateIndex)
 
     def addArticle(self, archive, article, subject=None, author=None,
                    date=None):
-	self.__openIndices(archive)
+        self.__openIndices(archive)
         self.__super_addArticle(archive, article, subject, author, date)
 
     def __openIndices(self, archive):
-	if self.__currentOpenArchive == archive:
+        if self.__currentOpenArchive == archive:
             return
-	self.__closeIndices()
-	arcdir = os.path.join(self.basedir, 'database')
+        self.__closeIndices()
+        arcdir = os.path.join(self.basedir, 'database')
         omask = os.umask(0)
         try:
             try:
@@ -248,38 +246,38 @@
                 if e.errno <> errno.EEXIST: raise
         finally:
             os.umask(omask)
-	for i in ('date', 'author', 'subject', 'article', 'thread'):
-	    t = DumbBTree(os.path.join(arcdir, archive + '-' + i)) 
-	    setattr(self, i + 'Index', t)
-	self.__currentOpenArchive = archive
+        for i in ('date', 'author', 'subject', 'article', 'thread'):
+            t = DumbBTree(os.path.join(arcdir, archive + '-' + i))
+            setattr(self, i + 'Index', t)
+        self.__currentOpenArchive = archive
 
     def __closeIndices(self):
-	for i in ('date', 'author', 'subject', 'thread', 'article'):
-	    attr = i + 'Index'
-	    if hasattr(self, attr): 
-		index = getattr(self, attr) 
-		if i == 'article': 
-	            if not hasattr(self, 'archive_length'):
+        for i in ('date', 'author', 'subject', 'thread', 'article'):
+            attr = i + 'Index'
+            if hasattr(self, attr):
+                index = getattr(self, attr)
+                if i == 'article':
+                    if not hasattr(self, 'archive_length'):
                         self.archive_length = {}
                     l = len(index)
                     self.archive_length[self.__currentOpenArchive] = l
-		index.close() 
-		delattr(self, attr)
-	self.__currentOpenArchive = None
-        
+                index.close()
+                delattr(self, attr)
+        self.__currentOpenArchive = None
+
     def close(self):
-	self.__closeIndices()
-        
-    def hasArticle(self, archive, msgid): 
-	self.__openIndices(archive)
-	return self.articleIndex.has_key(msgid)
-    
+        self.__closeIndices()
+
+    def hasArticle(self, archive, msgid):
+        self.__openIndices(archive)
+        return self.articleIndex.has_key(msgid)
+
     def setThreadKey(self, archive, key, msgid):
-	self.__openIndices(archive)
-	self.threadIndex[key]=msgid
-        
+        self.__openIndices(archive)
+        self.threadIndex[key]=msgid
+
     def getArticle(self, archive, msgid):
-	self.__openIndices(archive)
+        self.__openIndices(archive)
         if not self.__cache.has_key(msgid):
             # get the pickled object out of the DumbBTree
             buf = self.articleIndex[msgid]
@@ -288,50 +286,50 @@
             article = self.__cache[msgid]
         return article
 
-    def first(self, archive, index): 
-	self.__openIndices(archive)
-	index = getattr(self, index + 'Index')
-	try: 
-	    key, msgid = index.first()
-	    return msgid
-	except KeyError:
+    def first(self, archive, index):
+        self.__openIndices(archive)
+        index = getattr(self, index + 'Index')
+        try:
+            key, msgid = index.first()
+            return msgid
+        except KeyError:
             return None
-        
-    def next(self, archive, index): 
-	self.__openIndices(archive)
-	index = getattr(self, index + 'Index')
-	try: 
-	    key, msgid = index.next()
-	    return msgid
-	except KeyError:
+
+    def next(self, archive, index):
+        self.__openIndices(archive)
+        index = getattr(self, index + 'Index')
+        try:
+            key, msgid = index.next()
+            return msgid
+        except KeyError:
             return None
-	
+
     def getOldestArticle(self, archive, subject):
-	self.__openIndices(archive)
-	subject=string.lower(subject)
-	try: 
-	    key, tempid=self.subjectIndex.set_location(subject)
-	    self.subjectIndex.next()	
-	    [subject2, date]= string.split(key, '\0')
-	    if subject!=subject2: return None
-	    return tempid
-	except KeyError: 
-	    return None
+        self.__openIndices(archive)
+        subject = subject.lower()
+        try:
+            key, tempid=self.subjectIndex.set_location(subject)
+            self.subjectIndex.next()
+            [subject2, date]= key.split('\0')
+            if subject!=subject2: return None
+            return tempid
+        except KeyError:
+            return None
 
     def newArchive(self, archive):
         pass
-    
+
     def clearIndex(self, archive, index):
-	self.__openIndices(archive)
+        self.__openIndices(archive)
         if hasattr(self.threadIndex, 'clear'):
             self.threadIndex.clear()
             return
-	finished=0
-	try:
-	    key, msgid=self.threadIndex.first()	    		
-	except KeyError: finished=1
-	while not finished:
-	    del self.threadIndex[key]
-	    try:
-		key, msgid=self.threadIndex.next()	    		
-	    except KeyError: finished=1
+        finished=0
+        try:
+            key, msgid=self.threadIndex.first()
+        except KeyError: finished=1
+        while not finished:
+            del self.threadIndex[key]
+            try:
+                key, msgid=self.threadIndex.next()
+            except KeyError: finished=1