[pypy-commit] pypy default: Try to follow exactly the logic of CPython about resizing dicts.

Fri May 11 20:00:46 CEST 2012

Author: Armin Rigo <arigo at tunes.org>
Branch: 
Changeset: r55047:274128d2747e
Date: 2012-05-11 20:00 +0200
http://bitbucket.org/pypy/pypy/changeset/274128d2747e/

Log:	Try to follow exactly the logic of CPython about resizing dicts. In
	particular it means that small and medium dictionaries are now
	quadrupled in size instead of just doubled. This might give a
	performance boost in code that populates a lot of dictionaries. To
	be seen.

diff --git a/pypy/rpython/lltypesystem/rdict.py b/pypy/rpython/lltypesystem/rdict.py
--- a/pypy/rpython/lltypesystem/rdict.py
+++ b/pypy/rpython/lltypesystem/rdict.py
@@ -502,8 +502,6 @@
         raise KeyError
     _ll_dict_del(d, i)
 
-# XXX: Move the size checking and resize into a single call which is opauqe to
-# the JIT when the dict isn't virtual, to avoid extra branches.
 @jit.look_inside_iff(lambda d, i: jit.isvirtual(d) and jit.isconstant(i))
 def _ll_dict_del(d, i):
     d.entries.mark_deleted(i)
@@ -516,18 +514,32 @@
         entry.key = lltype.nullptr(ENTRY.key.TO)
     if ENTRIES.must_clear_value:
         entry.value = lltype.nullptr(ENTRY.value.TO)
-    num_entries = len(d.entries)
-    if num_entries > DICT_INITSIZE and d.num_items < num_entries / 4:
-        ll_dict_resize(d)
+    #
+    # The rest is commented out: like CPython we no longer shrink the
+    # dictionary here.  It may shrink later if we try to append a number
+    # of new items to it.  Unsure if this behavior was designed in
+    # CPython or is accidental.  A design reason would be that if you
+    # delete all items in a dictionary (e.g. with a series of
+    # popitem()), then CPython avoids shrinking the table several times.
+    #num_entries = len(d.entries)
+    #if num_entries > DICT_INITSIZE and d.num_items <= num_entries / 4:
+    #    ll_dict_resize(d)
+    # A previous xxx: move the size checking and resize into a single
+    # call which is opaque to the JIT when the dict isn't virtual, to
+    # avoid extra branches.
 
 def ll_dict_resize(d):
     old_entries = d.entries
     old_size = len(old_entries)
     # make a 'new_size' estimate and shrink it if there are many
-    # deleted entry markers
-    new_size = old_size * 2
-    while new_size > DICT_INITSIZE and d.num_items < new_size / 4:
-        new_size /= 2
+    # deleted entry markers.  See CPython for why it is a good idea to
+    # quadruple the dictionary size as long as it's not too big.
+    if d.num_items > 50000: new_estimate = d.num_items * 2
+    else:                   new_estimate = d.num_items * 4
+    new_size = DICT_INITSIZE
+    while new_size <= new_estimate:
+        new_size *= 2
+    #
     d.entries = lltype.typeOf(old_entries).TO.allocate(new_size)
     d.num_items = 0
     d.resize_counter = new_size * 2
diff --git a/pypy/rpython/test/test_rdict.py b/pypy/rpython/test/test_rdict.py
--- a/pypy/rpython/test/test_rdict.py
+++ b/pypy/rpython/test/test_rdict.py
@@ -749,13 +749,20 @@
         assert count_frees >= 3
 
     def test_dict_resize(self):
+        # XXX we no longer automatically resize on 'del'.  We need to
+        # hack a bit in this test to trigger a resize by continuing to
+        # fill the dict's table while keeping the actual size very low
+        # in order to force a resize to shrink the table back
         def func(want_empty):
             d = {}
-            for i in range(rdict.DICT_INITSIZE):
+            for i in range(rdict.DICT_INITSIZE << 1):
                 d[chr(ord('a') + i)] = i
             if want_empty:
-                for i in range(rdict.DICT_INITSIZE):
+                for i in range(rdict.DICT_INITSIZE << 1):
                     del d[chr(ord('a') + i)]
+                for i in range(rdict.DICT_INITSIZE << 3):
+                    d[chr(ord('A') - i)] = i
+                    del d[chr(ord('A') - i)]
             return d
         res = self.interpret(func, [0])
         assert len(res.entries) > rdict.DICT_INITSIZE