[Python-checkins] cpython (2.7): Issue #16098: Update heapq.nsmallest to use the same algorithm as nlargest.

raymond.hettinger python-checkins at python.org
Tue Mar 5 08:21:08 CET 2013


http://hg.python.org/cpython/rev/aa9ed98a49cb
changeset:   82488:aa9ed98a49cb
branch:      2.7
parent:      82430:6ae4938256c6
user:        Raymond Hettinger <python at rcn.com>
date:        Tue Mar 05 02:15:01 2013 -0500
summary:
  Issue #16098:  Update heapq.nsmallest to use the same algorithm as nlargest.

This removes the dependency on bisect and it bring the pure Python code
in-sync with the C code.

files:
  Lib/heapq.py |  84 ++++++++++++++++++++++++++++-----------
  1 files changed, 59 insertions(+), 25 deletions(-)


diff --git a/Lib/heapq.py b/Lib/heapq.py
--- a/Lib/heapq.py
+++ b/Lib/heapq.py
@@ -129,9 +129,8 @@
 __all__ = ['heappush', 'heappop', 'heapify', 'heapreplace', 'merge',
            'nlargest', 'nsmallest', 'heappushpop']
 
-from itertools import islice, repeat, count, imap, izip, tee, chain
+from itertools import islice, count, imap, izip, tee, chain
 from operator import itemgetter
-import bisect
 
 def cmp_lt(x, y):
     # Use __lt__ if available; otherwise, try __le__.
@@ -188,6 +187,19 @@
     for i in reversed(xrange(n//2)):
         _siftup(x, i)
 
+def _heappushpop_max(heap, item):
+    """Maxheap version of a heappush followed by a heappop."""
+    if heap and cmp_lt(item, heap[0]):
+        item, heap[0] = heap[0], item
+        _siftup_max(heap, 0)
+    return item
+
+def _heapify_max(x):
+    """Transform list into a maxheap, in-place, in O(len(x)) time."""
+    n = len(x)
+    for i in reversed(range(n//2)):
+        _siftup_max(x, i)
+
 def nlargest(n, iterable):
     """Find the n largest elements in a dataset.
 
@@ -213,30 +225,16 @@
     """
     if n < 0:
         return []
-    if hasattr(iterable, '__len__') and n * 10 <= len(iterable):
-        # For smaller values of n, the bisect method is faster than a minheap.
-        # It is also memory efficient, consuming only n elements of space.
-        it = iter(iterable)
-        result = sorted(islice(it, 0, n))
-        if not result:
-            return result
-        insort = bisect.insort
-        pop = result.pop
-        los = result[-1]    # los --> Largest of the nsmallest
-        for elem in it:
-            if cmp_lt(elem, los):
-                insort(result, elem)
-                pop()
-                los = result[-1]
+    it = iter(iterable)
+    result = list(islice(it, n))
+    if not result:
         return result
-    # An alternative approach manifests the whole iterable in memory but
-    # saves comparisons by heapifying all at once.  Also, saves time
-    # over bisect.insort() which has O(n) data movement time for every
-    # insertion.  Finding the n smallest of an m length iterable requires
-    #    O(m) + O(n log m) comparisons.
-    h = list(iterable)
-    heapify(h)
-    return map(heappop, repeat(h, min(n, len(h))))
+    _heapify_max(result)
+    _heappushpop = _heappushpop_max
+    for elem in it:
+        _heappushpop(result, elem)
+    result.sort()
+    return result
 
 # 'heap' is a heap at all indices >= startpos, except possibly for pos.  pos
 # is the index of a leaf with a possibly out-of-order value.  Restore the
@@ -314,6 +312,42 @@
     heap[pos] = newitem
     _siftdown(heap, startpos, pos)
 
+def _siftdown_max(heap, startpos, pos):
+    'Maxheap variant of _siftdown'
+    newitem = heap[pos]
+    # Follow the path to the root, moving parents down until finding a place
+    # newitem fits.
+    while pos > startpos:
+        parentpos = (pos - 1) >> 1
+        parent = heap[parentpos]
+        if cmp_lt(parent, newitem):
+            heap[pos] = parent
+            pos = parentpos
+            continue
+        break
+    heap[pos] = newitem
+
+def _siftup_max(heap, pos):
+    'Maxheap variant of _siftup'
+    endpos = len(heap)
+    startpos = pos
+    newitem = heap[pos]
+    # Bubble up the larger child until hitting a leaf.
+    childpos = 2*pos + 1    # leftmost child position
+    while childpos < endpos:
+        # Set childpos to index of larger child.
+        rightpos = childpos + 1
+        if rightpos < endpos and not cmp_lt(heap[rightpos], heap[childpos]):
+            childpos = rightpos
+        # Move the larger child up.
+        heap[pos] = heap[childpos]
+        pos = childpos
+        childpos = 2*pos + 1
+    # The leaf at pos is empty now.  Put newitem there, and bubble it up
+    # to its final resting place (by sifting its parents down).
+    heap[pos] = newitem
+    _siftdown_max(heap, startpos, pos)
+
 # If available, use C implementation
 try:
     from _heapq import *

-- 
Repository URL: http://hg.python.org/cpython


More information about the Python-checkins mailing list