Selecting k smallest or largest elements from a large list in python; (benchmarking)

Dmitry Chichkov dchichkov at gmail.com
Wed Sep 1 21:08:25 EDT 2010


Given: a large list (10,000,000) of floating point numbers;
Task: fastest python code that finds k (small, e.g. 10) smallest
items, preferably with item indexes;
Limitations: in python, using only standard libraries (numpy & scipy
is Ok);

I've tried several methods. With N = 10,000,000, K = 10 The fastest so
far (without item indexes) was pure python implementation
nsmallest_slott_bisect (using bisect/insert). And with indexes
nargsmallest_numpy_argmin (argmin() in the numpy array k times).

Anyone up to the challenge beating my code with some clever selection
algorithm?

Current Table:
1.66864395142 mins_heapq(items, n):
0.946580886841 nsmallest_slott_bisect(items, n):
1.38014793396 nargsmallest(items, n):
10.0732769966 sorted(items)[:n]:
3.17916202545 nargsmallest_numpy_argsort(items, n):
1.31794500351 nargsmallest_numpy_argmin(items, n):
2.37499308586 nargsmallest_numpy_array_argsort(items, n):
0.524670124054 nargsmallest_numpy_array_argmin(items, n):

0.0525538921356 numpy argmin(items): 1892997
0.364673852921 min(items): 10.0000026786


Code:
----------------
import heapq
from random import randint, random
import time
from bisect    import insort
from itertools import islice
from operator import itemgetter

def mins_heapq(items, n):
    nlesser_items = heapq.nsmallest(n, items)
    return nlesser_items

def nsmallest_slott_bisect(iterable, n, insort=insort):
    it   = iter(iterable)
    mins = sorted(islice(it, n))
    for el in it:
        if el <= mins[-1]: #NOTE: equal sign is to preserve duplicates
            insort(mins, el)
            mins.pop()
    return mins

def nargsmallest(iterable, n, insort=insort):
    it   = enumerate(iterable)
    mins = sorted(islice(it, n), key = itemgetter(1))
    loser = mins[-1][1]         # largest of smallest
    for el in it:
        if el[1] <= loser:        # NOTE: equal sign is to preserve
dupl
            mins.append(el)
            mins.sort(key = itemgetter(1))
            mins.pop()
            loser = mins[-1][1]
    return mins

def nargsmallest_numpy_argsort(iter, k):
    distances = N.asarray(iter)
    return [(i, distances[i]) for i in distances.argsort()[0:k]]

def nargsmallest_numpy_array_argsort(array, k):
    return [(i, array[i]) for i in array.argsort()[0:k]]

def nargsmallest_numpy_argmin(iter, k):
    distances = N.asarray(iter)
    mins = []

def nargsmallest_numpy_array_argmin(distances, k):
    mins = []
    for i in xrange(k):
        j = distances.argmin()
        mins.append((j, distances[j]))
        distances[j] = float('inf')

    return mins


test_data = [randint(10, 50) + random() for i in range(10000000)]
K = 10

init = time.time()
mins = mins_heapq(test_data, K)
print time.time() - init, 'mins_heapq(items, n):', mins[:2]

init = time.time()
mins = nsmallest_slott_bisect(test_data, K)
print time.time() - init, 'nsmallest_slott_bisect(items, n):', mins[:
2]

init = time.time()
mins = nargsmallest(test_data, K)
print time.time() - init, 'nargsmallest(items, n):', mins[:2]

init = time.time()
mins = sorted(test_data)[:K]
print time.time() - init, 'sorted(items)[:n]:', time.time() - init,
mins[:2]

import numpy as N
init = time.time()
mins = nargsmallest_numpy_argsort(test_data, K)
print time.time() - init, 'nargsmallest_numpy_argsort(items, n):',
mins[:2]

init = time.time()
mins = nargsmallest_numpy_argmin(test_data, K)
print time.time() - init, 'nargsmallest_numpy_argmin(items, n):',
mins[:2]


print
init = time.time()
mins = array.argmin()
print time.time() - init, 'numpy argmin(items):', mins

init = time.time()
mins = min(test_data)
print time.time() - init, 'min(items):', mins




More information about the Python-list mailing list