[SciPy-dev] Should ndimage.measurements.* should return lists if index is a list?

Fri May 1 16:11:53 EDT 2009

2009/5/1 Stéfan van der Walt <stefan at sun.ac.za>:
> 2009/5/1 Thouis (Ray) Jones <thouis at broad.mit.edu>:
>> The new code's about 5 times slower than the current tree, *not*
>> taking into account time spent in the _labeled_reduce function(s).
>> Most of that time is the call to argsort() and the subsequent
>> reorderings in labeled_reduce.  There might be some tricks to avoid
>> those.
>>
>> The code is 10-20 times slower altogether.
>
> I don't think we should read too much into that.  After we've
> optimised the code using Cython, we may very well be cose to where we
> started.
>
> Either way, in my opinion a 5 times slow-down in exchange for a
> maintainable library is well worth it.
>

After a recent comment by Anne, I looked at the weights option in np.bincount.
It is as fast to calculate group/label means with np.bincount as with
the current ndimage and it scales the same, but it works only for all
indices and not for min, max.
weights can be calculated for any element wise function.
group labels can be anything that np.unique1d can handle, string
labels take twice as long

Josef

Here are some timeit results:

number of observations 100000, factors 10000
number of runs 100
group/label mean
np.bincount  0.0382392734272
ndimage.mean 0.0387848287981
group/label mean and variance
np.bincount mv 0.0417609612395
ndimage     mv 0.0501527009718
>>>
number of observations 100000, factors 1000
number of runs 100
np.bincount  0.0356244774126
ndimage.mean 0.0349412732624
np.bincount mv 0.0448310357881
ndimage     mv 0.041015631875
>>>
number of observations 100000, factors 100
number of runs 100
np.bincount  0.0296716174821
ndimage.mean 0.0300010113017
np.bincount mv 0.0360344764488
ndimage     mv 0.0352608138744

number of observations 1000000, factors 100000
number of runs 100
np.bincount  0.967738271205
ndimage.mean 0.83807955201
np.bincount mv 0.96804377372
ndimage     mv 1.08760137392
-------------- next part --------------

import numpy as np
from scipy import ndimage

def groupmeanbin(factors, values):
    '''uses np.bincount, assumes factors/labels are integers
    '''
    #n = len(factors)
    ix,rind = np.unique1d(factors, return_inverse=1)
    gcount = np.bincount(rind)
    gmean = np.bincount(rind, weights=values)/ (1.0*gcount)
    return gmean

def labelmean_str(factors, values):
    # works also for string labels in ys, but requires 1D
    # from mailing list scipy-user 2009-02-11
    unil, unilinv = np.unique1d(factors, return_index=False, return_inverse=True)
    labelmeans = np.array(ndimage.mean(values, labels=unilinv, index=np.arange(len(unil)+1)))
    return labelmeans

def groupstatsbin(factors, values):
    '''uses np.bincount, assumes factors/labels are integers
    '''
    n = len(factors)
    ix,rind = np.unique1d(factors, return_inverse=1)
    gcount = np.bincount(rind)
    gmean = np.bincount(rind, weights=values)/ (1.0*gcount)
    meanarr = gmean[rind]
    withinvar = np.bincount(rind, weights=(values-meanarr)**2) / (1.0*gcount)
    #withinvararr = withinvar[rind]
    #return gcount, gmean , meanarr, withinvar, withinvararr
    return gmean, withinvar

def labelstats_str(factors, values):
    # works also for string labels in ys, but requires 1D
    # from mailing list scipy-user 2009-02-11
    unil, unilinv = np.unique1d(factors, return_index=False, return_inverse=True)
    labelmeans = np.array(ndimage.mean(values, labels=unilinv, index=np.arange(len(unil)+1)))
    labelvars = np.array(ndimage.variance(values, labels=unilinv, index=np.arange(len(unil)+1)))
    return labelmeans, labelvars

nobs = 100000
nfact = 100
factors = np.random.randint(nfact,size=nobs)#.astype('S4')
values = np.random.randn(nobs)

if __name__ == "__main__":

    #print groupstatsbin(factors, values)
    #print labelmean_str(factors, values)

    setupstr = '''from __main__ import np, groupmeanbin, labelmean_str, \
                groupstatsbin, labelstats_str, factors, values'''

    from timeit import Timer

    n = 100
    t1 = Timer(setup=setupstr, stmt='groupmeanbin(factors, values)')
    t2 = Timer(setup=setupstr, stmt='labelmean_str(factors, values)')
    t3 = Timer(setup=setupstr, stmt='groupstatsbin(factors, values)')
    t4 = Timer(setup=setupstr, stmt='labelstats_str(factors, values)')

    print 'number of observations %d, factors %d' % (nobs, nfact)
    print 'number of runs %d' % n
    print 'np.bincount ', t1.timeit(n)/float(n)
    print 'ndimage.mean', t2.timeit(n)/float(n)
    print 'np.bincount mv', t3.timeit(n)/float(n)
    print 'ndimage     mv', t4.timeit(n)/float(n)