[SciPy-dev] Should ndimage.measurements.* should return lists if index is a list?

Tue May 5 14:54:15 EDT 2009

On Mon, May 4, 2009 at 5:11 PM, Thouis (Ray) Jones <thouis at broad.mit.edu> wrote:
> On Mon, May 4, 2009 at 16:43,  <josef.pktd at gmail.com> wrote:
>> The use case, I wrote my application for, also allowed for labels that
>> are strings. The return inverse of unique1d is very flexible since it
>> can handle many different types including structured arrays, but for
>> numeric labels, I just found out, that it is a large speed penalty. If
>> the labels are already integers or can be easily converted to
>> integers, then not using unique1d to create the integer labels should
>> be much faster.
>
> Using bincount probably requires calling unique1d (or some equivalent)
> for this usage, since otherwise the call to bincount could create a
> very large output (2^32 or even 2^64 bins).
>
> Ray Jones

In my version, I started to use a conditional call to unique1d, if the
type of the labels is not int (to handle float and strings). This
could be extended to a very large max(labels).  If the label type is
int then, I just use bincount without unique1d.

I tried it with 50% unused numbers, and the speed of bincount is
comparable to ndimage, labels are generated with
factors = (100+2*np.random.randint(nfact,size=nobs))
#.astype('S6')#.astype(float)
I tried with up to 4 million observations and 10000 unique labels.

I'm starting to get confused with the many different versions, that I
have now for this, the two typical examples are below. ndimage looks
very picky which arrays it accepts, every once in a while I get a not
implemented type exception or even a segfault.

I also attach my full timing and test script.

Josef

def labelmean_ndi(factors, values, index=None):
    # works also for string labels in ys, but requires 1D
    # from mailing list scipy-user 2009-02-11
    unilinv = np.asanyarray(factors)

    if not np.issubdtype(unilinv.dtype, int):
        unil, unilinv = np.unique1d(factors, return_inverse=1)
        if index is None:
            index = np.arange(len(unil)) #unil
    elif index is None:
        index = np.arange(np.max(unilinv)+ 1)

    labelmeans = np.array(ndimage.mean(values, labels=unilinv, index=index))
    return labelmeans

def groupstatsbin(factors, values, ddof=0, stat='mvnx', drop=True):
    '''uses np.bincount, assumes factors/labels are integers,
    create labels with unique1d return_inverse if string labels
    '''
    #n = len(factors)

    rind = np.asanyarray(factors)
    #note: bincount uses complete list of integers, maybe range(max(factor)+1)
    if not np.issubdtype(rind.dtype, int):
        ix, rind = np.unique1d(factors, return_inverse=1)
##    else:
##        rind = factors
    #print rind.shape
    res = []
    if 'c' in stat or 'm' in stat or 'v' in stat:
        gcount = np.bincount(rind)
        if drop:
            keepidx = np.nonzero(gcount)
        if 'c' in stat:
            if drop:
                res.append(gcount[keepidx])
            else:
                res.append(gcount)

    if 'm' in stat or 'v' in stat:
        gmean = np.bincount(rind, weights=values)/ (1.0*gcount)
        if np.max(np.abs(gmean)) > 1e6:
            # improve numerical precision if means are large
            # (to "cheat" on NIST anova test examples)
            meanarr = gmean[rind]
            gmean += np.bincount(rind, weights=(values-meanarr)) / (1.0*gcount)
        if 'm' in stat:
            if drop:
                res.append(gmean[keepidx])
            else:
                res.append(gmean)

    if 'v' in stat:
        meanarr = gmean[rind]
        withinvar = np.bincount(rind, weights=(values-meanarr)**2) /
(1.0*gcount-ddof)
        if drop:
            res.append(withinvar[keepidx])
        else:
            res.append(withinvar)

    if 'n' in stat or 'x' in stat:
        #calculate min, max per factor
        sortind = np.lexsort((values, rind))
        fs = rind[sortind]
        vs = values[sortind]
        fsd = np.hstack((np.inf,np.diff(fs),np.inf))
    if 'n' in stat:
        #minimum
        res.append(vs[fsd[:-1] != 0])
    if 'x' in stat:
        #maximum
        res.append(vs[fsd[1:] != 0])
    return res
-------------- next part --------------

import numpy as np
from numpy.testing import assert_array_almost_equal
from scipy import ndimage

def groupmeanbin(factors, values):
    '''uses np.bincount, assumes factors/labels are integers
    '''
    #n = len(factors)
    ix,rind = np.unique1d(factors, return_inverse=1)
    gcount = np.bincount(rind)
    gmean = np.bincount(rind, weights=values)/ (1.0*gcount)
    return gmean

def labelmean_str(factors, values):
    # works also for string labels in ys, but requires 1D
    # from mailing list scipy-user 2009-02-11
    # check mistake: returns one element to much
    unil,unilinv = np.unique1d(factors, return_index=False, return_inverse=True)

    #labelmeans = np.array(ndimage.mean(values, labels=unilinv, index=np.arange(len(unil)+1)))
    labelmeans = np.array(ndimage.mean(values, labels=unilinv, index=np.arange(len(unil))))
    return labelmeans

def labelmean_dig(factors, values):
    # works also for string labels in ys, but requires 1D
    # from mailing list scipy-user 2009-02-11
    # check mistake: returns one element to much
    #unil = np.unique1d(factors, return_index=False, return_inverse=False)
    unil = np.unique1d(factors)
    unilinv = np.digitize(factors, unil).astype('int64')
    #print unilinv.shape
    #print unilinv.dtype
    #labelmeans = np.array(ndimage.mean(values, labels=unilinv, index=np.arange(len(unil)+1)))
    labelmeans = np.array(ndimage.mean(values, labels=unilinv, index=np.arange(len(unil))))
    return labelmeans

def labelmean_ndi(factors, values, index=None):
    # works also for string labels in ys, but requires 1D
    # from mailing list scipy-user 2009-02-11
    # check mistake: returns one element to much
    #unil,unilinv = np.unique1d(factors, return_index=False, return_inverse=True)
    unilinv = np.asanyarray(factors)

    #note: bincount uses complete list of integers, maybe range(max(factor)+1)
    if not np.issubdtype(unilinv.dtype, int):
        unil, unilinv = np.unique1d(factors, return_inverse=1)
        if index is None:
            index = np.arange(len(unil)) #unil
    elif index is None:
        index = np.arange(np.max(unilinv)+ 1)

    labelmeans = np.array(ndimage.mean(values, labels=unilinv, index=index))
    return labelmeans

def bincount2d(factors):
    # array check copied from np.histogramdd
    try:
        # Sample is an ND-array.
        N, D = factors.shape
        sample = factors
    except (AttributeError, ValueError):
        # Sample is a sequence of 1D arrays.
        sample = np.atleast_2d(factors).T
        N, D = sample.shape
    tmp = np.ascontiguousarray(sample)
    factarr = tmp.view([('',tmp.dtype)]*tmp.shape[-1])
    uni, rind = np.unique1d(factarr, return_inverse=1)
    return np.bincount(rind), uni.view(tmp.dtype).reshape(-1,D)
    #or return unique values as structured array:
    #return np.bincount(rind), uni

def groupstatsbin(factors, values, ddof=0, stat='mvnx', drop=True):
    '''uses np.bincount, assumes factors/labels are integers,
    create labels with unique1d return_inverse if string labels
    '''
    #n = len(factors)

    rind = np.asanyarray(factors)
    #note: bincount uses complete list of integers, maybe range(max(factor)+1)
    if not np.issubdtype(rind.dtype, int):
        ix, rind = np.unique1d(factors, return_inverse=1)
##    else:
##        rind = factors
    #print rind.shape
    res = []
    if 'c' in stat or 'm' in stat or 'v' in stat:
        gcount = np.bincount(rind)
        if drop:
            keepidx = np.nonzero(gcount)
        if 'c' in stat:
            if drop:
                res.append(gcount[keepidx])
            else:
                res.append(gcount)

    if 'm' in stat or 'v' in stat:
        #gcount = np.bincount(rind)
        gmean = np.bincount(rind, weights=values)/ (1.0*gcount)
        if np.max(np.abs(gmean)) > 1e6:
            # improve numerical precision if means are large
            meanarr = gmean[rind]
            gmean += np.bincount(rind, weights=(values-meanarr)) / (1.0*gcount)
        if 'm' in stat:
            if drop:
                res.append(gmean[keepidx])
            else:
                res.append(gmean)

    if 'v' in stat:
        meanarr = gmean[rind]
        withinvar = np.bincount(rind, weights=(values-meanarr)**2) / (1.0*gcount-ddof)
        if drop:
            res.append(withinvar[keepidx])
        else:
            res.append(withinvar)

    #withinvararr = withinvar[rind]
    #return gcount, gmean , meanarr, withinvar, withinvararr

    if 'n' in stat or 'x' in stat:
        #calculate min, max per factor
        sortind = np.lexsort((values, rind))
        fs = rind[sortind]
        vs = values[sortind]
        fsd = np.hstack((np.inf,np.diff(fs),np.inf))
    if 'n' in stat:
        #minimum
        res.append(vs[fsd[:-1] != 0])
    if 'x' in stat:
        #maximum
        res.append(vs[fsd[1:] != 0])
    return res

def labelstats_str(factors, values, stat='mvnx'):
    # works also for string labels in ys, but requires 1D
    # from mailing list scipy-user 2009-02-11
    unil, unilinv = np.unique1d(factors, return_index=False, return_inverse=True)
    res = []
    if 'm' in stat:
        labelmeans = np.array(ndimage.mean(values, labels=unilinv,
                                index=np.arange(len(unil))))
        res.append(labelmeans)
    if 'v' in stat:
        labelvars = np.array(ndimage.variance(values, labels=unilinv,
                                index=np.arange(len(unil))))
        res.append(labelvars)
    if 'n' in stat:
        labelmin = np.array(ndimage.minimum(values, labels=unilinv,
                                index=np.arange(len(unil))))
        res.append(labelmin)
    if 'x' in stat:
        labelmax = np.array(ndimage.maximum(values, labels=unilinv,
                                index=np.arange(len(unil))))
        res.append(labelmax)
    return res

nobs = 40000
nfact = 1000
factors = (100+2*np.random.randint(nfact,size=nobs))#.astype('S6')#.astype(float)#copy()#
#factors = np.random.randint(nfact,size=nobs)
#factors = np.array(factors.tolist(),int)
print factors.dtype
values = np.random.randn(nobs)
indexs = 100 + 2*np.arange(nfact)

def test_compare(nrun):
    nobs = 100
    nfact = 5
    for i in range(nrun):
        #factors = (100*np.random.randint(nfact,size=nobs)).astype('S2')
        factors = (np.random.randint(nfact,size=nobs))#.astype('S2')
        values = 1e0 + 1.e1 * np.random.randn(nobs)
        m,v,n,x = groupstatsbin(factors, values, ddof=1)
        #print factors.shape, values.shape, m.shape, v.shape, n.shape, x.shape
        assert_array_almost_equal(groupstatsbin(factors, values, ddof=1), \
            labelstats_str(factors, values), decimal=10, err_msg = \
            repr(np.asarray(groupstatsbin(factors, values, ddof=1, stat='mvnx')) - \
            np.asarray(labelstats_str(factors, values))) + str(i))

if __name__ == "__main__":

    #print groupstatsbin(factors, values)
    #print labelmean_str(factors, values)

    setupstr = '''from __main__ import np, groupmeanbin, labelmean_str, \
               labelmean_dig, labelmean_ndi, groupstatsbin, labelstats_str, \
               factors, values, indexs'''

    from timeit import Timer

    n = 10
    t1 = Timer(setup=setupstr, stmt='groupmeanbin(factors, values)')
    t2 = Timer(setup=setupstr, stmt='labelmean_str(factors, values)')
    t2a = Timer(setup=setupstr, stmt='labelmean_dig(factors, values)')
    t2b = Timer(setup=setupstr+'; factors=factors.tolist()', \
                                stmt='labelmean_ndi(factors, values,indexs)')
    t2c = Timer(setup=setupstr+'; factors2=np.array(factors.tolist()); from scipy import ndimage', \
                                stmt='np.array(ndimage.mean(values, labels=factors2, index=indexs))')
    t3a = Timer(setup=setupstr, stmt="groupstatsbin(factors, values, stat='m')")
    t3 = Timer(setup=setupstr, stmt="groupstatsbin(factors, values, stat='mv')")
    t4 = Timer(setup=setupstr, stmt="labelstats_str(factors, values, stat='mv')")
    t5 = Timer(setup=setupstr, stmt="groupstatsbin(factors, values, stat='nx')")
    t6 = Timer(setup=setupstr, stmt="labelstats_str(factors, values, stat='nx')")

    print 'number of observations %d, factors %d' % (nobs, nfact)
    print 'number of runs %d' % n
    print 'np.bincount ', t1.timeit(n)/float(n)
    print 'ndimage.mean', t2.timeit(n)/float(n)
    #digitize is slow, drop it
    #print 'ndimage dig ', t2a.timeit(n)/float(n)
    print 'ndimage ndi ', t2b.timeit(n)/float(n)
    if not np.issubdtype(factors.dtype, str):
        print 'ndimage ndipl', t2c.timeit(n)/float(n)
    print 'np.bincount m', t3a.timeit(n)/float(n)
    print 'np.bincount mv', t3.timeit(n)/float(n)
    print 'ndimage     mv', t4.timeit(n)/float(n)
    print 'np.bincount nx', t5.timeit(n)/float(n)
    print 'ndimage     nx', t6.timeit(n)/float(n)

    test_compare(10)

    debug = 0
    if debug:
        nobs = 100
        nfact = 5
        for i in range(1):
            factors = 10+ np.random.randint(nfact,size=nobs)#.astype('S2')
            values = 1e6 + np.random.randn(nobs)
            print np.array(groupstatsbin(factors, values,ddof=1)) - \
                           np.array(labelstats_str(factors, values))#[:,:-1]

        sortind = np.lexsort((values, factors))
        fs = factors[sortind]
        vs = values[sortind]
        #fsd = np.inf*np.ones(len(fs)+)
        fsd = np.hstack((np.inf,np.diff(fs),np.inf)) #fs[:-1]-fs[1:]
        gmin = vs[fsd[:-1] != 0]
        gmax = vs[fsd[1:] != 0]
        print gmin
        print ndimage.minimum(values, labels=factors.tolist(), index=np.arange(15))
        print gmax
        print ndimage.maximum(values, labels=factors.tolist(), index=np.arange(15))

        m,v,n,x = groupstatsbin(factors, values,ddof=1)
        (values[factors==0]-1e6).mean() - ((values[factors==0]).mean()-1e6)
        m1=values[factors==0].mean();
        m1+(values[factors==0]-m1).mean() - (m[0])
        m1=1e6;
        m1+(values[factors==0]-m1).mean() - (m[0])
        m1=0;
        m1+(values[factors==0]-m1).mean() - (m[0])

    debug2 = 0
    if debug2:
        print bincount2d(factors)
        print np.bincount(factors)
        factors = np.random.randint(5,size=(100,3))
        print bincount2d(factors)
        nbins = 2
        nbinsp1 = nbins + 1
        digitizedarr1 = (factors - factors % nbinsp1) / nbinsp1
        digitizedarr = np.mod(factors,nbins)
        #assert np.all(digitizedarr1==digitizedarr)
        cb, bb = bincount2d(digitizedarr1)
        ch, eh = np.histogramdd(digitizedarr1,nbins)
        print cb
        print ch
        assert np.all(cb==ch.ravel())
        print bb