[SciPy-user] sparse version of stats.pearsonr ?

Mon Mar 9 23:54:17 EDT 2009

On Mon, Mar 9, 2009 at 7:18 PM, Peter Skomoroch
<peter.skomoroch at gmail.com> wrote:
> Here is what I have based on pearsonr in scipy.stats:
>
> def sparse_vector_dot(x, y):
>     '''Calculates the dot product for two sparse vectors'''
>     return (x.T*y).data[0]
>
> def sparse_pearsonr(x, y):
>     """Calculates a Pearson correlation coefficient and the p-value for
> testing
>     non-correlation using two sparse vectors as inputs.
>
>     Parameters
>     ----------
>     x : 1D sparse array
>     y : 1D sparse array the same length as x
>
>     Returns
>     -------
>     (Pearson's correlation coefficient,
>      2-tailed p-value)
>
>     References
>     ----------
>     http://www.statsoft.com/textbook/glosp.html#Pearson%20Correlation"""
>
>     # we form a third sparse vector z where the nonzero entries of z
>     # are the union of the nonzero entries in x and y
>     z = x + y
>     n = z.getnnz() #length of x
>     mx = x.data.mean()
>     my = y.data.mean()
>     # we only want to subtract the mean for non-zero values...
>     # so we copy & access the sparse vector components directly:
>     xm, ym = x, y
>     xm.data, ym.data = x.data-mx, y.data-my
>     r_num = n*(sparse_vector_dot(xm,ym))
>     r_den = n*sqrt(sparse_vector_dot(xm,xm)*sparse_vector_dot(ym,ym))
>     r = (r_num / r_den)
>
>     # Presumably, if r > 1, then it is only some small artifact of floating
>     # point arithmetic.
>     r = min(r, 1.0)
>     df = n-2
>
>     # Use a small floating point value to prevent divide-by-zero nonsense
>     # fixme: TINY is probably not the right value and this is probably not
>     # the way to be robust. The scheme used in spearmanr is probably better.
>     TINY = 1.0e-20
>     t = r*sqrt(df/((1.0-r+TINY)*(1.0+r+TINY)))
>     prob = betai(0.5*df,0.5,df/(df+t*t))
>     return r,prob
> - Show quoted text -
>
> On Mon, Mar 9, 2009 at 6:53 PM, Peter Skomoroch <peter.skomoroch at gmail.com>
> wrote:
>>
>> Before I re-invent the wheel, is there an existing version of
>> stats.pearsonr(x,y) that will work with scipy.sparse vectors?
>>
>> -Pete

I don't know what the interpretation of zero values in the sparse
matrix would be in this case, but to me it looks like you are using
inconsistent  sample size n (where at least one of two is non-zero)
while the mean is based on a smaller n (non-zero for one vector). I
also thought that, how the zeros in one vector are treated, depends on
the zeros in the second vector, but maybe not because you subtract the
mean only from the non-zero values x.data-mx, y.data-my and zeros in
the dot product shouldn't matter.

According to my interpretation, you are replacing the zero elements in
the joint vectors (size n) by the means of each vector. Under this
interpretation the sample size and the degrees of freedom might be
correct. I don't know if this is the desired behavior.

I updated my version of the dense equivalent correlation a bit:

import numpy as np
from scipy import sparse, stats

I = np.array([0,3,1,0,4,5,5])
J = np.array([0,3,1,2,3,0,1])
V = np.array([4,5,7,9,3,1,2])

B = sparse.coo_matrix((V,(I,J)),shape=(6,4)).tocsr()

def cov_csr(x, axis=0):
    '''return covariance matrix, assumes column variable
    return type ndarray'''
    meanx = x.sum(axis=axis)/float(x.shape[axis])
    if axis == 0:
        return np.array((x.T*x)/x.shape[axis] - meanx.T*meanx)
    else:
        return np.array((x*x.T)/x.shape[axis] - meanx*meanx.T)

def corrcoef_csr(x, axis=0):
    '''correlation matrix, return type ndarray'''
    covx = cov_csr(x, axis=axis)
    stdx = np.sqrt(np.diag(covx))[np.newaxis,:]
    return covx/(stdx.T * stdx)

def pearsonr_csr(x, axis=0):
    '''returns correlation coefficient or matrix,
    t-statistic and p-value for null hypothesis of zero correlation
    (of pairwise correlation if more than 2 variables, columns)'''
    r = corrcoef_csr(x, axis=axis)
    TINY = 1e-15
    df = x.shape[axis]-2
    t = r*np.sqrt(df/((1.0-r+TINY)*(1.0+r+TINY)))
    if t.shape[axis] == 2:
        t = t[0,1]
        r = r[0,1]
    else:
        t += np.diag(np.inf*np.ones(t.shape[0]))
    return r, t, stats.t.sf(np.abs(t),df)*2

B1 = B[:,:2]
B1d = B1.todense()

print 'sparse cov:\n', cov_csr(B1)
print 'np.cov:\n', np.cov(B1d, rowvar=0, bias=1)
print 'sparse corrcoef:\n', corrcoef_csr(B1)
print 'np.corrcoef:\n', np.corrcoef(B1d, rowvar=0, bias=1)
r, t, p = pearsonr_csr(B1)
print 'sparse pearson r\n', r
print 'sparse t-test:t-stat\n', t
print 'sparse t-test:pvalue\n', p

rs, ps =stats.pearsonr(B1d[:,0],B1d[:,1])
print 'stats.pearsonr:', rs, ps

r0, t0, p0 = pearsonr_csr(B)
print 'sparse pearson r\n', r0
print 'sparse t-test:t-stat\n', t0
print 'sparse t-test:pvalue\n', p0
r1, t1, p1 = pearsonr_csr(B.T, axis=1)
##print 'sparse pearson r\n', r1
##print 'sparse t-test:t-stat\n', t1
##print 'sparse t-test:pvalue\n', p1

from numpy.testing import assert_equal, assert_almost_equal

assert_almost_equal(cov_csr(B1), np.cov(B1d, rowvar=0, bias=1), decimal=14)
assert_almost_equal(corrcoef_csr(B1), np.corrcoef(B1d, rowvar=0,
bias=1), decimal=14)
assert_almost_equal(r, rs, decimal=14)
assert_almost_equal(p, ps, decimal=14)
#test axis
assert_equal(r0, r1)
assert_equal(t0, t1)
assert_equal(p0, p1)
assert_equal(cov_csr(B), cov_csr(B.T, axis=1))
assert_equal(corrcoef_csr(B), corrcoef_csr(B.T, axis=1))

Josef