[MATRIX-SIG] Calculating correlation Matrix

Janne Sinkkonen janne@avocado.pc.helsinki.fi
16 Jul 1997 11:43:25 +0300


hyoon@nyptsrv1.etsd.ml.com (Hoon Yoon - IPT Quant) writes:

>   Also, if anyone know of good stat func for NumPy, I would really
>   appreciate the source.
> Thanks much in advance,

Here are some elementary functions including moments (and correlation
matrices) and something to help you with bootstrapping.

# This is from Konrad Hinsen.

import Numeric

def histogram(data, nbins, range = None):
    data = Numeric.array(data)
    if range is None:
        min = Numeric.minimum.reduce(data)
        max = Numeric.maximum.reduce(data)
    else:
        min, max = range
        data = Numeric.repeat(data,
                              Numeric.logical_and(Numeric.less_equal(data, max),
                                                  Numeric.greater_equal(data,
                                                                        min)))
    bin_width = (max-min)/nbins
    data = Numeric.array(Numeric.floor((data - min)/bin_width), Numeric.Int)
    histo = Numeric.add.reduce(Numeric.equal(
        Numeric.arange(nbins)[:,Numeric.NewAxis], data), -1)
    bins = min + bin_width*(Numeric.arange(nbins)+0.5)
    return Numeric.transpose(Numeric.array([bins, histo]))


# I use these as well

from Numeric import *

def mean(X):
    N=X.shape[0]
    return sum(X)/N

def variance(X):
    N=X.shape[0]
    return sum(X**2)/N-(sum(X)/N)**2

def sample_variance(X):
    N=X.shape[0]
    return (sum(X**2)-(sum(X)**2)/N)/(N-1)

def std(X):
    return sqrt(variance(X))

def sample_std(X):
    return sqrt(sample_variance(X))

def covariance(X):
    N=X.shape[0]
    Xt=transpose(X)
    mX=sum(X)/N
    # Note that dot() equals to sum(multiply.outer()).
    # The latter is more memory-hungry.
    return dot(Xt,X)/N-multiply.outer(mX,mX)

def covariance2(X,Y):
    N,M=X.shape[0],Y.shape[0]
    Xt=transpose(X)
    mX,mY=sum(X)/N,sum(Y)/N
    return dot(Xt,Y)/N-multiply.outer(mX,mY)

def correlation(X):
    C=covariance(X)
    V=diagonal(C)
    return C/sqrt(multiply.outer(V,V))

def correlation2(X,Y):
    C,VX,VY=covariance2(X,Y),variance(X),variance(Y)
    return C/sqrt(multiply.outer(VX,VY))

def normalize(a):
    m=mean(a)
    s=std(a)
    return (a-m)/s, m, s

def normalize_from(a,b):
    m=mean(b)
    s=std(b)
    return (a-m)/s

# Euclidean distances
def squared_distances(X,Y):
    return add.outer(sum(X*X,-1),sum(Y*Y,-1))- 2*dot(X,transpose(Y))

# Some functions to make bootstrapping easier.

from Numeric import *
from RandomArray import *
import string
import simpleStat

def bsIndices(n):
    "Returns bootstrap indices for a sample of size N."
    r=randint(0,n,n)
    # The modulus is needless for future versions of Numpy
    # and even now for (all?) non-Inter platforms.
    return r%n

def bsVector(a):
    "Returns the bootstrap vector generated from the bootstrap INDICES."
    # This looks ugly but it is (relatively) fast.
    b=concatenate((searchsorted(sort(a),arange(len(a))),(len(a),)))
    return (b[1:]-b[:-1]).astype(Float)/len(a)

def bsStd(sample,estimator,n):
    "Based on SAMPLE, estimates standard deviation of ESTIMATOR
     using N bootstrap repetitions."
    estimates=[]
    for k in xrange(n):
	ii=bsIndices(len(sample))
	estimates.append(estimator(take(sample,ii)))
    r=array(estimates)
    return (simpleStat.mean(r),
	    simpleStat.sample_std(r))

def bsDistribution_w(sample,w_estimator,n):
    "Based on SAMPLE, returns samples from the distribution of W_ESTIMATOR
     using N bootstrap repetitions. W_ESTIMATOR takes the sample
     and weights as its arguments."
    estimates=[]
    l=len(sample)
    even=ones(l,Float)/l
    for k in xrange(n):
	ii=bsIndices(l)
	estimates.append(w_estimator(take(sample,ii),even))
    r=array(estimates)
    return r

def bsStdBias(sample,w_estimator,n):
    estimates=[]
    l=len(sample)
    B=zeros(l,Float)
    even=ones(l,Float)/l
    for k in xrange(n):
	ii=bsIndices(l)
	B=B+bsVector(ii)
	estimates.append(w_estimator(take(sample,ii),even))
    r=sort(array(estimates))
    return (simpleStat.mean(r),
	    simpleStat.sample_std(r),
	    simpleStat.mean(r)-w_estimator(sample,B/n),
	    r[int(len(r)/2)],
	    r[int(3*len(r)/4)]-r[int(1*len(r)/4)])




-- 
Janne Sinkkonen      <janne@iki.fi>      <URL: http://www.iki.fi/~janne/ >

_______________
MATRIX-SIG  - SIG on Matrix Math for Python

send messages to: matrix-sig@python.org
administrivia to: matrix-sig-request@python.org
_______________