[Scipy-svn] r6417 - trunk/scipy/stats

scipy-svn at scipy.org scipy-svn at scipy.org
Thu May 27 17:31:26 EDT 2010


Author: josef
Date: 2010-05-27 16:31:25 -0500 (Thu, 27 May 2010)
New Revision: 6417

Modified:
   trunk/scipy/stats/stats.py
Log:
stats.spearmanr enhance with tiehandling, and 2d with axis ticket:822 ticket:1100

Modified: trunk/scipy/stats/stats.py
===================================================================
--- trunk/scipy/stats/stats.py	2010-05-27 14:15:43 UTC (rev 6416)
+++ trunk/scipy/stats/stats.py	2010-05-27 21:31:25 UTC (rev 6417)
@@ -2117,7 +2117,7 @@
     return r,prob
 
 
-def spearmanr(x, y):
+def spearmanr(a, b=None, axis=0):
     """
     Calculates a Spearman rank-order correlation coefficient and the p-value
     to test for non-correlation.
@@ -2141,19 +2141,33 @@
 
     Parameters
     ----------
-    x : 1D array
-        Must have length > 2
-    y : 1D array
-        Must have the same length as x.
+    a, b : 1D or 2D array_like, b is optional
+        One or two 1-D or 2-D arrays containing multiple variables and
+        observations. Each column of m represents a variable, and each row
+        entry a single observation of those variables. Also see axis below.
+        Both arrays need to have the same length in the `axis` dimension.
+        
+    axis : int or None, optional
+        If axis=0 (default), then each column represents a variable, with
+        observations in the rows. If axis=0, the relationship is transposed:
+        each row represents a variable, while the columns contain observations.
+        If axis=None, then both arrays will be raveled
 
     Returns
     -------
-    r : float
-        The Spearman correlation coefficient
+    rho: float or array (2D square)
+        Spearman correlation matrix or correlation coefficient (if only 2 variables
+        are given as parameters. Correlation matrix is square with length
+        equal to total number of variables (columns or rows) in a and b
+        combined
     p-value : float
         The two-sided p-value for a hypothesis test whose null hypothesis is
-        that the two sets of data are uncorrelated.
+        that two sets of data are uncorrelated, has same dimension as rho
 
+    Notes
+    -----
+    changes in scipy 0.8: rewrite to add tie-handling,  and axis
+
     References
     ----------
     [CRCProbStat2000]_ Section  14.7
@@ -2162,30 +2176,64 @@
        Probablity and Statistics Tables and Formulae. Chapman & Hall: New
        York. 2000.
 
+    Examples
+    --------
+
+    >>> spearmanr([1,2,3,4,5],[5,6,7,8,7])
+    (0.82078268166812329, 0.088587005313543798)
+    >>> np.random.seed(1234321)
+    >>> x2n=np.random.randn(100,2)
+    >>> y2n=np.random.randn(100,2)
+    >>> spearmanr(x2n)
+    (0.059969996999699973, 0.55338590803773591)
+    >>> spearmanr(x2n[:,0], x2n[:,1])
+    (0.059969996999699973, 0.55338590803773591)
+    >>> rho, pval = spearmanr(x2n,y2n)
+    >>> rho
+    array([[ 1.        ,  0.05997   ,  0.18569457,  0.06258626],
+           [ 0.05997   ,  1.        ,  0.110003  ,  0.02534653],
+           [ 0.18569457,  0.110003  ,  1.        ,  0.03488749],
+           [ 0.06258626,  0.02534653,  0.03488749,  1.        ]])
+    >>> pval
+    array([[ 0.        ,  0.55338591,  0.06435364,  0.53617935],
+           [ 0.55338591,  0.        ,  0.27592895,  0.80234077],
+           [ 0.06435364,  0.27592895,  0.        ,  0.73039992],
+           [ 0.53617935,  0.80234077,  0.73039992,  0.        ]])
+    >>> rho, pval = spearmanr(x2n.T, y2n.T, axis=1)
+    >>> rho
+    array([[ 1.        ,  0.05997   ,  0.18569457,  0.06258626],
+           [ 0.05997   ,  1.        ,  0.110003  ,  0.02534653],
+           [ 0.18569457,  0.110003  ,  1.        ,  0.03488749],
+           [ 0.06258626,  0.02534653,  0.03488749,  1.        ]])
+    >>> spearmanr(x2n, y2n, axis=None)
+    (0.10816770419260482, 0.1273562188027364)
+    >>> spearmanr(x2n.ravel(), y2n.ravel())
+    (0.10816770419260482, 0.1273562188027364)
+    
+    >>> xint = np.random.randint(10,size=(100,2))
+    >>> spearmanr(xint)
+    (0.052760927029710199, 0.60213045837062351)
+
     """
-    x = np.asanyarray(x)
-    y = np.asanyarray(y)
-    n = len(x)
-    m = len(y)
-    if n != m:
-        raise ValueError("lengths of x and y must match: %s != %s" % (n, m))
-    if n <= 2:
-        raise ValueError("length must be > 2")
-    rankx = rankdata(x)
-    ranky = rankdata(y)
-    dsq = np.add.reduce((rankx-ranky)**2)
-    rs = 1 - 6*dsq / float(n*(n**2-1))
-    df = n-2
+    a, axisout = _chk_asarray(a, axis)
+    ar = np.apply_along_axis(rankdata,axisout,a)
+    
+    br = None
+    if not b is None:
+        b, axisout = _chk_asarray(b, axis)
+        br = np.apply_along_axis(rankdata,axisout,b)
+    n = a.shape[axisout]
+    rs = np.corrcoef(ar,br,rowvar=axisout)
 
-    try:
-        t = rs * np.sqrt((n-2) / ((rs+1.0)*(1.0-rs)))
-        probrs = betai(0.5*df, 0.5, df/(df+t*t))
-    except ZeroDivisionError:
-        probrs = 0.0
+    t = rs * np.sqrt((n-2) / ((rs+1.0)*(1.0-rs)))
+    prob = distributions.t.sf(np.abs(t),n-2)*2
+    
+    if rs.shape == (2,2):
+        return rs[1,0], prob[1,0]
+    else:
+        return rs, prob
 
-    return rs, probrs
 
-
 def pointbiserialr(x, y):
     # comment: I am changing the semantics somewhat. The original function is
     # fairly general and accepts an x sequence that has any type of thing in it as




More information about the Scipy-svn mailing list