[Numpy-svn] r5613 - in trunk/numpy/ma: . tests

Tue Aug 5 19:21:33 EDT 2008

Author: pierregm
Date: 2008-08-05 18:21:31 -0500 (Tue, 05 Aug 2008)
New Revision: 5613

Modified:
   trunk/numpy/ma/extras.py
   trunk/numpy/ma/tests/test_extras.py
Log:
* added cov and corrcoef to ma.extras for compatibility

Modified: trunk/numpy/ma/extras.py
===================================================================

--- trunk/numpy/ma/extras.py	2008-08-05 18:01:14 UTC (rev 5612)
+++ trunk/numpy/ma/extras.py	2008-08-05 23:21:31 UTC (rev 5613)
@@ -14,7 +14,7 @@
 __all__ = ['apply_along_axis', 'atleast_1d', 'atleast_2d', 'atleast_3d',
            'average',
            'column_stack','compress_cols','compress_rowcols', 'compress_rows',
-           'count_masked',
+           'count_masked', 'corrcoef', 'cov',
            'dot','dstack',
            'ediff1d','expand_dims',
            'flatnotmasked_contiguous','flatnotmasked_edges',
@@ -30,7 +30,7 @@
 from itertools import groupby
 import warnings
 
-import core
+import core as ma
 from core import MaskedArray, MAError, add, array, asarray, concatenate, count,\
     filled, getmask, getmaskarray, masked, masked_array, mask_or, nomask, ones,\
     sort, zeros
@@ -166,7 +166,7 @@
     arr is an N-d array.  i varies so as to apply the function along
     the given axis for each 1-d subarray in arr.
     """
-    arr = core.array(arr, copy=False, subok=True)
+    arr = array(arr, copy=False, subok=True)
     nd = arr.ndim
     if axis < 0:
         axis += nd
@@ -213,7 +213,7 @@
             dtypes.append(asarray(res).dtype)
             k += 1
     else:
-        res = core.array(res, copy=False, subok=True)
+        res = array(res, copy=False, subok=True)
         j = i.copy()
         j[axis] = ([slice(None, None)] * res.ndim)
         j.put(indlist, ind)
@@ -244,8 +244,8 @@
     if not hasattr(arr, '_mask'):
         result = np.asarray(outarr, dtype=max_dtypes)
     else:
-        result = core.asarray(outarr, dtype=max_dtypes)
-        result.fill_value = core.default_fill_value(result)
+        result = asarray(outarr, dtype=max_dtypes)
+        result.fill_value = ma.default_fill_value(result)
     return result
 
 def average(a, axis=None, weights=None, returned=False):
@@ -548,21 +548,19 @@
 def dot(a,b, strict=False):
     """Return the dot product of two 2D masked arrays a and b.
 
-    Like the generic numpy equivalent, the product sum is over the
-    last dimension of a and the second-to-last dimension of b.  If
-    strict is True, masked values are propagated: if a masked value
-    appears in a row or column, the whole row or column is considered
-    masked.
+    Like the generic numpy equivalent, the product sum is over the last 
+    dimension of a and the second-to-last dimension of b.  If strict is True, 
+    masked values are propagated: if a masked value appears in a row or column, 
+    the whole row or column is considered masked.
 
     Parameters
     ----------
-        strict : {boolean}
-            Whether masked data are propagated (True) or set to 0 for
-            the computation.
+    strict : {boolean}
+        Whether masked data are propagated (True) or set to 0 for the computation.
 
     Notes
     -----
-        The first argument is not conjugated.
+    The first argument is not conjugated.
 
     """
     #!!!: Works only with 2D arrays. There should be a way to get it to run with higher dimension
@@ -646,7 +644,132 @@
 
 
 
+def cov(x, y=None, rowvar=True, bias=False, allow_masked=True):
+    """Estimates the covariance matrix.
 
+    Normalization is by (N-1) where N is the number of observations (unbiased
+    estimate).  If bias is True then normalization is by N.
+
+    By default, masked values are recognized as such. If x and y have the same 
+    shape, a common mask is allocated: if x[i,j] is masked, then y[i,j] will also
+    be masked. 
+    Setting `allow_masked` to False will raise an exception if values are missing
+    in either of the input arrays.
+
+    Parameters
+    ----------
+    x : array-like
+        Input data.
+        If x is a 1D array, returns the variance.
+        If x is a 2D array, returns the covariance matrix.
+    y : {None, array-like}, optional
+        Optional set of variables.
+    rowvar : {False, True} optional
+        If rowvar is true, then each row is a variable with observations in columns.
+        If rowvar is False, each column is a variable and the observations are in
+        the rows.
+    bias : {False, True} optional
+        Whether to use a biased (True) or unbiased (False) estimate of the covariance.
+        If bias is True, then the normalization is by N, the number of observations.
+        Otherwise, the normalization is by (N-1).
+    allow_masked : {True, False} optional
+        If True, masked values are propagated pair-wise: if a value is masked in x,
+        the corresponding value is masked in y.
+        If False, raises a ValueError exception when some values are missing.
+
+    Raises
+    ------
+    ValueError:
+        Raised if some values are missing and allow_masked is False.
+
+    """
+    x = array(x, ndmin=2, copy=True, dtype=float)
+    xmask = getmaskarray(x)
+    # Quick exit if we can't process masked data
+    if not allow_masked and xmask.any():
+        raise ValueError("Cannot process masked data...")
+    #
+    if x.shape[0] == 1:
+        rowvar = True
+    # Make sure that rowvar is either 0 or 1
+    rowvar = int(bool(rowvar))
+    axis = 1-rowvar
+    if rowvar:
+        tup = (slice(None), None)
+    else:
+        tup = (None, slice(None))
+    #
+    if y is None:
+        xnotmask = np.logical_not(xmask).astype(int)
+    else:
+        y = array(y, copy=False, ndmin=2, dtype=float)
+        ymask = getmaskarray(y)
+        if not allow_masked and ymask.any():
+            raise ValueError("Cannot process masked data...")
+        if xmask.any() or ymask.any():
+            if y.shape == x.shape:
+                # Define some common mask
+                common_mask = np.logical_or(xmask, ymask)
+                if common_mask is not nomask:
+                    x.unshare_mask()
+                    y.unshare_mask()
+                    xmask = x._mask = y._mask = ymask = common_mask
+        x = concatenate((x,y),axis)
+        xnotmask = np.logical_not(np.concatenate((xmask, ymask), axis)).astype(int)
+
+    x -= x.mean(axis=rowvar)[tup]
+
+    if not rowvar:
+        fact = dot(xnotmask.T, xnotmask)*1. - (1 - bool(bias))
+        result = (dot(x.T, x.conj(), strict=False) / fact).squeeze()
+    else:
+        fact = dot(xnotmask, xnotmask.T)*1. - (1 - bool(bias))
+        result = (dot(x, x.T.conj(), strict=False) / fact).squeeze()
+    return result
+
+
+def corrcoef(x, y=None, rowvar=True, bias=False, allow_masked=True):
+    """The correlation coefficients formed from the array x, where the
+    rows are the observations, and the columns are variables.
+
+    corrcoef(x,y) where x and y are 1d arrays is the same as
+    corrcoef(transpose([x,y]))
+
+    Parameters
+    ----------
+    x : ndarray
+        Input data. If x is a 1D array, returns the variance.
+        If x is a 2D array, returns the covariance matrix.
+    y : {None, ndarray} optional
+        Optional set of variables.
+    rowvar : {False, True} optional
+        If True, then each row is a variable with observations in columns.
+        If False, each column is a variable and the observations are in the rows.
+    bias : {False, True} optional
+        Whether to use a biased (True) or unbiased (False) estimate of the
+        covariance.
+        If True, then the normalization is by N, the number of non-missing
+        observations.
+        Otherwise, the normalization is by (N-1).
+    allow_masked : {True, False} optional
+        If True, masked values are propagated pair-wise: if a value is masked
+        in x, the corresponding value is masked in y.
+        If False, raises an exception.
+
+    See Also
+    --------
+    cov
+
+    """
+
+    c = cov(x, y, rowvar, bias, allow_masked)
+    try:
+        d = ma.diag(c)
+    except ValueError: # scalar covariance
+        return 1
+    return c/ma.sqrt(ma.multiply.outer(d,d))
+
+
 #####--------------------------------------------------------------------------
 #---- --- Concatenation helpers ---
 #####--------------------------------------------------------------------------

Modified: trunk/numpy/ma/tests/test_extras.py
===================================================================
--- trunk/numpy/ma/tests/test_extras.py	2008-08-05 18:01:14 UTC (rev 5612)
+++ trunk/numpy/ma/tests/test_extras.py	2008-08-05 23:21:31 UTC (rev 5613)
@@ -12,7 +12,7 @@
 __date__     = '$Date: 2007-10-29 17:18:13 +0200 (Mon, 29 Oct 2007) $'
 
 import numpy
-from numpy.testing import *
+from numpy.testing import TestCase, run_module_suite
 from numpy.ma.testutils import *
 from numpy.ma.core import *
 from numpy.ma.extras import *
@@ -317,6 +317,7 @@
         assert_equal(dx._mask, np.r_[1,0,0,0,0,1])
 
 class TestApplyAlongAxis(TestCase):
+    #
     "Tests 2D functions"
     def test_3d(self):
         a = arange(12.).reshape(2,2,3)
@@ -325,7 +326,9 @@
         xa = apply_along_axis(myfunc,2,a)
         assert_equal(xa,[[1,4],[7,10]])
 
+
 class TestMedian(TestCase):
+    #
     def test_2d(self):
         "Tests median w/ 2D"
         (n,p) = (101,30)
@@ -352,6 +355,71 @@
         assert_equal(median(x,0), [[12,10],[8,9],[16,17]])
 
 
+class TestCov(TestCase):
+    #
+    def setUp(self):
+        self.data = array(np.random.rand(12))
+    #
+    def test_1d_wo_missing(self):
+        "Test cov on 1D variable w/o missing values"
+        x = self.data
+        assert_equal(np.cov(x), cov(x))
+        assert_equal(np.cov(x, rowvar=False), cov(x, rowvar=False))
+        assert_equal(np.cov(x, rowvar=False, bias=True),
+                     cov(x, rowvar=False, bias=True))
+    #
+    def test_2d_wo_missing(self):
+        "Test cov on 1 2D variable w/o missing values"
+        x = self.data.reshape(3,4)
+        assert_equal(np.cov(x), cov(x))
+        assert_equal(np.cov(x, rowvar=False), cov(x, rowvar=False))
+        assert_equal(np.cov(x, rowvar=False, bias=True),
+                     cov(x, rowvar=False, bias=True))
+    #
+    def test_1d_w_missing(self):
+        "Test cov 1 1D variable w/missing values"
+        x = self.data
+        x[-1] = masked
+        x -= x.mean()
+        nx = x.compressed()
+        assert_almost_equal(np.cov(nx), cov(x))
+        assert_almost_equal(np.cov(nx, rowvar=False), cov(x, rowvar=False))
+        assert_almost_equal(np.cov(nx, rowvar=False, bias=True),
+                            cov(x, rowvar=False, bias=True))
+        #
+        try:
+            cov(x, allow_masked=False)
+        except ValueError:
+            pass
+        #
+        # 2 1D variables w/ missing values
+        nx = x[1:-1]
+        assert_almost_equal(np.cov(nx, nx[::-1]), cov(x, x[::-1]))
+        assert_equal(np.cov(nx, nx[::-1], rowvar=False), 
+                     cov(x, x[::-1], rowvar=False))
+        assert_equal(np.cov(nx, nx[::-1], rowvar=False, bias=True),
+                     cov(x, x[::-1], rowvar=False, bias=True))
+    #
+    def test_2d_w_missing(self):
+        "Test cov on 2D variable w/ missing value"
+        x = self.data
+        x[-1] = masked
+        x = x.reshape(3,4)
+        valid = np.logical_not(getmaskarray(x)).astype(int)
+        frac = np.dot(valid, valid.T)
+        xf = (x - x.mean(1)[:,None]).filled(0)
+        assert_almost_equal(cov(x), np.cov(xf) * (x.shape[1]-1) / (frac - 1.))
+        assert_almost_equal(cov(x, bias=True), 
+                            np.cov(xf, bias=True) * x.shape[1] / frac)
+        frac = np.dot(valid.T, valid)
+        xf = (x - x.mean(0)).filled(0)
+        assert_almost_equal(cov(x, rowvar=False), 
+                            np.cov(xf, rowvar=False) * (x.shape[0]-1)/(frac - 1.))
+        assert_almost_equal(cov(x, rowvar=False, bias=True), 
+                            np.cov(xf, rowvar=False, bias=True) * x.shape[0]/frac)
+
+
+
 class TestPolynomial(TestCase):
     #
     def test_polyfit(self):