[SciPy-dev] Standard deviations

Tue Nov 29 15:33:08 EST 2005

Hi all,

I have three questions related to standard deviations and variances in 
scipy.

First, can someone explain the behaviour of array.std() without any 
arguments?

 >>> a = arange(30).reshape(3,10)
 >>> a
array([[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
       [20, 21, 22, 23, 24, 25, 26, 27, 28, 29]])
 >>> a.std()
array([ 2.99856287,  2.85723522,  2.74647109,  2.67007684,  2.63104804,
        2.63104804,  2.67007684,  2.74647109,  2.85723522,  2.99856287])

I don't understand what these numbers represent.  The correct standard 
deviations of the column vectors are given by:

 >>> a.std(0)
array([ 10.,  10.,  10.,  10.,  10.,  10.,  10.,  10.,  10.,  10.])

and the standard deviations of the row vectors are:

 >>> a.std(1)
array([ 3.02765035,  3.02765035,  3.02765035])

I would have expected a.std() to give the same output as
 >>> a.ravel().std()
8.8034084308295046

which is what a.mean() does.



Second, I'd like to point out that some of the functions in Lib/stats/ 
have a different convention to scipy core about whether operations are 
performed row-wise or column-wise, and whether anyone would object to my 
changing the stats functions to operate column-wise.  At the moment we 
get this:

 >>> average(a)
array([ 10.,  11.,  12.,  13.,  14.,  15.,  16.,  17.,  18.,  19.])

which is column-wise, but

 >>> std(a)
array([ 3.02765035,  3.02765035,  3.02765035])

which is row-wise.  I presume the default behaviour of std() and friends 
is just a historical relic.  If so we'd be wise to get this straight 
well before a 1.0 release.



Third, I'd like to request that we add an array.var() method to scipy 
core to compute an array's sample variance.

At the moment it seems that there is no way to compute the sample 
variance of an array of numbers without installing the full scipy.  
Users needing to do this will either have to roll their own function in 
Python, like this:

def var(A):
    m = len(A)
    return average((a-means)**2) * (m/(m-1.))

or square the output of std().  Both are less efficient than a native 
array.var() would be, requiring extra memory copying and, in the second 
case, squaring the result of a square root operation, which also 
introduces numerical imprecision.

The extra code required is minimal.  There's an example patch below, 
which works fine except that it inherits the weirdness of std().

Comments? :)

-- Ed


---------------------

Index: base/src/multiarraymodule.c
===================================================================

--- base/src/multiarraymodule.c (revision 1534)
+++ base/src/multiarraymodule.c (working copy)
@@ -460,7 +460,61 @@
        return obj1;
 }

+static PyObject *
+PyArray_Var(PyArrayObject *self, int axis, int rtype)
+{
+       PyObject *obj1=NULL, *obj2=NULL, *new=NULL;
+       PyObject *ret=NULL, *newshape=NULL;
+       int i, n;
+       intp val;

+       if ((new = _check_axis(self, &axis, 0))==NULL) return NULL;
+
+       /* Compute and reshape mean */
+       obj1 = PyArray_EnsureArray(PyArray_Mean((PyAO *)new, axis, rtype));
+       if (obj1 == NULL) {Py_DECREF(new); return NULL;}
+       n = PyArray_NDIM(new);
+       newshape = PyTuple_New(n);
+       if (newshape == NULL) {Py_DECREF(obj1); Py_DECREF(new); return 
NULL;}
+       for (i=0; i<n; i++) {
+               if (i==axis) val = 1;
+               else val = PyArray_DIM(new,i);
+               PyTuple_SET_ITEM(newshape, i, PyInt_FromLong((long)val));
+       }
+       obj2 = PyArray_Reshape((PyAO *)obj1, newshape);
+       Py_DECREF(obj1);
+       Py_DECREF(newshape);
+       if (obj2 == NULL) {Py_DECREF(new); return NULL;}
+
+       /* Compute x = x - mx */
+       obj1 = PyNumber_Subtract((PyObject *)self, obj2);
+       Py_DECREF(obj2);
+       if (obj1 == NULL) {Py_DECREF(new); return NULL;}
+
+       /* Compute x * x */
+       obj2 = PyArray_EnsureArray(PyNumber_Multiply(obj1, obj1));
+       Py_DECREF(obj1);
+       if (obj2 == NULL) {Py_DECREF(new); return NULL;}
+
+       /* Compute add.reduce(x*x,axis) */
+       obj1 = PyArray_GenericReduceFunction((PyAO *)obj2, n_ops.add,
+                                            axis, rtype);
+       Py_DECREF(obj2);
+       if (obj1 == NULL) {Py_DECREF(new); return NULL;}
+
+       n = PyArray_DIM(new,axis)-1;
+       Py_DECREF(new);
+       if (n<=0) n=1;
+       obj2 = PyFloat_FromDouble(1.0/((double )n));
+       if (obj2 == NULL) {Py_DECREF(obj1); return NULL;}
+       ret = PyArray_EnsureArray(PyNumber_Multiply(obj1, obj2));
+       Py_DECREF(obj1);
+       Py_DECREF(obj2);
+
+       return ret;
+}
+
+
 static PyObject *
 PyArray_Sum(PyArrayObject *self, int axis, int rtype)
 {
Index: base/src/arraymethods.c
===================================================================
--- base/src/arraymethods.c     (revision 1534)
+++ base/src/arraymethods.c     (working copy)
@@ -1225,6 +1225,23 @@
        return PyArray_Std(self, axis, rtype.type_num);
 }

+static char doc_var[] = "a.var(axis=None, rtype=None)";
+
+static PyObject *
+array_var(PyArrayObject *self, PyObject *args, PyObject *kwds)
+{
+       int axis=MAX_DIMS;
+       PyArray_Typecode rtype = {PyArray_NOTYPE, 0, 0};
+       static char *kwlist[] = {"axis", "rtype", NULL};
+
+       if (!PyArg_ParseTupleAndKeywords(args, kwds, "|O&O&", kwlist,
+                                        PyArray_AxisConverter,
+                                        &axis, PyArray_TypecodeConverter,
+                                        &rtype)) return NULL;
+
+       return PyArray_Var(self, axis, rtype.type_num);
+}
+
 static char doc_compress[] = "a.compress(condition=, axis=None)";

 static PyObject *
@@ -1501,6 +1518,8 @@
         METH_VARARGS, doc_nonzero},
        {"std", (PyCFunction)array_stddev,
         METH_VARARGS|METH_KEYWORDS, doc_stddev},
+       {"var", (PyCFunction)array_var,
+        METH_VARARGS|METH_KEYWORDS, doc_var},
        {"sum", (PyCFunction)array_sum,
         METH_VARARGS|METH_KEYWORDS, doc_sum},
        {"cumsum", (PyCFunction)array_cumsum,
Index: base/src/scalartypes.inc.src
===================================================================
--- base/src/scalartypes.inc.src        (revision 1534)
+++ base/src/scalartypes.inc.src        (working copy)
@@ -972,7 +972,7 @@

 /**begin repeat

-#name=getfield, take, put, putmask, repeat, tofile, mean, trace, 
diagonal, clip, std, sum, cumsum, prod, cumprod, compress#
+#name=getfield, take, put, putmask, repeat, tofile, mean, trace, 
diagonal, clip, std, var, sum, cumsum, prod, cumprod, compress#
 */

 static PyObject *
@@ -1144,6 +1144,8 @@
         METH_VARARGS, NULL},
        {"std", (PyCFunction)gentype_std,
         METH_VARARGS|METH_KEYWORDS, NULL},
+       {"var", (PyCFunction)gentype_var,
+        METH_VARARGS|METH_KEYWORDS, NULL},
        {"sum", (PyCFunction)gentype_sum,
         METH_VARARGS|METH_KEYWORDS, NULL},
        {"cumsum", (PyCFunction)gentype_cumsum,
Index: base/code_generators/generate_array_api.py
===================================================================
--- base/code_generators/generate_array_api.py  (revision 1534)
+++ base/code_generators/generate_array_api.py  (working copy)
@@ -367,6 +367,10 @@
     """,
      'Std','PyArrayObject *, int, int','PyObject *'),

+    (r"""Var
+    """,
+     'Var','PyArrayObject *, int, int','PyObject *'),
+
     (r"""Sum
     """,
      'Sum','PyArrayObject *, int, int','PyObject *'),