[Scipy-svn] r4758 - in trunk/scipy/cluster: . src tests

scipy-svn at scipy.org scipy-svn at scipy.org
Tue Sep 30 15:15:04 EDT 2008


Author: damian.eads
Date: 2008-09-30 14:14:59 -0500 (Tue, 30 Sep 2008)
New Revision: 4758

Modified:
   trunk/scipy/cluster/distance.py
   trunk/scipy/cluster/hierarchy.py
   trunk/scipy/cluster/src/distance.c
   trunk/scipy/cluster/src/distance.h
   trunk/scipy/cluster/src/distance_wrap.c
   trunk/scipy/cluster/tests/test_distance.py
Log:
Fixed minor bug in cophenet.

Modified: trunk/scipy/cluster/distance.py
===================================================================
--- trunk/scipy/cluster/distance.py	2008-09-30 07:00:07 UTC (rev 4757)
+++ trunk/scipy/cluster/distance.py	2008-09-30 19:14:59 UTC (rev 4758)
@@ -199,6 +199,35 @@
         raise ValueError("p must be at least 1")
     return (abs(u-v)**p).sum() ** (1.0 / p)
 
+def wminkowski(u, v, p, w):
+    """
+    Computes the weighted Minkowski distance between two vectors ``u``
+    and ``v``, defined as
+
+    .. math::
+
+       \sum {(w_i*|u_i - v_i|)^p})^(1/p).
+
+    :Parameters:
+       u : ndarray
+           An :math:`n`-dimensional vector.
+       v : ndarray
+           An :math:`n`-dimensional vector.
+       p : ndarray
+           The norm of the difference :math:`${||u-v||}_p$`.
+       w : ndarray
+           The weight vector.
+
+    :Returns:
+       d : double
+           The Minkowski distance between vectors ``u`` and ``v``.
+    """
+    u = np.asarray(u)
+    v = np.asarray(v)
+    if p < 1:
+        raise ValueError("p must be at least 1")
+    return ((w * abs(u-v))**p).sum() ** (1.0 / p)
+
 def euclidean(u, v):
     """
     Computes the Euclidean distance between two n-vectors ``u`` and ``v``,
@@ -817,6 +846,14 @@
            'jaccard', 'kulsinski', 'mahalanobis', 'matching',
            'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean',
            'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule'.
+       w : ndarray
+           The weight vector (for weighted Minkowski).
+       p : double
+           The p-norm to apply (for Minkowski, weighted and unweighted)
+       V : ndarray
+           The variance vector (for standardized Euclidean).
+       VI : ndarray
+           The inverse of the covariance matrix (for Mahalanobis).
 
     :Returns:
        Y : ndarray
@@ -978,6 +1015,11 @@
        Computes the Sokal-Sneath distance between each pair of
        boolean vectors. (see sokalsneath function documentation)
 
+    22. ``Y = pdist(X, 'wminkowski')``
+
+       Computes the weighted Minkowski distance between each pair of
+       vectors. (see wminkowski function documentation)
+
     22. ``Y = pdist(X, f)``
 
        Computes the distance between all pairs of vectors in X
@@ -1036,6 +1078,11 @@
                 for j in xrange(i+1, m):
                     dm[k] = minkowski(X[i, :], X[j, :], p)
                     k = k + 1
+        elif metric == wminkowski:
+            for i in xrange(0, m - 1):
+                for j in xrange(i+1, m):
+                    dm[k] = wminkowski(X[i, :], X[j, :], p, w)
+                    k = k + 1
         elif metric == seuclidean:
             for i in xrange(0, m - 1):
                 for j in xrange(i+1, m):
@@ -1079,6 +1126,9 @@
             _distance_wrap.pdist_chebyshev_wrap(_convert_to_double(X), dm)
         elif mstr in set(['minkowski', 'mi', 'm']):
             _distance_wrap.pdist_minkowski_wrap(_convert_to_double(X), dm, p)
+        elif mstr in set(['wminkowski', 'wmi', 'wm', 'wpnorm']):
+            _distance_wrap.cdist_weighted_minkowski_wrap(_convert_to_double(X),
+                                                         dm, p, w)
         elif mstr in set(['seuclidean', 'se', 's']):
             if V is not None:
                 V = np.asarray(V)
@@ -1174,7 +1224,9 @@
         elif metric == 'test_cityblock':
             dm = pdist(X, cityblock)
         elif metric == 'test_minkowski':
-            dm = pdist(X, minkowski, p)
+            dm = pdist(X, minkowski, p=p)
+        elif metric == 'test_wminkowski':
+            dm = pdist(X, wminkowski, p=p, w=w)
         elif metric == 'test_cosine':
             dm = pdist(X, cosine)
         elif metric == 'test_correlation':
@@ -1502,7 +1554,7 @@
     return d
 
 
-def cdist(XA, XB, metric='euclidean', p=2, V=None, VI=None):
+def cdist(XA, XB, metric='euclidean', p=2, V=None, VI=None, w=None):
     """
     Computes distance between each pair of observations between two
     collections of vectors. ``XA`` is a :math:`$m_A$` by :math:`$n$`
@@ -1529,8 +1581,18 @@
            'correlation', 'cosine', 'dice', 'euclidean', 'hamming',
            'jaccard', 'kulsinski', 'mahalanobis', 'matching',
            'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean',
-           'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule'.
+           'sokalmichener', 'sokalsneath', 'sqeuclidean', 'wminkowski',
+           'yule'.
+       w : ndarray
+           The weight vector (for weighted Minkowski).
+       p : double
+           The p-norm to apply (for Minkowski, weighted and unweighted)
+       V : ndarray
+           The variance vector (for standardized Euclidean).
+       VI : ndarray
+           The inverse of the covariance matrix (for Mahalanobis).
 
+
     :Returns:
        Y : ndarray
            A :math:`$m_A$` by :math:`$m_B$` distance matrix.
@@ -1688,11 +1750,17 @@
 
     21. ``Y = cdist(X, 'sokalsneath')``
 
-       Computes the Sokal-Sneath distance between each pair of
-       boolean vectors. (see sokalsneath function documentation)
+       Computes the Sokal-Sneath distance between the vectors. (see
+       sokalsneath function documentation)
 
-    22. ``Y = cdist(X, f)``
 
+    22. ``Y = cdist(X, 'wminkowski')``
+
+       Computes the weighted Minkowski distance between the
+       vectors. (see sokalsneath function documentation)
+
+    23. ``Y = cdist(X, f)``
+
        Computes the distance between all pairs of vectors in X
        using the user supplied 2-arity function f. For example,
        Euclidean distance between the vectors could be computed
@@ -1755,6 +1823,10 @@
             for i in xrange(0, mA):
                 for j in xrange(0, mB):
                     dm[i, j] = minkowski(XA[i, :], XB[j, :], p)
+        elif metric == wminkowski:
+            for i in xrange(0, mA):
+                for j in xrange(0, mB):
+                    dm[i, j] = wminkowski(XA[i, :], XB[j, :], p, w)
         elif metric == seuclidean:
             for i in xrange(0, mA):
                 for j in xrange(0, mB):
@@ -1800,9 +1872,12 @@
         elif mstr in set(['chebychev', 'chebyshev', 'cheby', 'cheb', 'ch']):
             _distance_wrap.cdist_chebyshev_wrap(_convert_to_double(XA),
                                                 _convert_to_double(XB), dm)
-        elif mstr in set(['minkowski', 'mi', 'm']):
+        elif mstr in set(['minkowski', 'mi', 'm', 'pnorm']):
             _distance_wrap.cdist_minkowski_wrap(_convert_to_double(XA),
                                                 _convert_to_double(XB), dm, p)
+        elif mstr in set(['wminkowski', 'wmi', 'wm', 'wpnorm']):
+            _distance_wrap.cdist_weighted_minkowski_wrap(_convert_to_double(XA),
+                                                         _convert_to_double(XB), dm, p, _convert_to_double(w))
         elif mstr in set(['seuclidean', 'se', 's']):
             if V is not None:
                 V = np.asarray(V)
@@ -1921,7 +1996,9 @@
         elif metric == 'test_cityblock':
             dm = cdist(XA, XB, cityblock)
         elif metric == 'test_minkowski':
-            dm = cdist(XA, XB, minkowski, p)
+            dm = cdist(XA, XB, minkowski, p=p)
+        elif metric == 'test_wminkowski':
+            dm = cdist(XA, XB, wminkowski, p=p, w=w)
         elif metric == 'test_cosine':
             dm = cdist(XA, XB, cosine)
         elif metric == 'test_correlation':

Modified: trunk/scipy/cluster/hierarchy.py
===================================================================
--- trunk/scipy/cluster/hierarchy.py	2008-09-30 07:00:07 UTC (rev 4757)
+++ trunk/scipy/cluster/hierarchy.py	2008-09-30 19:14:59 UTC (rev 4758)
@@ -910,14 +910,13 @@
          correlation coefficient and ``d`` is the condensed cophentic
          distance matrix (upper triangular form).
     """
-    Z = np.asarray(Z)
-
     nargs = len(args)
 
     if nargs < 1:
         raise ValueError('At least one argument must be passed to cophenet.')
 
     Z = args[0]
+    Z = np.asarray(Z)
     is_valid_linkage(Z, throw=True, name='Z')
     Zs = Z.shape
     n = Zs[0] + 1
@@ -932,6 +931,7 @@
         return zz
 
     Y = args[1]
+    Y = np.asarray(Y)
     Ys = Y.shape
     distance.is_valid_y(Y, throw=True, name='Y')
 

Modified: trunk/scipy/cluster/src/distance.c
===================================================================
--- trunk/scipy/cluster/src/distance.c	2008-09-30 07:00:07 UTC (rev 4757)
+++ trunk/scipy/cluster/src/distance.c	2008-09-30 19:14:59 UTC (rev 4758)
@@ -294,6 +294,16 @@
   return pow(s, 1.0 / p);
 }
 
+double weighted_minkowski_distance(const double *u, const double *v, int n, double p, const double *w) {
+  int i = 0;
+  double s = 0.0, d;
+  for (i = 0; i < n; i++) {
+    d = fabs(u[i] - v[i]) * w[i];
+    s = s + pow(d, p);
+  }
+  return pow(s, 1.0 / p);
+}
+
 void compute_mean_vector(double *res, const double *X, int m, int n) {
   int i, j;
   const double *v;
@@ -489,6 +499,19 @@
   }
 }
 
+void pdist_weighted_minkowski(const double *X, double *dm, int m, int n, double p, const double *w) {
+  int i, j;
+  const double *u, *v;
+  double *it = dm;
+  for (i = 0; i < m; i++) {
+    for (j = i + 1; j < m; j++, it++) {
+      u = X + (n * i);
+      v = X + (n * j);
+      *it = weighted_minkowski_distance(u, v, n, p, w);
+    }
+  }
+}
+
 void pdist_yule_bool(const char *X, double *dm, int m, int n) {
   int i, j;
   const char *u, *v;
@@ -813,6 +836,19 @@
   }
 }
 
+void cdist_weighted_minkowski(const double *XA, const double *XB, double *dm, int mA, int mB, int n, double p, const double *w) {
+  int i, j;
+  const double *u, *v;
+  double *it = dm;
+  for (i = 0; i < mA; i++) {
+    for (j = 0; j < mB; j++, it++) {
+      u = XA + (n * i);
+      v = XB + (n * j);
+      *it = weighted_minkowski_distance(u, v, n, p, w);
+    }
+  }
+}
+
 void cdist_yule_bool(const char *XA, const char *XB, double *dm, int mA, int mB, int n) {
   int i, j;
   const char *u, *v;

Modified: trunk/scipy/cluster/src/distance.h
===================================================================
--- trunk/scipy/cluster/src/distance.h	2008-09-30 07:00:07 UTC (rev 4757)
+++ trunk/scipy/cluster/src/distance.h	2008-09-30 19:14:59 UTC (rev 4758)
@@ -55,6 +55,7 @@
 void pdist_jaccard_bool(const char *X, double *dm, int m, int n);
 void pdist_kulsinski_bool(const char *X, double *dm, int m, int n);
 void pdist_minkowski(const double *X, double *dm, int m, int n, double p);
+void pdist_weighted_minkowski(const double *X, double *dm, int m, int n, double p, const double *w);
 void pdist_yule_bool(const char *X, double *dm, int m, int n);
 void pdist_matching_bool(const char *X, double *dm, int m, int n);
 void pdist_dice_bool(const char *X, double *dm, int m, int n);
@@ -93,6 +94,8 @@
 		      int mA, int mB, int n);
 void cdist_minkowski(const double *XA, const double *XB, double *dm,
 		     int mA, int mB, int n, double p);
+void cdist_weighted_minkowski(const double *XA, const double *XB, double *dm,
+			      int mA, int mB, int n, double p, const double *w);
 void cdist_yule_bool(const char *XA, const char *XB, double *dm,
 		     int mA, int mB, int n);
 void cdist_matching_bool(const char *XA, const char *XB, double *dm,

Modified: trunk/scipy/cluster/src/distance_wrap.c
===================================================================
--- trunk/scipy/cluster/src/distance_wrap.c	2008-09-30 07:00:07 UTC (rev 4757)
+++ trunk/scipy/cluster/src/distance_wrap.c	2008-09-30 19:14:59 UTC (rev 4758)
@@ -347,12 +347,36 @@
     mA = XA_->dimensions[0];
     mB = XB_->dimensions[0];
     n = XA_->dimensions[1];
-
     cdist_minkowski(XA, XB, dm, mA, mB, n, p);
   }
   return Py_BuildValue("d", 0.0);
 }
 
+extern PyObject *cdist_weighted_minkowski_wrap(PyObject *self, PyObject *args) {
+  PyArrayObject *XA_, *XB_, *dm_, *w_;
+  int mA, mB, n;
+  double *dm;
+  const double *XA, *XB, *w;
+  double p;
+  if (!PyArg_ParseTuple(args, "O!O!O!dO!",
+			&PyArray_Type, &XA_, &PyArray_Type, &XB_, 
+			&PyArray_Type, &dm_,
+			&p,
+			&PyArray_Type, &w_)) {
+    return 0;
+  }
+  else {
+    XA = (const double*)XA_->data;
+    XB = (const double*)XB_->data;
+    w = (const double*)w_->data;
+    dm = (double*)dm_->data;
+    mA = XA_->dimensions[0];
+    mB = XB_->dimensions[0];
+    n = XA_->dimensions[1];
+    cdist_weighted_minkowski(XA, XB, dm, mA, mB, n, p, w);
+  }
+  return Py_BuildValue("d", 0.0);
+}
 
 extern PyObject *cdist_yule_bool_wrap(PyObject *self, PyObject *args) {
   PyArrayObject *XA_, *XB_, *dm_;
@@ -824,7 +848,31 @@
   return Py_BuildValue("d", 0.0);
 }
 
+extern PyObject *pdist_weighted_minkowski_wrap(PyObject *self, PyObject *args) {
+  PyArrayObject *X_, *dm_, *w_;
+  int m, n;
+  double *dm, *X, *w;
+  double p;
+  if (!PyArg_ParseTuple(args, "O!O!dO!",
+			&PyArray_Type, &X_,
+			&PyArray_Type, &dm_,
+			&p,
+			&PyArray_Type, &w_)) {
+    return 0;
+  }
+  else {
+    X = (double*)X_->data;
+    dm = (double*)dm_->data;
+    w = (const double*)w_->data;
+    m = X_->dimensions[0];
+    n = X_->dimensions[1];
 
+    pdist_weighted_minkowski(X, dm, m, n, p, w);
+  }
+  return Py_BuildValue("d", 0.0);
+}
+
+
 extern PyObject *pdist_yule_bool_wrap(PyObject *self, PyObject *args) {
   PyArrayObject *X_, *dm_;
   int m, n;
@@ -1048,6 +1096,7 @@
   {"cdist_mahalanobis_wrap", cdist_mahalanobis_wrap, METH_VARARGS},
   {"cdist_matching_bool_wrap", cdist_matching_bool_wrap, METH_VARARGS},
   {"cdist_minkowski_wrap", cdist_minkowski_wrap, METH_VARARGS},
+  {"cdist_weighted_minkowski_wrap", cdist_weighted_minkowski_wrap, METH_VARARGS},
   {"cdist_rogerstanimoto_bool_wrap", cdist_rogerstanimoto_bool_wrap, METH_VARARGS},
   {"cdist_russellrao_bool_wrap", cdist_russellrao_bool_wrap, METH_VARARGS},
   {"cdist_seuclidean_wrap", cdist_seuclidean_wrap, METH_VARARGS},
@@ -1069,6 +1118,7 @@
   {"pdist_mahalanobis_wrap", pdist_mahalanobis_wrap, METH_VARARGS},
   {"pdist_matching_bool_wrap", pdist_matching_bool_wrap, METH_VARARGS},
   {"pdist_minkowski_wrap", pdist_minkowski_wrap, METH_VARARGS},
+  {"pdist_weighted_minkowski_wrap", pdist_weighted_minkowski_wrap, METH_VARARGS},
   {"pdist_rogerstanimoto_bool_wrap", pdist_rogerstanimoto_bool_wrap, METH_VARARGS},
   {"pdist_russellrao_bool_wrap", pdist_russellrao_bool_wrap, METH_VARARGS},
   {"pdist_seuclidean_wrap", pdist_seuclidean_wrap, METH_VARARGS},

Modified: trunk/scipy/cluster/tests/test_distance.py
===================================================================
--- trunk/scipy/cluster/tests/test_distance.py	2008-09-30 07:00:07 UTC (rev 4757)
+++ trunk/scipy/cluster/tests/test_distance.py	2008-09-30 19:14:59 UTC (rev 4758)
@@ -233,6 +233,44 @@
             print (Y1-Y2).max()
         self.failUnless(within_tol(Y1, Y2, eps))
 
+
+    def test_cdist_wminkowski_random_p3d8(self):
+        "Tests cdist(X, 'wminkowski') on random data. (p=3.8)"
+        eps = 1e-07
+        # Get the data: the input matrix and the right output.
+        X1 = eo['cdist-X1']
+        X2 = eo['cdist-X2']
+        w = 1.0 / X1.std(axis=0)
+        Y1 = cdist(X1, X2, 'wminkowski', p=3.8, w=w)
+        Y2 = cdist(X1, X2, 'test_wminkowski', p=3.8, w=w)
+        print (Y1-Y2).max()
+        self.failUnless(within_tol(Y1, Y2, eps))
+
+    def test_cdist_wminkowski_random_p4d6(self):
+        "Tests cdist(X, 'wminkowski') on random data. (p=4.6)"
+        eps = 1e-07
+        # Get the data: the input matrix and the right output.
+        X1 = eo['cdist-X1']
+        X2 = eo['cdist-X2']
+        w = 1.0 / X1.std(axis=0)
+        Y1 = cdist(X1, X2, 'wminkowski', p=4.6, w=w)
+        Y2 = cdist(X1, X2, 'test_wminkowski', p=4.6, w=w)
+        print (Y1-Y2).max()
+        self.failUnless(within_tol(Y1, Y2, eps))
+
+    def test_cdist_wminkowski_random_p1d23(self):
+        "Tests cdist(X, 'wminkowski') on random data. (p=1.23)"
+        eps = 1e-07
+        # Get the data: the input matrix and the right output.
+        X1 = eo['cdist-X1']
+        X2 = eo['cdist-X2']
+        w = 1.0 / X1.std(axis=0)
+        Y1 = cdist(X1, X2, 'wminkowski', p=1.23, w=w)
+        Y2 = cdist(X1, X2, 'test_wminkowski', p=1.23, w=w)
+        print (Y1-Y2).max()
+        self.failUnless(within_tol(Y1, Y2, eps))
+
+
     def test_cdist_seuclidean_random(self):
         "Tests cdist(X, 'seuclidean') on random data."
         eps = 1e-07




More information about the Scipy-svn mailing list