[Scipy-svn] r4666 - in trunk/scipy/cluster: . src tests
scipy-svn at scipy.org
scipy-svn at scipy.org
Sat Aug 23 01:21:16 EDT 2008
Author: damian.eads
Date: 2008-08-23 00:21:05 -0500 (Sat, 23 Aug 2008)
New Revision: 4666
Modified:
trunk/scipy/cluster/distance.py
trunk/scipy/cluster/hierarchy.py
trunk/scipy/cluster/src/distance.c
trunk/scipy/cluster/src/distance.h
trunk/scipy/cluster/src/distance_wrap.c
trunk/scipy/cluster/src/hierarchy.c
trunk/scipy/cluster/src/hierarchy.h
trunk/scipy/cluster/src/hierarchy_wrap.c
trunk/scipy/cluster/tests/test_distance.py
trunk/scipy/cluster/tests/test_hierarchy.py
Log:
Moved some remaining distance-related functions from scipy.distance to scipy.cluster.
Modified: trunk/scipy/cluster/distance.py
===================================================================
--- trunk/scipy/cluster/distance.py 2008-08-23 00:42:16 UTC (rev 4665)
+++ trunk/scipy/cluster/distance.py 2008-08-23 05:21:05 UTC (rev 4666)
@@ -7,36 +7,74 @@
stored in a rectangular array.
+------------------+-------------------------------------------------+
+|*Function* | *Description* |
++------------------+-------------------------------------------------+
|pdist | computes distances between observation pairs. |
+------------------+-------------------------------------------------+
+|squareform | converts a square distance matrix to a |
+| | condensed one and vice versa. |
++------------------+-------------------------------------------------+
+Predicates for checking the validity of distance matrices, both
+condensed and redundant.
+
++------------------+-------------------------------------------------+
+|*Function* | *Description* |
++------------------+-------------------------------------------------+
+|is_valid_dm | checks for a valid distance matrix. |
++------------------+-------------------------------------------------+
+|is_valid_y | checks for a valid condensed distance matrix.
++------------------+-------------------------------------------------+
+
Distance functions between two vectors ``u`` and ``v``. Computing
distances over a large collection of vectors is inefficient for these
functions. Use ``pdist`` for this purpose.
+------------------+-------------------------------------------------+
-|braycurtis | the Bray-Curtis distance. |
-|canberra | the Canberra distance. |
-|chebyshev | the Chebyshev distance. |
-|cityblock | the Manhattan distance. |
-|correlation | the Correlation distance. |
-|cosine | the Cosine distance. |
-|dice | the Dice dissimilarity (boolean). |
-|euclidean | the Euclidean distance. |
-|hamming | the Hamming distance (boolean). |
-|jaccard | the Jaccard distance (boolean). |
-|kulsinski | the Kulsinski distance (boolean). |
-|mahalanobis | the Mahalanobis distance. |
-|matching | the matching dissimilarity (boolean). |
-|minkowski | the Minkowski distance. |
-|rogerstanimoto | the Rogers-Tanimoto dissimilarity (boolean). |
-|russellrao | the Russell-Rao dissimilarity (boolean). |
-|seuclidean | the normalized Euclidean distance. |
-|sokalmichener | the Sokal-Michener dissimilarity (boolean). |
-|sokalsneath | the Sokal-Sneath dissimilarity (boolean). |
-|sqeuclidean | the squared Euclidean distance. |
-|yule | the Yule dissimilarity (boolean). |
+|*Function* | *Description* |
+------------------+-------------------------------------------------+
+| braycurtis | the Bray-Curtis distance. |
++------------------+-------------------------------------------------+
+| canberra | the Canberra distance. |
++------------------+-------------------------------------------------+
+| chebyshev | the Chebyshev distance. |
++------------------+-------------------------------------------------+
+| cityblock | the Manhattan distance. |
++------------------+-------------------------------------------------+
+| correlation | the Correlation distance. |
++------------------+-------------------------------------------------+
+| cosine | the Cosine distance. |
++------------------+-------------------------------------------------+
+| dice | the Dice dissimilarity (boolean). |
++------------------+-------------------------------------------------+
+| euclidean | the Euclidean distance. |
++------------------+-------------------------------------------------+
+| hamming | the Hamming distance (boolean). |
++------------------+-------------------------------------------------+
+| jaccard | the Jaccard distance (boolean). |
++------------------+-------------------------------------------------+
+| kulsinski | the Kulsinski distance (boolean). |
++------------------+-------------------------------------------------+
+| mahalanobis | the Mahalanobis distance. |
++------------------+-------------------------------------------------+
+| matching | the matching dissimilarity (boolean). |
++------------------+-------------------------------------------------+
+| minkowski | the Minkowski distance. |
++------------------+-------------------------------------------------+
+| rogerstanimoto | the Rogers-Tanimoto dissimilarity (boolean). |
++------------------+-------------------------------------------------+
+| russellrao | the Russell-Rao dissimilarity (boolean). |
++------------------+-------------------------------------------------+
+| seuclidean | the normalized Euclidean distance. |
++------------------+-------------------------------------------------+
+| sokalmichener | the Sokal-Michener dissimilarity (boolean). |
++------------------+-------------------------------------------------+
+| sokalsneath | the Sokal-Sneath dissimilarity (boolean). |
++------------------+-------------------------------------------------+
+| sqeuclidean | the squared Euclidean distance. |
++------------------+-------------------------------------------------+
+| yule | the Yule dissimilarity (boolean). |
++------------------+-------------------------------------------------+
References
@@ -1161,3 +1199,296 @@
else:
raise TypeError('2nd argument metric must be a string identifier or a function.')
return dm
+def squareform(X, force="no", checks=True):
+ """
+ Converts a vector-form distance vector to a square-form distance
+ matrix, and vice-versa.
+
+ :Parameters:
+ X : ndarray
+ Either a condensed or redundant distance matrix.
+
+ :Returns:
+ Y : ndarray
+ If a condensed distance matrix is passed, a redundant
+ one is returned, or if a redundant one is passed, a
+ condensed distance matrix is returned.
+
+ force : string
+ As with MATLAB(TM), if force is equal to 'tovector' or
+ 'tomatrix', the input will be treated as a distance matrix
+ or distance vector respectively.
+
+ checks : bool
+ If ``checks`` is set to ``False``, no checks will be made
+ for matrix symmetry nor zero diagonals. This is useful if
+ it is known that ``X - X.T1`` is small and ``diag(X)`` is
+ close to zero. These values are ignored any way so they do
+ not disrupt the squareform transformation.
+
+
+ Calling Conventions
+ -------------------
+
+ 1. v = squareform(X)
+
+ Given a square d by d symmetric distance matrix ``X``,
+ ``v=squareform(X)`` returns a :math:`$d*(d-1)/2$` (or
+ `${n \choose 2}$`) sized vector v.
+
+ v[{n \choose 2}-{n-i \choose 2} + (j-i-1)] is the distance
+ between points i and j. If X is non-square or asymmetric, an error
+ is returned.
+
+ X = squareform(v)
+
+ Given a d*d(-1)/2 sized v for some integer d>=2 encoding distances
+ as described, X=squareform(v) returns a d by d distance matrix X. The
+ X[i, j] and X[j, i] values are set to
+ v[{n \choose 2}-{n-i \choose 2} + (j-u-1)] and all
+ diagonal elements are zero.
+
+ """
+
+ X = _convert_to_double(np.asarray(X))
+
+ if not np.issubsctype(X, np.double):
+ raise TypeError('A double array must be passed.')
+
+ s = X.shape
+
+ # X = squareform(v)
+ if len(s) == 1 and force != 'tomatrix':
+ if X.shape[0] == 0:
+ return np.zeros((1,1), dtype=np.double)
+
+ # Grab the closest value to the square root of the number
+ # of elements times 2 to see if the number of elements
+ # is indeed a binomial coefficient.
+ d = int(np.ceil(np.sqrt(X.shape[0] * 2)))
+
+ # Check that v is of valid dimensions.
+ if d * (d - 1) / 2 != int(s[0]):
+ raise ValueError('Incompatible vector size. It must be a binomial coefficient n choose 2 for some integer n >= 2.')
+
+ # Allocate memory for the distance matrix.
+ M = np.zeros((d, d), dtype=np.double)
+
+ # Since the C code does not support striding using strides.
+ # The dimensions are used instead.
+ [X] = _copy_arrays_if_base_present([X])
+
+ # Fill in the values of the distance matrix.
+ _distance_wrap.to_squareform_from_vector_wrap(M, X)
+
+ # Return the distance matrix.
+ M = M + M.transpose()
+ return M
+ elif len(s) != 1 and force.lower() == 'tomatrix':
+ raise ValueError("Forcing 'tomatrix' but input X is not a distance vector.")
+ elif len(s) == 2 and force.lower() != 'tovector':
+ if s[0] != s[1]:
+ raise ValueError('The matrix argument must be square.')
+ if checks:
+ if np.sum(np.sum(X == X.transpose())) != np.product(X.shape):
+ raise ValueError('The distance matrix array must be symmetrical.')
+ if (X.diagonal() != 0).any():
+ raise ValueError('The distance matrix array must have zeros along the diagonal.')
+
+ # One-side of the dimensions is set here.
+ d = s[0]
+
+ if d <= 1:
+ return np.array([], dtype=np.double)
+
+ # Create a vector.
+ v = np.zeros(((d * (d - 1) / 2),), dtype=np.double)
+
+ # Since the C code does not support striding using strides.
+ # The dimensions are used instead.
+ [X] = _copy_arrays_if_base_present([X])
+
+ # Convert the vector to squareform.
+ _distance_wrap.to_vector_from_squareform_wrap(X, v)
+ return v
+ elif len(s) != 2 and force.lower() == 'tomatrix':
+ raise ValueError("Forcing 'tomatrix' but input X is not a distance vector.")
+ else:
+ raise ValueError('The first argument must be one or two dimensional array. A %d-dimensional array is not permitted' % len(s))
+
+def is_valid_dm(D, tol=0.0, throw=False, name="D", warning=False):
+ """
+ Returns True if the variable D passed is a valid distance matrix.
+ Distance matrices must be 2-dimensional numpy arrays containing
+ doubles. They must have a zero-diagonal, and they must be symmetric.
+
+ :Parameters:
+ D : ndarray
+ The candidate object to test for validity.
+ tol : double
+ The distance matrix should be symmetric. tol is the maximum
+ difference between the :math:`$ij$`th entry and the
+ :math:`$ji$`th entry for the distance metric to be
+ considered symmetric.
+ throw : bool
+ An exception is thrown if the distance matrix passed is not
+ valid.
+ name : string
+ the name of the variable to checked. This is useful ifa
+ throw is set to ``True`` so the offending variable can be
+ identified in the exception message when an exception is
+ thrown.
+ warning : boolx
+ Instead of throwing an exception, a warning message is
+ raised.
+
+ :Returns:
+ Returns ``True`` if the variable ``D`` passed is a valid
+ distance matrix. Small numerical differences in ``D`` and
+ ``D.T`` and non-zeroness of the diagonal are ignored if they are
+ within the tolerance specified by ``tol``.
+ """
+ D = np.asarray(D)
+ valid = True
+ try:
+ if type(D) != np.ndarray:
+ if name:
+ raise TypeError('\'%s\' passed as a distance matrix is not a numpy array.' % name)
+ else:
+ raise TypeError('Variable is not a numpy array.')
+ s = D.shape
+ if D.dtype != np.double:
+ if name:
+ raise TypeError('Distance matrix \'%s\' must contain doubles (double).' % name)
+ else:
+ raise TypeError('Distance matrix must contain doubles (double).')
+ if len(D.shape) != 2:
+ if name:
+ raise ValueError('Distance matrix \'%s\' must have shape=2 (i.e. be two-dimensional).' % name)
+ else:
+ raise ValueError('Distance matrix must have shape=2 (i.e. be two-dimensional).')
+ if tol == 0.0:
+ if not (D == D.T).all():
+ if name:
+ raise ValueError('Distance matrix \'%s\' must be symmetric.' % name)
+ else:
+ raise ValueError('Distance matrix must be symmetric.')
+ if not (D[xrange(0, s[0]), xrange(0, s[0])] == 0).all():
+ if name:
+ raise ValueError('Distance matrix \'%s\' diagonal must be zero.' % name)
+ else:
+ raise ValueError('Distance matrix diagonal must be zero.')
+ else:
+ if not (D - D.T <= tol).all():
+ if name:
+ raise ValueError('Distance matrix \'%s\' must be symmetric within tolerance %d.' % (name, tol))
+ else:
+ raise ValueError('Distance matrix must be symmetric within tolerance %5.5f.' % tol)
+ if not (D[xrange(0, s[0]), xrange(0, s[0])] <= tol).all():
+ if name:
+ raise ValueError('Distance matrix \'%s\' diagonal must be close to zero within tolerance %5.5f.' % (name, tol))
+ else:
+ raise ValueError('Distance matrix \'%s\' diagonal must be close to zero within tolerance %5.5f.' % tol)
+ except Exception, e:
+ if throw:
+ raise
+ if warning:
+ _warning(str(e))
+ valid = False
+ return valid
+
+def is_valid_y(y, warning=False, throw=False, name=None):
+ """
+ Returns ``True`` if the variable ``y`` passed is a valid condensed
+ distance matrix. Condensed distance matrices must be 1-dimensional
+ numpy arrays containing doubles. Their length must be a binomial
+ coefficient :math:`${n \choose 2}$` for some positive integer n.
+
+
+ :Parameters:
+ y : ndarray
+ The condensed distance matrix.
+
+ warning : bool
+ Invokes a warning if the variable passed is not a valid
+ condensed distance matrix. The warning message explains why
+ the distance matrix is not valid. 'name' is used when
+ referencing the offending variable.
+
+ throws : throw
+ Throws an exception if the variable passed is not a valid
+ condensed distance matrix.
+
+ name : bool
+ Used when referencing the offending variable in the
+ warning or exception message.
+
+ """
+ y = np.asarray(y)
+ valid = True
+ try:
+ if type(y) != np.ndarray:
+ if name:
+ raise TypeError('\'%s\' passed as a condensed distance matrix is not a numpy array.' % name)
+ else:
+ raise TypeError('Variable is not a numpy array.')
+ if y.dtype != np.double:
+ if name:
+ raise TypeError('Condensed distance matrix \'%s\' must contain doubles (double).' % name)
+ else:
+ raise TypeError('Condensed distance matrix must contain doubles (double).')
+ if len(y.shape) != 1:
+ if name:
+ raise ValueError('Condensed distance matrix \'%s\' must have shape=1 (i.e. be one-dimensional).' % name)
+ else:
+ raise ValueError('Condensed distance matrix must have shape=1 (i.e. be one-dimensional).')
+ n = y.shape[0]
+ d = int(np.ceil(np.sqrt(n * 2)))
+ if (d*(d-1)/2) != n:
+ if name:
+ raise ValueError('Length n of condensed distance matrix \'%s\' must be a binomial coefficient, i.e. there must be a k such that (k \choose 2)=n)!' % name)
+ else:
+ raise ValueError('Length n of condensed distance matrix must be a binomial coefficient, i.e. there must be a k such that (k \choose 2)=n)!')
+ except Exception, e:
+ if throw:
+ raise
+ if warning:
+ _warning(str(e))
+ valid = False
+ return valid
+
+def numobs_dm(D):
+ """
+ Returns the number of original observations that correspond to a
+ square, redudant distance matrix D.
+
+ :Parameters:
+ D : ndarray
+ The target distance matrix.
+
+ :Returns:
+ The number of observations in the redundant distance matrix.
+ """
+ D = np.asarray(D)
+ is_valid_dm(D, tol=np.inf, throw=True, name='D')
+ return D.shape[0]
+
+def numobs_y(Y):
+ """
+ Returns the number of original observations that correspond to a
+ condensed distance matrix Y.
+
+ :Parameters:
+ Y : ndarray
+ The number of original observations in the condensed
+ observation ``Y``.
+
+ :Returns:
+ n : int
+ The number of observations in the condensed distance matrix
+ passed.
+ """
+ Y = np.asarray(Y)
+ is_valid_y(Y, throw=True, name='Y')
+ d = int(np.ceil(np.sqrt(Y.shape[0] * 2)))
+ return d
Modified: trunk/scipy/cluster/hierarchy.py
===================================================================
--- trunk/scipy/cluster/hierarchy.py 2008-08-23 00:42:16 UTC (rev 4665)
+++ trunk/scipy/cluster/hierarchy.py 2008-08-23 05:21:05 UTC (rev 4666)
@@ -22,8 +22,6 @@
median the median/WPGMC algorithm. (alias)
ward the Ward/incremental algorithm. (alias)
- squareform converts a sq. D.M. to a condensed one and vice versa.
-
Statistic computations on hierarchies
cophenet computes the cophenetic distance between leaves.
@@ -46,10 +44,8 @@
Predicates
- is_valid_dm checks for a valid distance matrix.
is_valid_im checks for a valid inconsistency matrix.
is_valid_linkage checks for a valid hierarchical clustering.
- is_valid_y checks for a valid condensed distance matrix.
is_isomorphic checks if two flat clusterings are isomorphic.
is_monotonic checks if a linkage is monotonic.
Z_y_correspond checks for validity of distance matrix given a linkage.
@@ -57,8 +53,6 @@
Utility Functions
numobs_dm # of observations in a distance matrix.
- numobs_linkage # of observations in a linkage.
- numobs_y # of observations in a condensed distance matrix.
Legal stuff
@@ -186,12 +180,6 @@
l = [_copy_array_if_base_present(a) for a in T]
return l
-
-def copying():
- """ Displays the license for this package."""
- print _copyingtxt
- return None
-
def _randdm(pnts):
""" Generates a random distance matrix stored in condensed form. A
pnts * (pnts - 1) / 2 sized vector is returned.
@@ -204,53 +192,68 @@
def single(y):
"""
- Z = single(y)
-
Performs single/min/nearest linkage on the condensed distance
- matrix Z. See linkage for more information on the return structure
- and algorithm.
+ matrix ``y``. See ``linkage`` for more information on the return
+ structure and algorithm.
- (a condensed alias for single)
+ :Parameters:
+ y : ndarray
+ The upper triangular of the distance matrix y. The result of
+ ``pdist`` is returned in this form.
+
+ :Returns:
+ Z : ndarray
+ The linkage matrix.
+
"""
return linkage(y, method='single', metric='euclidean')
def complete(y):
"""
- Z = complete(y)
-
Performs complete complete/max/farthest point linkage on the
- condensed distance matrix Z. See linkage for more information
- on the return structure and algorithm.
+ condensed distance matrix ``y``. See ``linkage`` for more
+ information on the return structure and algorithm.
- (a condensed alias for linkage)
+ :Parameters:
+ y : ndarray
+ The upper triangular of the distance matrix y. The result of
+ ``pdist`` is returned in this form.
+
"""
return linkage(y, method='complete', metric='euclidean')
def average(y):
"""
- Z = average(y)
+ Performs average/UPGMA linkage on the condensed distance matrix
+ ``y``. See ``linkage`` for more information on the return
+ structure and algorithm.
- Performs average/UPGMA linkage on the condensed distance matrix Z. See
- linkage for more information on the return structure and algorithm.
-
- (a condensed alias for linkage)
+ :Parameters:
+ y : ndarray
+ The upper triangular of the distance matrix y. The result of
+ ``pdist`` is returned in this form.
"""
return linkage(y, method='average', metric='euclidean')
def weighted(y):
"""
- Z = weighted(y)
+ Performs weighted/WPGMA linkage on the condensed distance matrix
+ ``y``. See ``linkage`` for more information on the return
+ structure and algorithm.
- Performs weighted/WPGMA linkage on the condensed distance matrix Z.
- See linkage for more information on the return structure and
- algorithm.
-
- (a condensed alias for linkage)
+ :Parameters:
+ y : ndarray
+ The upper triangular of the distance matrix y. The result of
+ ``pdist`` is returned in this form.
"""
return linkage(y, method='weighted', metric='euclidean')
def centroid(y):
"""
+ Performs centroid/UPGMC linkage. See ``linkage`` for more
+ information on the return structure and algorithm.
+
+
Z = centroid(y)
Performs centroid/UPGMC linkage on the condensed distance matrix Z.
@@ -652,107 +655,6 @@
else:
return nd
-def squareform(X, force="no", checks=True):
- """
- ... = squareform(...)
-
- Converts a vector-form distance vector to a square-form distance
- matrix, and vice-versa.
-
- v = squareform(X)
-
- Given a square d by d symmetric distance matrix X, v=squareform(X)
- returns a d*(d-1)/2 (or {n \choose 2}) sized vector v.
-
- v[{n \choose 2}-{n-i \choose 2} + (j-i-1)] is the distance
- between points i and j. If X is non-square or asymmetric, an error
- is returned.
-
- X = squareform(v)
-
- Given a d*d(-1)/2 sized v for some integer d>=2 encoding distances
- as described, X=squareform(v) returns a d by d distance matrix X. The
- X[i, j] and X[j, i] values are set to
- v[{n \choose 2}-{n-i \choose 2} + (j-u-1)] and all
- diagonal elements are zero.
-
- As with MATLAB(TM), if force is equal to 'tovector' or 'tomatrix',
- the input will be treated as a distance matrix or distance vector
- respectively.
-
- If checks is set to False, no checks will be made for matrix
- symmetry nor zero diagonals. This is useful if it is known that
- X - X.T is small and diag(X) is close to zero. These values are
- ignored any way so they do not disrupt the squareform
- transformation.
- """
-
- X = _convert_to_double(np.asarray(X))
-
- if not np.issubsctype(X, np.double):
- raise TypeError('A double array must be passed.')
-
- s = X.shape
-
- # X = squareform(v)
- if len(s) == 1 and force != 'tomatrix':
- if X.shape[0] == 0:
- return np.zeros((1,1), dtype=np.double)
-
- # Grab the closest value to the square root of the number
- # of elements times 2 to see if the number of elements
- # is indeed a binomial coefficient.
- d = int(np.ceil(np.sqrt(X.shape[0] * 2)))
-
- # Check that v is of valid dimensions.
- if d * (d - 1) / 2 != int(s[0]):
- raise ValueError('Incompatible vector size. It must be a binomial coefficient n choose 2 for some integer n >= 2.')
-
- # Allocate memory for the distance matrix.
- M = np.zeros((d, d), dtype=np.double)
-
- # Since the C code does not support striding using strides.
- # The dimensions are used instead.
- [X] = _copy_arrays_if_base_present([X])
-
- # Fill in the values of the distance matrix.
- _hierarchy_wrap.to_squareform_from_vector_wrap(M, X)
-
- # Return the distance matrix.
- M = M + M.transpose()
- return M
- elif len(s) != 1 and force.lower() == 'tomatrix':
- raise ValueError("Forcing 'tomatrix' but input X is not a distance vector.")
- elif len(s) == 2 and force.lower() != 'tovector':
- if s[0] != s[1]:
- raise ValueError('The matrix argument must be square.')
- if checks:
- if np.sum(np.sum(X == X.transpose())) != np.product(X.shape):
- raise ValueError('The distance matrix array must be symmetrical.')
- if (X.diagonal() != 0).any():
- raise ValueError('The distance matrix array must have zeros along the diagonal.')
-
- # One-side of the dimensions is set here.
- d = s[0]
-
- if d <= 1:
- return np.array([], dtype=np.double)
-
- # Create a vector.
- v = np.zeros(((d * (d - 1) / 2),), dtype=np.double)
-
- # Since the C code does not support striding using strides.
- # The dimensions are used instead.
- [X] = _copy_arrays_if_base_present([X])
-
- # Convert the vector to squareform.
- _hierarchy_wrap.to_vector_from_squareform_wrap(X, v)
- return v
- elif len(s) != 2 and force.lower() == 'tomatrix':
- raise ValueError("Forcing 'tomatrix' but input X is not a distance vector.")
- else:
- raise ValueError('The first argument must be one or two dimensional array. A %d-dimensional array is not permitted' % len(s))
-
def _convert_to_bool(X):
if X.dtype != np.bool:
X = np.bool_(X)
@@ -1035,140 +937,6 @@
valid = False
return valid
-def is_valid_y(y, warning=False, throw=False, name=None):
- """
- is_valid_y(y)
-
- Returns True if the variable y passed is a valid condensed
- distance matrix. Condensed distance matrices must be
- 1-dimensional numpy arrays containing doubles. Their length
- must be a binomial coefficient {n \choose 2} for some positive
- integer n.
-
- is_valid_y(..., warning=True, name='V')
-
- Invokes a warning if the variable passed is not a valid condensed distance
- matrix. The warning message explains why the distance matrix is not valid.
- 'name' is used when referencing the offending variable.
-
- is_valid_y(..., throw=True, name='V')
-
- Throws an exception if the variable passed is not a valid condensed distance
- matrix. The message explains why variable is not valid. 'name' is used when
- referencing the offending variable.
-
- """
- y = np.asarray(y)
- valid = True
- try:
- if type(y) != np.ndarray:
- if name:
- raise TypeError('\'%s\' passed as a condensed distance matrix is not a numpy array.' % name)
- else:
- raise TypeError('Variable is not a numpy array.')
- if y.dtype != np.double:
- if name:
- raise TypeError('Condensed distance matrix \'%s\' must contain doubles (double).' % name)
- else:
- raise TypeError('Condensed distance matrix must contain doubles (double).')
- if len(y.shape) != 1:
- if name:
- raise ValueError('Condensed distance matrix \'%s\' must have shape=1 (i.e. be one-dimensional).' % name)
- else:
- raise ValueError('Condensed distance matrix must have shape=1 (i.e. be one-dimensional).')
- n = y.shape[0]
- d = int(np.ceil(np.sqrt(n * 2)))
- if (d*(d-1)/2) != n:
- if name:
- raise ValueError('Length n of condensed distance matrix \'%s\' must be a binomial coefficient, i.e. there must be a k such that (k \choose 2)=n)!' % name)
- else:
- raise ValueError('Length n of condensed distance matrix must be a binomial coefficient, i.e. there must be a k such that (k \choose 2)=n)!')
- except Exception, e:
- if throw:
- raise
- if warning:
- _warning(str(e))
- valid = False
- return valid
-
-
-def is_valid_dm(D, tol=0.0, throw=False, name="D"):
- """
- is_valid_dm(D)
-
- Returns True if the variable D passed is a valid distance matrix.
- Distance matrices must be 2-dimensional numpy arrays containing
- doubles. They must have a zero-diagonal, and they must be symmetric.
-
- is_valid_dm(D, tol)
-
- Returns True if the variable D passed is a valid distance matrix.
- Small numerical differences in D and D.T and non-zeroness of the
- diagonal are ignored if they are within the tolerance specified
- by tol.
-
- is_valid_dm(..., warning=True, name='V')
-
- Invokes a warning if the variable passed is not a valid distance matrix.
- The warning message explains why the distance matrix is not valid. 'name'
- is used when referencing the offending variable.
-
- is_valid_dm(..., throw=True, name='V')
-
- Throws an exception if the varible passed is not valid. The message
- explains why the variable is not valid. 'name' is used when referencing
- the offending variable.
-
- """
- D = np.asarray(D)
- valid = True
- try:
- if type(D) != np.ndarray:
- if name:
- raise TypeError('\'%s\' passed as a distance matrix is not a numpy array.' % name)
- else:
- raise TypeError('Variable is not a numpy array.')
- s = D.shape
- if D.dtype != np.double:
- if name:
- raise TypeError('Distance matrix \'%s\' must contain doubles (double).' % name)
- else:
- raise TypeError('Distance matrix must contain doubles (double).')
- if len(D.shape) != 2:
- if name:
- raise ValueError('Distance matrix \'%s\' must have shape=2 (i.e. be two-dimensional).' % name)
- else:
- raise ValueError('Distance matrix must have shape=2 (i.e. be two-dimensional).')
- if tol == 0.0:
- if not (D == D.T).all():
- if name:
- raise ValueError('Distance matrix \'%s\' must be symmetric.' % name)
- else:
- raise ValueError('Distance matrix must be symmetric.')
- if not (D[xrange(0, s[0]), xrange(0, s[0])] == 0).all():
- if name:
- raise ValueError('Distance matrix \'%s\' diagonal must be zero.' % name)
- else:
- raise ValueError('Distance matrix diagonal must be zero.')
- else:
- if not (D - D.T <= tol).all():
- if name:
- raise ValueError('Distance matrix \'%s\' must be symmetric within tolerance %d.' % (name, tol))
- else:
- raise ValueError('Distance matrix must be symmetric within tolerance %5.5f.' % tol)
- if not (D[xrange(0, s[0]), xrange(0, s[0])] <= tol).all():
- if name:
- raise ValueError('Distance matrix \'%s\' diagonal must be close to zero within tolerance %5.5f.' % (name, tol))
- else:
- raise ValueError('Distance matrix \'%s\' diagonal must be close to zero within tolerance %5.5f.' % tol)
- except Exception, e:
- if throw:
- raise
- if warning:
- _warning(str(e))
- valid = False
- return valid
-
def numobs_linkage(Z):
"""
Returns the number of original observations that correspond to a
@@ -1178,29 +946,6 @@
is_valid_linkage(Z, throw=True, name='Z')
return (Z.shape[0] + 1)
-def numobs_dm(D):
- """
- numobs_dm(D)
-
- Returns the number of original observations that correspond to a
- square, non-condensed distance matrix D.
- """
- D = np.asarray(D)
- is_valid_dm(D, tol=np.inf, throw=True, name='D')
- return D.shape[0]
-
-def numobs_y(Y):
- """
- numobs_y(Y)
-
- Returns the number of original observations that correspond to a
- condensed distance matrix Y.
- """
- Y = np.asarray(Y)
- is_valid_y(Y, throw=True, name='Y')
- d = int(np.ceil(np.sqrt(Y.shape[0] * 2)))
- return d
-
def Z_y_correspond(Z, Y):
"""
yesno = Z_y_correspond(Z, Y)
@@ -1213,7 +958,7 @@
"""
Z = np.asarray(Z)
Y = np.asarray(Y)
- return numobs_y(Y) == numobs_Z(Z)
+ return numobs_y(Y) == numobs_linkage(Z)
def fcluster(Z, t, criterion='inconsistent', depth=2, R=None, monocrit=None):
"""
Modified: trunk/scipy/cluster/src/distance.c
===================================================================
--- trunk/scipy/cluster/src/distance.c 2008-08-23 00:42:16 UTC (rev 4665)
+++ trunk/scipy/cluster/src/distance.c 2008-08-23 05:21:05 UTC (rev 4666)
@@ -590,3 +590,29 @@
}
}
}
+
+void dist_to_squareform_from_vector(double *M, const double *v, int n) {
+ double *it;
+ const double *cit;
+ int i, j;
+ cit = v;
+ for (i = 0; i < n - 1; i++) {
+ it = M + (i * n) + i + 1;
+ for (j = i + 1; j < n; j++, it++, cit++) {
+ *it = *cit;
+ }
+ }
+}
+
+void dist_to_vector_from_squareform(const double *M, double *v, int n) {
+ double *it;
+ const double *cit;
+ int i, j;
+ it = v;
+ for (i = 0; i < n - 1; i++) {
+ cit = M + (i * n) + i + 1;
+ for (j = i + 1; j < n; j++, it++, cit++) {
+ *it = *cit;
+ }
+ }
+}
Modified: trunk/scipy/cluster/src/distance.h
===================================================================
--- trunk/scipy/cluster/src/distance.h 2008-08-23 00:42:16 UTC (rev 4665)
+++ trunk/scipy/cluster/src/distance.h 2008-08-23 05:21:05 UTC (rev 4666)
@@ -37,6 +37,8 @@
#ifndef _CPY_DISTANCE_H
#define _CPY_DISTANCE_H
+void dist_to_squareform_from_vector(double *M, const double *v, int n);
+void dist_to_vector_from_squareform(const double *M, double *v, int n);
void pdist_euclidean(const double *X, double *dm, int m, int n);
void pdist_seuclidean(const double *X,
const double *var, double *dm, int m, int n);
Modified: trunk/scipy/cluster/src/distance_wrap.c
===================================================================
--- trunk/scipy/cluster/src/distance_wrap.c 2008-08-23 00:42:16 UTC (rev 4665)
+++ trunk/scipy/cluster/src/distance_wrap.c 2008-08-23 05:21:05 UTC (rev 4666)
@@ -493,7 +493,45 @@
return Py_BuildValue("");
}
+extern PyObject *to_squareform_from_vector_wrap(PyObject *self, PyObject *args) {
+ PyArrayObject *M_, *v_;
+ int n;
+ const double *v;
+ double *M;
+ if (!PyArg_ParseTuple(args, "O!O!",
+ &PyArray_Type, &M_,
+ &PyArray_Type, &v_)) {
+ return 0;
+ }
+ else {
+ M = (double*)M_->data;
+ v = (const double*)v_->data;
+ n = M_->dimensions[0];
+ dist_to_squareform_from_vector(M, v, n);
+ }
+ return Py_BuildValue("d", 0.0);
+}
+extern PyObject *to_vector_from_squareform_wrap(PyObject *self, PyObject *args) {
+ PyArrayObject *M_, *v_;
+ int n;
+ double *v;
+ const double *M;
+ if (!PyArg_ParseTuple(args, "O!O!",
+ &PyArray_Type, &M_,
+ &PyArray_Type, &v_)) {
+ return 0;
+ }
+ else {
+ M = (const double*)M_->data;
+ v = (double*)v_->data;
+ n = M_->dimensions[0];
+ dist_to_vector_from_squareform(M, v, n);
+ }
+ return Py_BuildValue("d", 0.0);
+}
+
+
static PyMethodDef _distanceWrapMethods[] = {
{"pdist_bray_curtis_wrap", pdist_bray_curtis_wrap, METH_VARARGS},
{"pdist_canberra_wrap", pdist_canberra_wrap, METH_VARARGS},
@@ -516,6 +554,10 @@
{"pdist_sokalmichener_bool_wrap", pdist_sokalmichener_bool_wrap, METH_VARARGS},
{"pdist_sokalsneath_bool_wrap", pdist_sokalsneath_bool_wrap, METH_VARARGS},
{"pdist_yule_bool_wrap", pdist_yule_bool_wrap, METH_VARARGS},
+ {"to_squareform_from_vector_wrap",
+ to_squareform_from_vector_wrap, METH_VARARGS},
+ {"to_vector_from_squareform_wrap",
+ to_vector_from_squareform_wrap, METH_VARARGS},
{NULL, NULL} /* Sentinel - marks the end of this structure */
};
Modified: trunk/scipy/cluster/src/hierarchy.c
===================================================================
--- trunk/scipy/cluster/src/hierarchy.c 2008-08-23 00:42:16 UTC (rev 4665)
+++ trunk/scipy/cluster/src/hierarchy.c 2008-08-23 05:21:05 UTC (rev 4666)
@@ -871,32 +871,6 @@
free(centroids);
}
-void dist_to_squareform_from_vector(double *M, const double *v, int n) {
- double *it;
- const double *cit;
- int i, j;
- cit = v;
- for (i = 0; i < n - 1; i++) {
- it = M + (i * n) + i + 1;
- for (j = i + 1; j < n; j++, it++, cit++) {
- *it = *cit;
- }
- }
-}
-
-void dist_to_vector_from_squareform(const double *M, double *v, int n) {
- double *it;
- const double *cit;
- int i, j;
- it = v;
- for (i = 0; i < n - 1; i++) {
- cit = M + (i * n) + i + 1;
- for (j = i + 1; j < n; j++, it++, cit++) {
- *it = *cit;
- }
- }
-}
-
void cpy_to_tree(const double *Z, cnode **tnodes, int n) {
const double *row;
cnode *node;
Modified: trunk/scipy/cluster/src/hierarchy.h
===================================================================
--- trunk/scipy/cluster/src/hierarchy.h 2008-08-23 00:42:16 UTC (rev 4665)
+++ trunk/scipy/cluster/src/hierarchy.h 2008-08-23 05:21:05 UTC (rev 4666)
@@ -86,9 +86,6 @@
typedef void (distfunc) (cinfo *info, int mini, int minj, int np, int n);
-void dist_to_squareform_from_vector(double *M, const double *v, int n);
-void dist_to_vector_from_squareform(const double *M, double *v, int n);
-
void inconsistency_calculation(const double *Z, double *R, int n, int d);
void inconsistency_calculation_alt(const double *Z, double *R, int n, int d);
Modified: trunk/scipy/cluster/src/hierarchy_wrap.c
===================================================================
--- trunk/scipy/cluster/src/hierarchy_wrap.c 2008-08-23 00:42:16 UTC (rev 4665)
+++ trunk/scipy/cluster/src/hierarchy_wrap.c 2008-08-23 05:21:05 UTC (rev 4666)
@@ -304,7 +304,6 @@
return Py_BuildValue("d", 0.0);
}
-
extern PyObject *chopmin_ns_i_wrap(PyObject *self, PyObject *args) {
int mini, n;
PyArrayObject *row;
@@ -332,44 +331,7 @@
return Py_BuildValue("d", 0.0);
}
-extern PyObject *to_squareform_from_vector_wrap(PyObject *self, PyObject *args) {
- PyArrayObject *M_, *v_;
- int n;
- const double *v;
- double *M;
- if (!PyArg_ParseTuple(args, "O!O!",
- &PyArray_Type, &M_,
- &PyArray_Type, &v_)) {
- return 0;
- }
- else {
- M = (double*)M_->data;
- v = (const double*)v_->data;
- n = M_->dimensions[0];
- dist_to_squareform_from_vector(M, v, n);
- }
- return Py_BuildValue("d", 0.0);
-}
-extern PyObject *to_vector_from_squareform_wrap(PyObject *self, PyObject *args) {
- PyArrayObject *M_, *v_;
- int n;
- double *v;
- const double *M;
- if (!PyArg_ParseTuple(args, "O!O!",
- &PyArray_Type, &M_,
- &PyArray_Type, &v_)) {
- return 0;
- }
- else {
- M = (const double*)M_->data;
- v = (double*)v_->data;
- n = M_->dimensions[0];
- dist_to_vector_from_squareform(M, v, n);
- }
- return Py_BuildValue("d", 0.0);
-}
-
extern PyObject *leaders_wrap(PyObject *self, PyObject *args) {
PyArrayObject *Z_, *T_, *L_, *M_;
int kk, n, res;
@@ -408,10 +370,6 @@
{"linkage_euclid_wrap", linkage_euclid_wrap, METH_VARARGS},
{"linkage_wrap", linkage_wrap, METH_VARARGS},
{"prelist_wrap", prelist_wrap, METH_VARARGS},
- {"to_squareform_from_vector_wrap",
- to_squareform_from_vector_wrap, METH_VARARGS},
- {"to_vector_from_squareform_wrap",
- to_vector_from_squareform_wrap, METH_VARARGS},
{NULL, NULL} /* Sentinel - marks the end of this structure */
};
Modified: trunk/scipy/cluster/tests/test_distance.py
===================================================================
--- trunk/scipy/cluster/tests/test_distance.py 2008-08-23 00:42:16 UTC (rev 4665)
+++ trunk/scipy/cluster/tests/test_distance.py 2008-08-23 05:21:05 UTC (rev 4666)
@@ -39,8 +39,8 @@
import numpy as np
from numpy.testing import *
-from scipy.cluster.hierarchy import squareform, linkage, from_mlab_linkage, numobs_dm, numobs_y, numobs_linkage
-from scipy.cluster.distance import pdist, matching, jaccard, dice, sokalsneath, rogerstanimoto, russellrao, yule
+from scipy.cluster.hierarchy import linkage, from_mlab_linkage, numobs_dm, numobs_y, numobs_linkage
+from scipy.cluster.distance import squareform, pdist, matching, jaccard, dice, sokalsneath, rogerstanimoto, russellrao, yule
#from scipy.cluster.hierarchy import pdist, euclidean
@@ -946,3 +946,69 @@
def within_tol(a, b, tol):
return np.abs(a - b).max() < tol
+
+
+class TestSquareForm(TestCase):
+
+ ################### squareform
+ def test_squareform_empty_matrix(self):
+ "Tests squareform on an empty matrix."
+ A = np.zeros((0,0))
+ rA = squareform(np.array(A, dtype='double'))
+ self.failUnless(rA.shape == (0,))
+
+ def test_squareform_empty_vector(self):
+ v = np.zeros((0,))
+ rv = squareform(np.array(v, dtype='double'))
+ self.failUnless(rv.shape == (1,1))
+ self.failUnless(rv[0, 0] == 0)
+
+ def test_squareform_1by1_matrix(self):
+ "Tests squareform on a 1x1 matrix."
+ A = np.zeros((1,1))
+ rA = squareform(np.array(A, dtype='double'))
+ self.failUnless(rA.shape == (0,))
+
+ def test_squareform_one_vector(self):
+ "Tests squareform on a 1-D array, length=1."
+ v = np.ones((1,)) * 8.3
+ rv = squareform(np.array(v, dtype='double'))
+ self.failUnless(rv.shape == (2,2))
+ self.failUnless(rv[0,1] == 8.3)
+ self.failUnless(rv[1,0] == 8.3)
+
+ def test_squareform_2by2_matrix(self):
+ "Tests squareform on a 2x2 matrix."
+ A = np.zeros((2,2))
+ A[0,1]=0.8
+ A[1,0]=0.8
+ rA = squareform(np.array(A, dtype='double'))
+ self.failUnless(rA.shape == (1,))
+ self.failUnless(rA[0] == 0.8)
+
+ def test_squareform_multi_matrix(self):
+ "Tests squareform on a square matrices of multiple sizes."
+ for n in xrange(2, 5):
+ yield self.check_squareform_multi_matrix(n)
+
+ def check_squareform_multi_matrix(self, n):
+ X = np.random.rand(n, 4)
+ Y = pdist(X)
+ self.failUnless(len(Y.shape) == 1)
+ A = squareform(Y)
+ Yr = squareform(A)
+ s = A.shape
+ k = 0
+ if verbose >= 3:
+ print A.shape, Y.shape, Yr.shape
+ self.failUnless(len(s) == 2)
+ self.failUnless(len(Yr.shape) == 1)
+ self.failUnless(s[0] == s[1])
+ for i in xrange(0, s[0]):
+ for j in xrange(i+1, s[1]):
+ if i != j:
+ #print i, j, k, A[i, j], Y[k]
+ self.failUnless(A[i, j] == Y[k])
+ k += 1
+ else:
+ self.failUnless(A[i, j] == 0)
Modified: trunk/scipy/cluster/tests/test_hierarchy.py
===================================================================
--- trunk/scipy/cluster/tests/test_hierarchy.py 2008-08-23 00:42:16 UTC (rev 4665)
+++ trunk/scipy/cluster/tests/test_hierarchy.py 2008-08-23 05:21:05 UTC (rev 4666)
@@ -39,8 +39,8 @@
import numpy as np
from numpy.testing import *
-from scipy.cluster.hierarchy import squareform, linkage, from_mlab_linkage, numobs_dm, numobs_y, numobs_linkage, inconsistent
-from scipy.cluster.distance import pdist, matching, jaccard, dice, sokalsneath, rogerstanimoto, russellrao, yule
+from scipy.cluster.hierarchy import linkage, from_mlab_linkage, numobs_dm, numobs_y, numobs_linkage, inconsistent
+from scipy.cluster.distance import squareform, pdist, matching, jaccard, dice, sokalsneath, rogerstanimoto, russellrao, yule
_tdist = np.array([[0, 662, 877, 255, 412, 996],
[662, 0, 295, 468, 268, 400],
@@ -86,71 +86,6 @@
load_testing_files()
-class TestSquareForm(TestCase):
-
- ################### squareform
- def test_squareform_empty_matrix(self):
- "Tests squareform on an empty matrix."
- A = np.zeros((0,0))
- rA = squareform(np.array(A, dtype='double'))
- self.failUnless(rA.shape == (0,))
-
- def test_squareform_empty_vector(self):
- v = np.zeros((0,))
- rv = squareform(np.array(v, dtype='double'))
- self.failUnless(rv.shape == (1,1))
- self.failUnless(rv[0, 0] == 0)
-
- def test_squareform_1by1_matrix(self):
- "Tests squareform on a 1x1 matrix."
- A = np.zeros((1,1))
- rA = squareform(np.array(A, dtype='double'))
- self.failUnless(rA.shape == (0,))
-
- def test_squareform_one_vector(self):
- "Tests squareform on a 1-D array, length=1."
- v = np.ones((1,)) * 8.3
- rv = squareform(np.array(v, dtype='double'))
- self.failUnless(rv.shape == (2,2))
- self.failUnless(rv[0,1] == 8.3)
- self.failUnless(rv[1,0] == 8.3)
-
- def test_squareform_2by2_matrix(self):
- "Tests squareform on a 2x2 matrix."
- A = np.zeros((2,2))
- A[0,1]=0.8
- A[1,0]=0.8
- rA = squareform(np.array(A, dtype='double'))
- self.failUnless(rA.shape == (1,))
- self.failUnless(rA[0] == 0.8)
-
- def test_squareform_multi_matrix(self):
- "Tests squareform on a square matrices of multiple sizes."
- for n in xrange(2, 5):
- yield self.check_squareform_multi_matrix(n)
-
- def check_squareform_multi_matrix(self, n):
- X = np.random.rand(n, 4)
- Y = pdist(X)
- self.failUnless(len(Y.shape) == 1)
- A = squareform(Y)
- Yr = squareform(A)
- s = A.shape
- k = 0
- if verbose >= 3:
- print A.shape, Y.shape, Yr.shape
- self.failUnless(len(s) == 2)
- self.failUnless(len(Yr.shape) == 1)
- self.failUnless(s[0] == s[1])
- for i in xrange(0, s[0]):
- for j in xrange(i+1, s[1]):
- if i != j:
- #print i, j, k, A[i, j], Y[k]
- self.failUnless(A[i, j] == Y[k])
- k += 1
- else:
- self.failUnless(A[i, j] == 0)
-
class TestNumObs(TestCase):
############## numobs_dm
More information about the Scipy-svn
mailing list