[Scipy-svn] r2943 - trunk/Lib/cluster
scipy-svn at scipy.org
scipy-svn at scipy.org
Thu Apr 26 06:01:41 EDT 2007
Author: cdavid
Date: 2007-04-26 05:01:36 -0500 (Thu, 26 Apr 2007)
New Revision: 2943
Modified:
trunk/Lib/cluster/vq.py
Log:
Convert docstrings to new numpy format
Modified: trunk/Lib/cluster/vq.py
===================================================================
--- trunk/Lib/cluster/vq.py 2007-04-26 08:56:17 UTC (rev 2942)
+++ trunk/Lib/cluster/vq.py 2007-04-26 10:01:36 UTC (rev 2943)
@@ -15,8 +15,11 @@
Train a codebook for minimum distortion using the kmeans algorithm
"""
+__docformat__ = 'restructuredtext'
+
__all__ = ['whiten', 'vq', 'kmeans']
+
from numpy.random import randint
from numpy import shape, zeros, sqrt, argmin, minimum, array, \
newaxis, arange, compress, equal, common_type, single, double, take, \
@@ -24,47 +27,44 @@
import numpy as N
def whiten(obs):
- """ Normalize a group of observations on a per feature basis
+ """ Normalize a group of observations on a per feature basis.
- Description
+ Before running kmeans algorithms, it is beneficial to "whiten", or
+ scale, the observation data on a per feature basis. This is done
+ by dividing each feature by its standard deviation across all
+ observations.
- Before running kmeans algorithms, it is beneficial to "whiten", or
- scale, the observation data on a per feature basis. This is done
- by dividing each feature by its standard deviation across all
- observations.
+ :Parameters:
+ obs : ndarray
+ Each row of the array is an observation. The
+ columns are the "features" seen during each observation
+ ::
- Arguments
+ # f0 f1 f2
+ obs = [[ 1., 1., 1.], #o0
+ [ 2., 2., 2.], #o1
+ [ 3., 3., 3.], #o2
+ [ 4., 4., 4.]]) #o3
- obs -- 2D array.
- Each row of the array is an observation. The
- columns are the "features" seen during each observation
-
- # f0 f1 f2
- obs = [[ 1., 1., 1.], #o0
- [ 2., 2., 2.], #o1
- [ 3., 3., 3.], #o2
- [ 4., 4., 4.]]) #o3
-
XXX perhaps should have an axis variable here.
- Outputs
+ :Returns:
+ result : ndarray
+ Contains the values in obs scaled by the standard deviation
+ of each column.
- result -- 2D array.
- Contains the values in obs scaled by the standard devation
- of each column.
+ Examples
+ --------
- Test
-
- >>> from numpy import array
- >>> from scipy.cluster.vq import whiten
- >>> features = array([[ 1.9,2.3,1.7],
- ... [ 1.5,2.5,2.2],
- ... [ 0.8,0.6,1.7,]])
- >>> whiten(features)
- array([[ 3.41250074, 2.20300046, 5.88897275],
- [ 2.69407953, 2.39456571, 7.62102355],
- [ 1.43684242, 0.57469577, 5.88897275]])
-
+ >>> from numpy import array
+ >>> from scipy.cluster.vq import whiten
+ >>> features = array([[ 1.9,2.3,1.7],
+ ... [ 1.5,2.5,2.2],
+ ... [ 0.8,0.6,1.7,]])
+ >>> whiten(features)
+ array([[ 3.41250074, 2.20300046, 5.88897275],
+ [ 2.69407953, 2.39456571, 7.62102355],
+ [ 1.43684242, 0.57469577, 5.88897275]])
"""
std_dev = std(obs, axis=0)
return obs / std_dev
@@ -72,57 +72,56 @@
def vq(obs, code_book):
""" Vector Quantization: assign features sets to codes in a code book.
- Description:
- Vector quantization determines which code in the code book best
- represents an observation of a target. The features of each
- observation are compared to each code in the book, and assigned
- the one closest to it. The observations are contained in the obs
- array. These features should be "whitened," or nomalized by the
- standard deviation of all the features before being quantized.
- The code book can be created using the kmeans algorithm or
- something similar.
+ Vector quantization determines which code in the code book best represents
+ an observation of a target. The features of each observation are compared
+ to each code in the book, and assigned the one closest to it. The
+ observations are contained in the obs array. These features should be
+ "whitened," or normalized by the standard deviation of all the features
+ before being quantized. The code book can be created using the kmeans
+ algorithm or something similar.
- Note:
- This currently forces 32 bit math precision for speed. Anyone know
- of a situation where this undermines the accuracy of the algorithm?
+ :Parameters:
+ obs : ndarray
+ Each row of the array is an observation. The columns are the
+ "features" seen during each observation. The features must be
+ whitened first using the whiten function or something equivalent.
+ code_book : ndarray.
+ The code book is usually generated using the kmeans algorithm.
+ Each row of the array holds a different code, and the columns are
+ the features of the code.
+ ::
- Arguments:
- obs -- 2D array.
- Each row of the array is an observation. The
- columns are the "features" seen during each observation
- The features must be whitened first using the
- whiten function or something equivalent.
- code_book -- 2D array.
- The code book is usually generated using the kmeans
- algorithm. Each row of the array holds a different
- code, and the columns are the features of the code.
- # f0 f1 f2 f3
- code_book = [[ 1., 2., 3., 4.], #c0
- [ 1., 2., 3., 4.], #c1
- [ 1., 2., 3., 4.]]) #c2
- Outputs:
- code -- 1D array.
- If obs is a NxM array, then a length N array
- is returned that holds the selected code book index for
- each observation.
- dist -- 1D array.
- The distortion (distance) between the observation and
- its nearest code
- Reference
+ # f0 f1 f2 f3
+ code_book = [[ 1., 2., 3., 4.], #c0
+ [ 1., 2., 3., 4.], #c1
+ [ 1., 2., 3., 4.]]) #c2
- Test
+ :Returns:
+ code : ndarray
+ If obs is a NxM array, then a length N array is returned that holds
+ the selected code book index for each observation.
+ dist : ndarray
+ The distortion (distance) between the observation and its nearest
+ code
- >>> from numpy import array
- >>> from scipy.cluster.vq import vq
- >>> code_book = array([[1.,1.,1.],
- ... [2.,2.,2.]])
- >>> features = array([[ 1.9,2.3,1.7],
- ... [ 1.5,2.5,2.2],
- ... [ 0.8,0.6,1.7]])
- >>> vq(features,code_book)
- (array([1, 1, 0],'i'), array([ 0.43588989, 0.73484692, 0.83066239]))
+ Notes
+ -----
+ This currently forces 32 bit math precision for speed. Anyone know
+ of a situation where this undermines the accuracy of the algorithm?
+ Examples
+ --------
+ >>> from numpy import array
+ >>> from scipy.cluster.vq import vq
+ >>> code_book = array([[1.,1.,1.],
+ ... [2.,2.,2.]])
+ >>> features = array([[ 1.9,2.3,1.7],
+ ... [ 1.5,2.5,2.2],
+ ... [ 0.8,0.6,1.7]])
+ >>> vq(features,code_book)
+ (array([1, 1, 0],'i'), array([ 0.43588989, 0.73484692, 0.83066239]))
+
"""
try:
import _vq
@@ -225,32 +224,36 @@
return code, min_dist
def kmeans_(obs, guess, thresh=1e-5):
- """ See kmeans
+ """ "raw" version of kmeans.
- Outputs
+ :Returns:
+ code_book :
+ the lowest distortion codebook found.
+ avg_dist :
+ the average distance an observation is from a code in the book.
+ Lower means the code_book matches the data better.
- code_book -- the lowest distortion codebook found.
- avg_dist -- the average distance a observation is
- from a code in the book. Lower means
- the code_book matches the data better.
+ :SeeAlso:
+ - kmeans : wrapper around kmeans
XXX should have an axis variable here.
- Test
+ Examples
+ --------
- Note: not whitened in this example.
+ Note: not whitened in this example.
- >>> from numpy import array
- >>> from scipy.cluster.vq import kmeans_
- >>> features = array([[ 1.9,2.3],
- ... [ 1.5,2.5],
- ... [ 0.8,0.6],
- ... [ 0.4,1.8],
- ... [ 1.0,1.0]])
- >>> book = array((features[0],features[2]))
- >>> kmeans_(features,book)
- (array([[ 1.7 , 2.4 ],
- [ 0.73333333, 1.13333333]]), 0.40563916697728591)
+ >>> from numpy import array
+ >>> from scipy.cluster.vq import kmeans_
+ >>> features = array([[ 1.9,2.3],
+ ... [ 1.5,2.5],
+ ... [ 0.8,0.6],
+ ... [ 0.4,1.8],
+ ... [ 1.0,1.0]])
+ >>> book = array((features[0],features[2]))
+ >>> kmeans_(features,book)
+ (array([[ 1.7 , 2.4 ],
+ [ 0.73333333, 1.13333333]]), 0.40563916697728591)
"""
@@ -278,67 +281,61 @@
return code_book, avg_dist[-1]
def kmeans(obs, k_or_guess, iter=20, thresh=1e-5):
- """ Generate a code book with minimum distortion
+ """ Generate a code book with minimum distortion.
- Description
-
- Arguments
-
- obs -- 2D array
- Each row of the array is an observation. The
- columns are the "features" seen during each observation
- The features must be whitened first using the
- whiten function or something equivalent.
- k_or_guess -- integer or 2D array.
- If integer, it is the number of code book elements.
- If a 2D array, the array is used as the intial guess for
- the code book. The array should have k rows, and the
- same number of columns (features) as the obs array.
- iter -- integer.
- The number of times to restart the kmeans algorithm with
- a new initial guess. If k_or_guess is a 2D array (codebook),
- this argument is ignored and only 1 iteration is run.
- thresh -- float
- Terminate each kmeans run when the distortion change from
- one iteration to the next is less than this value.
- Outputs
-
- codesbook -- 2D array.
+ :Parameters:
+ obs : ndarray
+ Each row of the array is an observation. The columns are the
+ "features" seen during each observation. The features must be
+ whitened first using the whiten function or something equivalent.
+ k_or_guess : int or ndarray
+ If integer, it is the number of code book elements. If a 2D array,
+ the array is used as the initial guess for the code book. The array
+ should have k rows, and the same number of columns (features) as
+ the obs array.
+ iter : int
+ The number of times to restart the kmeans algorithm with a new
+ initial guess. If k_or_guess is a 2D array (codebook), this
+ argument is ignored and only 1 iteration is run.
+ thresh : float
+ Terminate each kmeans run when the distortion change from one
+ iteration to the next is less than this value.
+ :Returns:
+ codebook : ndarray
The codes that best fit the observation
- distortion -- float
+ distortion : float
The distortion between the observations and the codes.
- Reference
+ Examples
+ --------
- Test
+ ("Not checked carefully for accuracy..." he said sheepishly)
- ("Not checked carefully for accuracy..." he said sheepishly)
+ >>> from numpy import array
+ >>> from scipy.cluster.vq import vq, kmeans
+ >>> features = array([[ 1.9,2.3],
+ ... [ 1.5,2.5],
+ ... [ 0.8,0.6],
+ ... [ 0.4,1.8],
+ ... [ 0.1,0.1],
+ ... [ 0.2,1.8],
+ ... [ 2.0,0.5],
+ ... [ 0.3,1.5],
+ ... [ 1.0,1.0]])
+ >>> whitened = whiten(features)
+ >>> book = array((whitened[0],whitened[2]))
+ >>> kmeans(whitened,book)
+ (array([[ 2.3110306 , 2.86287398],
+ [ 0.93218041, 1.24398691]]), 0.85684700941625547)
- >>> from numpy import array
- >>> from scipy.cluster.vq import vq, kmeans
- >>> features = array([[ 1.9,2.3],
- ... [ 1.5,2.5],
- ... [ 0.8,0.6],
- ... [ 0.4,1.8],
- ... [ 0.1,0.1],
- ... [ 0.2,1.8],
- ... [ 2.0,0.5],
- ... [ 0.3,1.5],
- ... [ 1.0,1.0]])
- >>> whitened = whiten(features)
- >>> book = array((whitened[0],whitened[2]))
- >>> kmeans(whitened,book)
- (array([[ 2.3110306 , 2.86287398],
- [ 0.93218041, 1.24398691]]), 0.85684700941625547)
+ >>> import RandomArray
+ >>> RandomArray.seed(1000,2000)
+ >>> codes = 3
+ >>> kmeans(whitened,codes)
+ (array([[ 2.3110306 , 2.86287398],
+ [ 1.32544402, 0.65607529],
+ [ 0.40782893, 2.02786907]]), 0.5196582527686241)
- >>> import RandomArray
- >>> RandomArray.seed(1000,2000)
- >>> codes = 3
- >>> kmeans(whitened,codes)
- (array([[ 2.3110306 , 2.86287398],
- [ 1.32544402, 0.65607529],
- [ 0.40782893, 2.02786907]]), 0.5196582527686241)
-
"""
if int(iter) < 1:
raise ValueError, 'iter must be >= to 1.'
More information about the Scipy-svn
mailing list