[Scipy-svn] r2285 - in trunk/Lib/sandbox/pyem: . profile_data pyem pyem/profile_data pyem/src pyem/tests src tests

Thu Oct 12 09:49:18 EDT 2006

Author: cdavid
Date: 2006-10-12 08:48:58 -0500 (Thu, 12 Oct 2006)
New Revision: 2285

Added:
   trunk/Lib/sandbox/pyem/__init__.py
   trunk/Lib/sandbox/pyem/_c_densities.py
   trunk/Lib/sandbox/pyem/densities.py
   trunk/Lib/sandbox/pyem/gauss_mix.py
   trunk/Lib/sandbox/pyem/gmm_em.py
   trunk/Lib/sandbox/pyem/info.py
   trunk/Lib/sandbox/pyem/kmean.py
   trunk/Lib/sandbox/pyem/online_em.py
   trunk/Lib/sandbox/pyem/profile_data/
   trunk/Lib/sandbox/pyem/profile_data/profile_densities.py
   trunk/Lib/sandbox/pyem/profile_data/profile_gmm.py
   trunk/Lib/sandbox/pyem/setup.py
   trunk/Lib/sandbox/pyem/src/
   trunk/Lib/sandbox/pyem/src/Makefile
   trunk/Lib/sandbox/pyem/src/c_gden.c
   trunk/Lib/sandbox/pyem/src/c_gmm.pyx
   trunk/Lib/sandbox/pyem/src/c_numpy.pxd
   trunk/Lib/sandbox/pyem/src/c_python.pxd
   trunk/Lib/sandbox/pyem/tests/
   trunk/Lib/sandbox/pyem/tests/test_densities.py
   trunk/Lib/sandbox/pyem/tests/test_kmean.py
Removed:
   trunk/Lib/sandbox/pyem/count.sh
   trunk/Lib/sandbox/pyem/pyem/__init__.py
   trunk/Lib/sandbox/pyem/pyem/_c_densities.py
   trunk/Lib/sandbox/pyem/pyem/densities.py
   trunk/Lib/sandbox/pyem/pyem/gauss_mix.py
   trunk/Lib/sandbox/pyem/pyem/gmm_em.py
   trunk/Lib/sandbox/pyem/pyem/info.py
   trunk/Lib/sandbox/pyem/pyem/kmean.py
   trunk/Lib/sandbox/pyem/pyem/online_em.py
   trunk/Lib/sandbox/pyem/pyem/profile_data/profile_densities.py
   trunk/Lib/sandbox/pyem/pyem/profile_data/profile_gmm.py
   trunk/Lib/sandbox/pyem/pyem/setup.py
   trunk/Lib/sandbox/pyem/pyem/src/Makefile
   trunk/Lib/sandbox/pyem/pyem/src/c_gden.c
   trunk/Lib/sandbox/pyem/pyem/src/c_gmm.pyx
   trunk/Lib/sandbox/pyem/pyem/src/c_numpy.pxd
   trunk/Lib/sandbox/pyem/pyem/src/c_python.pxd
   trunk/Lib/sandbox/pyem/pyem/tests/test_densities.py
   trunk/Lib/sandbox/pyem/pyem/tests/test_kmean.py
Log:
[pyem @ david at ar.media.kyoto-u.ac.jp-20061012122133-98999563270b6770]
Change of layout for scipy (2)
David Cournapeau <david at ar.media.kyoto-u.ac.jp> | 2006-10-12 21:21:33 +0900 (Thu, 12 Oct 2006)

Added: trunk/Lib/sandbox/pyem/__init__.py
===================================================================

--- trunk/Lib/sandbox/pyem/__init__.py	2006-10-12 13:48:48 UTC (rev 2284)
+++ trunk/Lib/sandbox/pyem/__init__.py	2006-10-12 13:48:58 UTC (rev 2285)
@@ -0,0 +1,12 @@
+#! /usr/bin/env python
+# Last Change: Tue Oct 03 06:00 PM 2006 J
+
+from info import __doc__
+
+from numpy.testing import NumpyTest
+test = NumpyTest().test
+
+version = '0.5.3'
+
+from gauss_mix import GmParamError, GM
+from gmm_em import GmmParamError, GMM, EM

Added: trunk/Lib/sandbox/pyem/_c_densities.py
===================================================================
--- trunk/Lib/sandbox/pyem/_c_densities.py	2006-10-12 13:48:48 UTC (rev 2284)
+++ trunk/Lib/sandbox/pyem/_c_densities.py	2006-10-12 13:48:58 UTC (rev 2285)
@@ -0,0 +1,217 @@
+#! /usr/bin/python
+#
+# Copyrighted David Cournapeau
+# Last Change: Tue Oct 03 05:00 PM 2006 J
+
+# This module uses a C implementation through ctypes, for diagonal cases
+# TODO:
+#   - portable way to find/open the shared library
+#   - full cov matrice
+
+import numpy as N
+import numpy.linalg as lin
+from numpy.random import randn
+from scipy.stats import chi2
+import densities as D
+
+import ctypes
+from ctypes import cdll, c_uint, c_int, c_double, POINTER
+from numpy.ctypeslib import ndpointer, load_library
+
+ctypes_major    = int(ctypes.__version__.split('.')[0])
+if ctypes_major < 1:
+    msg =  "version of ctypes is %s, expected at least %s" \
+            % (ctypes.__version__, '1.0.0')
+    raise Exception(msg)
+
+# Requirements for diag gden
+_gden   = load_library('c_gden.so', __file__)
+arg1    = ndpointer(dtype='<f8')
+arg2    = c_uint
+arg3    = c_uint
+arg4    = ndpointer(dtype='<f8')
+arg5    = ndpointer(dtype='<f8')
+arg6    = ndpointer(dtype='<f8')
+_gden.gden_diag.argtypes    = [arg1, arg2, arg3, arg4, arg5, arg6]
+_gden.gden_diag.restype     = c_int
+
+# Error classes
+class DenError(Exception):
+    """Base class for exceptions in this module.
+    
+    Attributes:
+        expression -- input expression in which the error occurred
+        message -- explanation of the error"""
+    def __init__(self, message):
+        self.message    = message
+    
+    def __str__(self):
+        return self.message
+
+# The following function do all the fancy stuff to check that parameters
+# are Ok, and call the right implementation if args are OK.
+def gauss_den(x, mu, va, log = False):
+    """ Compute multivariate Gaussian density at points x for 
+    mean mu and variance va.
+    
+    Vector are row vectors, except va which can be a matrix
+    (row vector variance for diagonal variance)
+    
+    If log is True, than the log density is returned 
+    (useful for underflow ?)"""
+    mu  = N.atleast_2d(mu)
+    va  = N.atleast_2d(va)
+    x   = N.atleast_2d(x)
+    
+    #=======================#
+    # Checking parameters   #
+    #=======================#
+    if len(N.shape(mu)) != 2:
+        raise DenError("mu is not rank 2")
+        
+    if len(N.shape(va)) != 2:
+        raise DenError("va is not rank 2")
+        
+    if len(N.shape(x)) != 2:
+        raise DenError("x is not rank 2")
+        
+    (n, d)      = x.shape
+    (dm0, dm1)  = mu.shape
+    (dv0, dv1)  = va.shape
+    
+    # Check x and mu same dimension
+    if dm0 != 1:
+        msg = "mean must be a row vector!"
+        raise DenError(msg)
+    if dm1 != d:
+        msg = "x and mu not same dim"
+        raise DenError(msg)
+    # Check va and mu same size
+    if dv1 != d:
+        msg = "mu and va not same dim"
+        raise DenError(msg)
+    if dv0 != 1 and dv0 != d:
+        msg = "va not square"
+        raise DenError(msg)
+
+    #===============#
+    # Computation   #
+    #===============#
+    if d == 1:
+        # scalar case
+        return _scalar_gauss_den(x[:, 0], mu[0, 0], va[0, 0], log)
+    elif dv0 == 1:
+        # Diagonal matrix case
+        return _diag_gauss_den(x, mu, va, log)
+    elif dv1 == dv0:
+        # full case
+        return  _full_gauss_den(x, mu, va, log)
+    else:
+        raise DenError("variance mode not recognized, this is a bug")
+
+# Those 3 functions do almost all the actual computation
+def _scalar_gauss_den(x, mu, va, log):
+    """ This function is the actual implementation
+    of gaussian pdf in scalar case. It assumes all args
+    are conformant, so it should not be used directly
+    
+    ** Expect centered data (ie with mean removed) **
+
+    Call gauss_den instead"""
+    d       = mu.size
+    inva    = 1/va
+    fac     = (2*N.pi) ** (-d/2.0) * N.sqrt(inva)
+    y       = ((x-mu) ** 2) * -0.5 * inva
+    if not log:
+        y   = fac * N.exp(y)
+    else:
+        y   = y + log(fac)
+
+    return y
+    
+def _diag_gauss_den(x, mu, va, log):
+    """ This function is the actual implementation
+    of gaussian pdf in scalar case. It assumes all args
+    are conformant, so it should not be used directly
+    
+    ** Expect centered data (ie with mean removed) **
+
+    Call gauss_den instead"""
+    # Diagonal matrix case
+    d   = mu.size
+    n   = x.shape[0]
+    if not log:
+        y       = N.zeros(n)
+        vat     = va.copy()
+        # _gden.gden_diag(N.require(x, requirements = 'C'), n, d, 
+        #         N.require(mu, requirements = 'C'),
+        #         N.require(inva, requirements = 'C'),
+        #         N.require(y, requirements = 'C'))
+        x       = N.require(x, requirements = 'C')
+        mu      = N.require(mu, requirements = 'C')
+        vat     = N.require(vat, requirements = 'C')
+        y       = N.require(y, requirements = 'C')
+        _gden.gden_diag(x, n, d, mu, vat, y)
+        return y
+        # _gden.gden_diag.restype     = c_int
+        # _gden.gden_diag.argtypes    = [POINTER(c_double), c_uint, c_uint,
+        #         POINTER(c_double), POINTER(c_double), POINTER(c_double)]
+
+        # y   = N.zeros(n)
+        # inva= 1/va
+        # _gden.gden_diag(x.ctypes.data_as(POINTER(c_double)),
+        #     n, d,
+        #     mu.ctypes.data_as(POINTER(c_double)),
+        #     inva.ctypes.data_as(POINTER(c_double)),
+        #     y.ctypes.data_as(POINTER(c_double)))
+    else:
+        y   = _scalar_gauss_den(x[:,0], mu[0,0], va[0,0], log)
+        for i in range(1, d):
+            y    +=  _scalar_gauss_den(x[:,i], mu[0,i], va[0,i], log)
+        return y
+
+def _full_gauss_den(x, mu, va, log):
+    """ This function is the actual implementation
+    of gaussian pdf in full matrix case. 
+    
+    It assumes all args are conformant, so it should 
+    not be used directly Call gauss_den instead
+    
+    ** Expect centered data (ie with mean removed) **
+
+    Does not check if va is definite positive (on inversible 
+    for that matter), so the inverse computation and/or determinant
+    would throw an exception."""
+    d       = mu.size
+    inva    = lin.inv(va)
+    fac     = 1 / N.sqrt( (2*N.pi) ** d * N.fabs(lin.det(va)))
+
+    # we are using a trick with sum to "emulate" 
+    # the matrix multiplication inva * x without any explicit loop
+    y   = N.dot((x-mu), inva)
+    y   = -0.5 * N.sum(y * (x-mu), 1)
+
+    if not log:
+        y   = fac * N.exp(y)
+    else:
+        y   = y + N.log(fac)
+ 
+    return y
+
+if __name__ == "__main__":
+    #=========================================
+    # Test accuracy between pure and C python
+    #=========================================
+    mu  = N.array([2.0, 3])
+    va  = N.array([5.0, 3])
+
+    # Generate a multivariate gaussian of mean mu and covariance va
+    nframes = 1e4
+    X       = randn(nframes, 2)
+    Yc      = N.dot(N.diag(N.sqrt(va)), X.transpose())
+    Yc      = Yc.transpose() + mu
+
+    Y   = D.gauss_den(Yc, mu, va)
+    Yt  = gauss_den(Yc, mu, va)
+
+    print "Diff is " + str(N.sqrt(N.sum((Y-Yt) ** 2))/nframes/2)

Deleted: trunk/Lib/sandbox/pyem/count.sh
===================================================================
--- trunk/Lib/sandbox/pyem/count.sh	2006-10-12 13:48:48 UTC (rev 2284)
+++ trunk/Lib/sandbox/pyem/count.sh	2006-10-12 13:48:58 UTC (rev 2285)
@@ -1,23 +0,0 @@
-#! /bin/sh
-# Last Change: Fri Oct 06 08:00 PM 2006 J
-
-n=0
-np=0
-nc=0
-
-files=`find . -name '*.py'`
-
-for i in $files; do
-    tp=`wc "$i" | tr -s " " | cut -f 2 -d" "`
-    let np="$np + $tp"
-done
-
-files=`find . -name '*.[ch]'`
-
-for i in $files; do
-    tp=`wc "$i" | tr -s " " | cut -f 2 -d" "`
-    let nc="$nc + $tp"
-done
-
-echo "$nc lines of C code"
-echo "$np lines of python code"

Added: trunk/Lib/sandbox/pyem/densities.py
===================================================================
--- trunk/Lib/sandbox/pyem/densities.py	2006-10-12 13:48:48 UTC (rev 2284)
+++ trunk/Lib/sandbox/pyem/densities.py	2006-10-12 13:48:58 UTC (rev 2285)
@@ -0,0 +1,257 @@
+#! /usr/bin/python
+#
+# Copyrighted David Cournapeau
+# Last Change: Thu Oct 05 07:00 PM 2006 J
+
+import numpy as N
+import numpy.linalg as lin
+from numpy.random import randn
+from scipy.stats import chi2
+
+# Error classes
+class DenError(Exception):
+    """Base class for exceptions in this module.
+    
+    Attributes:
+        expression -- input expression in which the error occurred
+        message -- explanation of the error"""
+    def __init__(self, message):
+        self.message    = message
+    
+    def __str__(self):
+        return self.message
+
+# The following function do all the fancy stuff to check that parameters
+# are Ok, and call the right implementation if args are OK.
+def gauss_den(x, mu, va, log = False):
+    """ Compute multivariate Gaussian density at points x for 
+    mean mu and variance va.
+    
+    Vector are row vectors, except va which can be a matrix
+    (row vector variance for diagonal variance)
+    
+    If log is True, than the log density is returned 
+    (useful for underflow ?)"""
+    mu  = N.atleast_2d(mu)
+    va  = N.atleast_2d(va)
+    x   = N.atleast_2d(x)
+    
+    #=======================#
+    # Checking parameters   #
+    #=======================#
+    if len(N.shape(mu)) != 2:
+        raise DenError("mu is not rank 2")
+        
+    if len(N.shape(va)) != 2:
+        raise DenError("va is not rank 2")
+        
+    if len(N.shape(x)) != 2:
+        raise DenError("x is not rank 2")
+        
+    (n, d)      = x.shape
+    (dm0, dm1)  = mu.shape
+    (dv0, dv1)  = va.shape
+    
+    # Check x and mu same dimension
+    if dm0 != 1:
+        msg = "mean must be a row vector!"
+        raise DenError(msg)
+    if dm1 != d:
+        msg = "x and mu not same dim"
+        raise DenError(msg)
+    # Check va and mu same size
+    if dv1 != d:
+        msg = "mu and va not same dim"
+        raise DenError(msg)
+    if dv0 != 1 and dv0 != d:
+        msg = "va not square"
+        raise DenError(msg)
+
+    #===============#
+    # Computation   #
+    #===============#
+    if d == 1:
+        # scalar case
+        return _scalar_gauss_den(x[:, 0], mu[0, 0], va[0, 0], log)
+    elif dv0 == 1:
+        # Diagonal matrix case
+        return _diag_gauss_den(x, mu, va, log)
+    elif dv1 == dv0:
+        # full case
+        return  _full_gauss_den(x, mu, va, log)
+    else:
+        raise DenError("variance mode not recognized, this is a bug")
+
+# Those 3 functions do almost all the actual computation
+def _scalar_gauss_den(x, mu, va, log):
+    """ This function is the actual implementation
+    of gaussian pdf in scalar case. It assumes all args
+    are conformant, so it should not be used directly
+    
+    Call gauss_den instead"""
+    d       = mu.size
+    inva    = 1/va
+    fac     = (2*N.pi) ** (-d/2.0) * N.sqrt(inva)
+    y       = ((x-mu) ** 2) * -0.5 * inva
+    if not log:
+        y   = fac * N.exp(y)
+    else:
+        y   = y + log(fac)
+
+    return y
+    
+#from ctypes import cdll, c_uint, c_int, c_double, POINTER
+#_gden   = cdll.LoadLibrary('src/libgden.so')
+#_gden.gden_diag.restype     = c_int
+#_gden.gden_diag.argtypes    = [POINTER(c_double), c_uint, c_uint,
+#        POINTER(c_double), POINTER(c_double), POINTER(c_double)]
+
+def _diag_gauss_den(x, mu, va, log):
+    """ This function is the actual implementation
+    of gaussian pdf in scalar case. It assumes all args
+    are conformant, so it should not be used directly
+    
+    Call gauss_den instead"""
+    # Diagonal matrix case
+    d   = mu.size
+    n   = x.shape[0]
+    if not log:
+        inva    = 1/va[0,0]
+        fac     = (2*N.pi) ** (-d/2.0) * N.sqrt(inva)
+        y       =  (x[:,0] - mu[0,0]) ** 2 * inva * -0.5
+        for i in range(1, d):
+            inva    = 1/va[0,i]
+            fac     *= N.sqrt(inva)
+            y       += (x[:,i] - mu[0,i]) ** 2 * inva * -0.5
+        y   = fac * N.exp(y)
+    else:
+        y   = _scalar_gauss_den(x[:,0], mu[0,0], va[0,0], log)
+        for i in range(1, d):
+            y    +=  _scalar_gauss_den(x[:,i], mu[0,i], va[0,i], log)
+    return y
+
+def _full_gauss_den(x, mu, va, log):
+    """ This function is the actual implementation
+    of gaussian pdf in full matrix case. 
+    
+    It assumes all args are conformant, so it should 
+    not be used directly Call gauss_den instead
+    
+    Does not check if va is definite positive (on inversible 
+    for that matter), so the inverse computation and/or determinant
+    would throw an exception."""
+    d       = mu.size
+    inva    = lin.inv(va)
+    fac     = 1 / N.sqrt( (2*N.pi) ** d * N.fabs(lin.det(va)))
+
+    # # Slow version
+    # n       = N.size(x, 0)
+    # y       = N.zeros(n)
+    # for i in range(n):
+    #     y[i] = N.dot(x[i,:],
+    #              N.dot(inva, N.transpose(x[i,:])))
+    # y *= -0.5
+
+    # we are using a trick with sum to "emulate" 
+    # the matrix multiplication inva * x without any explicit loop
+    y   = N.dot((x-mu), inva)
+    y   = -0.5 * N.sum(y * (x-mu), 1)
+
+    if not log:
+        y   = fac * N.exp(y)
+    else:
+        y   = y + N.log(fac)
+ 
+    return y
+
+# To plot a confidence ellipse from multi-variate gaussian pdf
+def gauss_ell(mu, va, dim = [0, 1], npoints = 100, level = 0.39):
+    """ Given a mean and covariance for multi-variate
+    gaussian, returns npoints points for the ellipse
+    of confidence given by level (all points will be inside
+    the ellipsoides with a probability equal to level)
+    
+    Returns the coordinate x and y of the ellipse"""
+    
+    mu      = N.atleast_1d(mu)
+    va      = N.atleast_1d(va)
+    c       = N.array(dim)
+
+    if mu.size == va.size:
+        mode    = 'diag'
+    else:
+        if va.ndim == 2:
+            if va.shape[0] == va.shape[1]:
+                mode    = 'full'
+            else:
+                raise DenError("variance not square")
+        else:
+            raise DenError("mean and variance are not dim conformant")
+
+    chi22d  = chi2(2)
+    mahal   = N.sqrt(chi22d.ppf(level))
+    
+    # Generates a circle of npoints
+    theta   = N.linspace(0, 2 * N.pi, npoints)
+    circle  = mahal * N.array([N.cos(theta), N.sin(theta)])
+
+    # Get the dimension which we are interested in:
+    mu  = mu[dim]
+    if mode == 'diag':
+        va      = va[dim]
+        elps    = N.outer(mu, N.ones(npoints))
+        elps    += N.dot(N.diag(N.sqrt(va)), circle)
+    elif mode == 'full':
+        va  = va[c,:][:,c]
+        # Method: compute the cholesky decomp of each cov matrix, that is
+        # compute cova such as va = cova * cova' 
+        # WARN: scipy is different than matlab here, as scipy computes a lower
+        # triangular cholesky decomp: 
+        #   - va = cova * cova' (scipy)
+        #   - va = cova' * cova (matlab)
+        # So take care when comparing results with matlab !
+        cova    = lin.cholesky(va)
+        elps    = N.outer(mu, N.ones(npoints))
+        elps    += N.dot(cova, circle)
+    else:
+        raise DenParam("var mode not recognized")
+
+    return elps[0, :], elps[1, :]
+
+if __name__ == "__main__":
+    import pylab
+
+    #=========================================
+    # Test plotting a simple diag 2d variance:
+    #=========================================
+    va  = N.array([5, 3])
+    mu  = N.array([2, 3])
+
+    # Generate a multivariate gaussian of mean mu and covariance va
+    X       = randn(1e3, 2)
+    Yc      = N.dot(N.diag(N.sqrt(va)), X.transpose())
+    Yc      = Yc.transpose() + mu
+
+    # Plotting
+    Xe, Ye  = gauss_ell(mu, va, npoints = 100)
+    pylab.figure()
+    pylab.plot(Yc[:, 0], Yc[:, 1], '.')
+    pylab.plot(Xe, Ye, 'r')
+
+    #=========================================
+    # Test plotting a simple full 2d variance:
+    #=========================================
+    va  = N.array([[0.2, 0.1],[0.1, 0.5]])
+    mu  = N.array([0, 3])
+
+    # Generate a multivariate gaussian of mean mu and covariance va
+    X       = randn(1e3, 2)
+    Yc      = N.dot(lin.cholesky(va), X.transpose())
+    Yc      = Yc.transpose() + mu
+
+    # Plotting
+    Xe, Ye  = gauss_ell(mu, va, npoints = 100, level=0.95)
+    pylab.figure()
+    pylab.plot(Yc[:, 0], Yc[:, 1], '.')
+    pylab.plot(Xe, Ye, 'r')
+    pylab.show()

Added: trunk/Lib/sandbox/pyem/gauss_mix.py
===================================================================
--- trunk/Lib/sandbox/pyem/gauss_mix.py	2006-10-12 13:48:48 UTC (rev 2284)
+++ trunk/Lib/sandbox/pyem/gauss_mix.py	2006-10-12 13:48:58 UTC (rev 2285)
@@ -0,0 +1,438 @@
+# /usr/bin/python
+# Last Change: Tue Oct 03 06:00 PM 2006 J
+
+# Module to implement GaussianMixture class.
+
+import numpy as N
+from numpy.random import randn, rand
+import numpy.linalg as lin
+import densities
+
+MAX_DEV     = 1e-10
+MAX_COND    = 1e10
+
+# Right now, two main usages of a Gaussian Model are possible
+#   - init a Gaussian Model with meta-parameters, and trains it
+#   - set-up a Gaussian Model to sample it, draw ellipsoides 
+#   of confidences. In this case, we would like to init it with
+#   known values of parameters. This can be done with the class method 
+#   fromval
+#
+#   For now, we have to init with meta-parameters, and set 
+#   the parameters afterward. There should be a better way ?
+
+# TODO:
+#   - change bounds methods of GM class instanciations so that it cannot 
+#   be used as long as w, mu and va are not set
+#   - We have to use scipy now for chisquare pdf, so there may be other
+#   methods to be used, ie for implementing random index.
+#   - there is no check on internal state of the GM, that is does w, mu and va values
+#   make sense (eg singular values)
+#   - plot1d is still very rhough. There should be a sensible way to 
+#   modify the result plot (maybe returns a dic with global pdf, component pdf and
+#   fill matplotlib handles). Should be coherent with plot
+class GmParamError:
+    """Exception raised for errors in gmm params
+
+    Attributes:
+        expression -- input expression in which the error occurred
+        message -- explanation of the error
+    """
+    def __init__(self, message):
+        self.message    = message
+    
+    def __str__(self):
+        return self.message
+
+class GM:
+    """Gaussian Mixture class. This is a simple container class
+    to hold Gaussian Mixture parameters (weights, mean, etc...).
+    It can also draw itself (confidence ellipses) and samples itself.
+
+    Is initiated by giving dimension, number of components and 
+    covariance mode"""
+
+    # I am not sure it is useful to have a spherical mode...
+    _cov_mod    = ['diag', 'full']
+
+    def __init__(self, d, k, mode = 'diag'):
+        """Init a Gaussian model of k components, each component being a 
+        d multi-variate Gaussian, with covariance matrix of style mode"""
+        if mode not in self._cov_mod:
+            raise GmParamError("mode %s not recognized" + str(mode))
+
+        self.d      = d
+        self.k      = k
+        self.mode   = mode
+
+        # Init to 0 all parameters, with the right dimensions.
+        # Not sure this is useful in python from an efficiency POV ?
+        self.w   = N.zeros(k)
+        self.mu  = N.zeros((k, d))
+        if mode == 'diag':
+            self.va  = N.zeros((k, d))
+        elif mode == 'full':
+            self.va  = N.zeros((k * d, d))
+
+        self.is_valid   = False
+
+    def set_param(self, weights, mu, sigma):
+        """Set parameters of the model. Args should
+        be conformant with metparameters d and k given during
+        initialisation"""
+        k, d, mode  = check_gmm_param(weights, mu, sigma)
+        if not k == self.k:
+            raise GmParamError("Number of given components is %d, expected %d" 
+                    % (shape(k), shape(self.k)))
+        if not d == self.d:
+            raise GmParamError("Dimension of the given model is %d, expected %d" 
+                    % (shape(d), shape(self.d)))
+        if not mode == self.mode and not d == 1:
+            raise GmParamError("Given covariance mode is %s, expected %s"
+                    % (mode, self.mode))
+        self.w  = weights
+        self.mu = mu
+        self.va = sigma
+
+        self.is_valid   = True
+
+    @classmethod
+    def fromvalues(cls, weights, mu, sigma):
+        """This class method can be used to create a GM model
+        directly from its parameters weights, mean and variance
+        
+        w, mu, va   = GM.gen_param(d, k)
+        gm  = GM(d, k)
+        gm.set_param(w, mu, va)
+
+        and
+        
+        w, mu, va   = GM.gen_param(d, k)
+        gm  = GM.fromvalue(w, mu, va)
+
+        Are equivalent """
+        k, d, mode  = check_gmm_param(weights, mu, sigma)
+        res = cls(d, k, mode)
+        res.set_param(weights, mu, sigma)
+        return res
+        
+    def sample(self, nframes):
+        """ Sample nframes frames from the model """
+        if not self.is_valid:
+            raise GmParamError("""Parameters of the model has not been 
+                set yet, please set them using self.set_param()""")
+
+        # State index (ie hidden var)
+        S   = gen_rand_index(self.w, nframes)
+        # standard gaussian
+        X   = randn(nframes, self.d)        
+
+        if self.mode == 'diag':
+            X   = self.mu[S, :]  + X * N.sqrt(self.va[S,:])
+        elif self.mode == 'full':
+            # Faster:
+            cho = N.zeros((self.k, self.va.shape[1], self.va.shape[1]))
+            for i in range(self.k):
+                # Using cholesky looks more stable than sqrtm; sqrtm is not
+                # available in numpy anyway, only in scipy...
+                cho[i]  = lin.cholesky(self.va[i*self.d:i*self.d+self.d,:])
+
+            for s in range(self.k):
+                tmpind      = N.where(S == s)[0]
+                X[tmpind]   = N.dot(X[tmpind], cho[s].transpose()) + self.mu[s]
+        else:
+            raise GmParamError('cov matrix mode not recognized, this is a bug !')
+
+        return X
+
+    def conf_ellipses(self, *args, **kargs):
+        """Returns a list of confidence ellipsoids describing the Gmm
+        defined by mu and va. Check densities.gauss_ell for details
+
+        Returns:
+            -Xe:    a list of x coordinates for the ellipses (Xe[i] is
+            the array containing x coordinates of the ith Gaussian)
+            -Ye:    a list of y coordinates for the ellipses
+
+        Example:
+            Suppose we have w, mu and va as parameters for a mixture, then:
+            
+            gm      = GM(d, k)
+            gm.set_param(w, mu, va)
+            X       = gm.sample(1000)
+            Xe, Ye  = gm.conf_ellipsoids()
+            pylab.plot(X[:,0], X[:, 1], '.')
+            for k in len(w):
+                pylab.plot(Xe[k], Ye[k], 'r')
+                
+            Will plot samples X draw from the mixture model, and
+            plot the ellipses of equi-probability from the mean with
+            fixed level of confidence 0.39.  """
+        if not self.is_valid:
+            raise GmParamError("""Parameters of the model has not been 
+                set yet, please set them using self.set_param()""")
+
+        Xe  = []
+        Ye  = []   
+        if self.mode == 'diag':
+            for i in range(self.k):
+                xe, ye  = densities.gauss_ell(self.mu[i,:], self.va[i,:], 
+                        *args, **kargs)
+                Xe.append(xe)
+                Ye.append(ye)
+        elif self.mode == 'full':
+            for i in range(self.k):
+                xe, ye  = densities.gauss_ell(self.mu[i,:], 
+                        self.va[i*self.d:i*self.d+self.d,:], 
+                        *args, **kargs)
+                Xe.append(xe)
+                Ye.append(ye)
+
+        return Xe, Ye
+    
+    def check_state(self):
+        """
+        """
+        if not self.is_valid:
+            raise GmParamError("""Parameters of the model has not been 
+                set yet, please set them using self.set_param()""")
+
+        if self.mode == 'full':
+            raise NotImplementedError, "not implemented for full mode yet"
+        
+        # # How to check w: if one component is negligeable, what shall
+        # # we do ?
+        # M   = N.max(self.w)
+        # m   = N.min(self.w)
+
+        # maxc    = m / M
+
+        # Check condition number for cov matrix
+        cond    = N.zeros(self.k)
+        ava     = N.absolute(self.va)
+        for c in range(self.k):
+            cond[c] = N.amax(ava[c,:]) / N.amin(ava[c,:])
+
+        print cond
+
+    def gen_param(self, d, nc, varmode = 'diag', spread = 1):
+        """Generate valid parameters for a gaussian mixture model.
+        d is the dimension, nc the number of components, and varmode
+        the mode for cov matrices.
+
+        This is a class method.
+
+        Returns: w, mu, va
+        """
+        w   = abs(randn(nc))
+        w   = w / sum(w, 0)
+
+        mu  = spread * randn(nc, d)
+        if varmode == 'diag':
+            va  = abs(randn(nc, d))
+        elif varmode == 'full':
+            va  = randn(nc * d, d)
+            for k in range(nc):
+                va[k*d:k*d+d]   = N.dot( va[k*d:k*d+d], 
+                    va[k*d:k*d+d].transpose())
+        else:
+            raise GmParamError('cov matrix mode not recognized')
+
+        return w, mu, va
+
+    gen_param = classmethod(gen_param)
+
+    def plot(self, *args, **kargs):
+        """Plot the ellipsoides directly for the model
+        
+        Returns a list of lines, so that their style can be modified. By default,
+        the style is red color, and nolegend for all of them.
+        
+        Does not work for 1d"""
+        if not self.is_valid:
+            raise GmParamError("""Parameters of the model has not been 
+                set yet, please set them using self.set_param()""")
+
+        k       = self.k
+        Xe, Ye  = self.conf_ellipses(*args, **kargs)
+        try:
+            import pylab as P
+            return [P.plot(Xe[i], Ye[i], 'r', label='_nolegend_')[0] for i in range(k)]
+            #for i in range(k):
+            #    P.plot(Xe[i], Ye[i], 'r')
+        except ImportError:
+            raise GmParamError("matplotlib not found, cannot plot...")
+
+    def plot1d(self, level = 0.5, fill = 0, gpdf = 0):
+        """This function plots the pdfs of each component of the model. 
+        If gpdf is 1, also plots the global pdf. If fill is 1, fill confidence
+        areas using level argument as a level value
+        
+        Returns a dictionary h of plot handles so that their properties can
+        be modified (eg color, label, etc...):
+            - h['pdf'] is a list of lines, one line per component pdf
+            - h['gpdf'] is the line for the global pdf
+            - h['conf'] is a list of filling area
+        """
+        # This is not optimized at all, may be slow. Should not be
+        # difficult to make much faster, but it is late, and I am lazy
+        if not self.d == 1:
+            raise GmParamError("the model is not one dimensional model")
+        from scipy.stats import norm
+        nrm     = norm(0, 1)
+        pval    = N.sqrt(self.va[:,0]) * nrm.ppf((1+level)/2)
+
+        # Compute reasonable min/max for the normal pdf
+        mc  = 3
+        std = N.sqrt(self.va[:,0])
+        m   = N.amin(self.mu[:, 0] - mc * std)
+        M   = N.amax(self.mu[:, 0] + mc * std)
+
+        np  = 500
+        x   = N.linspace(m, M, np)
+        Yf  = N.zeros(np)
+        Yt  = N.zeros(np)
+
+        # Prepare the dic of plot handles to return
+        ks  = ['pdf', 'conf', 'gpdf']
+        hp  = dict((i,[]) for i in ks)
+        try:
+            import pylab as P
+            for c in range(self.k):
+                y   = self.w[c]/(N.sqrt(2*N.pi) * std[c]) * \
+                        N.exp(-(x-self.mu[c][0])**2/(2*std[c]**2))
+                Yt  += y
+                h   = P.plot(x, y, 'r', label ='_nolegend_')
+                hp['pdf'].extend(h)
+                if fill:
+                    #P.axvspan(-pval[c] + self.mu[c][0], pval[c] + self.mu[c][0], 
+                    #        facecolor = 'b', alpha = 0.2)
+                    id1 = -pval[c] + self.mu[c]
+                    id2 = pval[c] + self.mu[c]
+                    xc  = x[:, N.where(x>id1)[0]]
+                    xc  = xc[:, N.where(xc<id2)[0]]
+                    Yf  = self.w[c]/(N.sqrt(2*N.pi) * std[c]) * \
+                            N.exp(-(xc-self.mu[c][0])**2/(2*std[c]**2))
+                    xc  = N.concatenate(([xc[0]], xc, [xc[-1]]))
+                    Yf  = N.concatenate(([0], Yf, [0]))
+                    h   = P.fill(xc, Yf, 
+                            facecolor = 'b', alpha = 0.1, label='_nolegend_')
+                    hp['conf'].extend(h)
+                    #P.fill([xc[0], xc[0], xc[-1], xc[-1]], 
+                    #        [0, Yf[0], Yf[-1], 0], facecolor = 'b', alpha = 0.2)
+            if gpdf:
+                h           = P.plot(x, Yt, 'r:', label='_nolegend_')
+                hp['gpdf']  = h
+            return hp
+        except ImportError:
+            raise GmParamError("matplotlib not found, cannot plot...")
+
+# Function to generate a random index: this is kept outside any class,
+# as the function can be useful for other
+def gen_rand_index(p, n):
+    """Generate a N samples vector containing random index between 1 
+    and length(p), each index i with probability p(i)"""
+    # TODO Check args here
+    
+    # TODO: check each value of inverse distribution is
+    # different
+    invcdf  = N.cumsum(p)
+    uni     = rand(n)
+    index   = N.zeros(n, dtype=int)
+
+    # This one should be a bit faster
+    for k in range(len(p)-1, 0, -1):
+        blop        = N.where(N.logical_and(invcdf[k-1] <= uni, 
+                    uni < invcdf[k]))
+        index[blop] = k
+        
+    return index
+
+def check_gmm_param(w, mu, va):
+    """Check that w, mu and va are valid parameters for
+    a mixture of gaussian: w should sum to 1, there should
+    be the same number of component in each param, the variances
+    should be positive definite, etc... 
+    
+    Params:
+        w   = vector or list of weigths of the mixture (K elements)
+        mu  = matrix: K * d
+        va  = list of variances (vector K * d or square matrices Kd * d)
+
+    returns:
+        K   = number of components
+        d   = dimension
+        mode    = 'diag' if diagonal covariance, 'full' of full matrices
+    """
+        
+    # Check that w is valid
+    if N.fabs(N.sum(w, 0)  - 1) > MAX_DEV:
+        raise GmParamError('weight does not sum to 1')
+    
+    if not len(w.shape) == 1:
+        raise GmParamError('weight is not a vector')
+
+    # Check that mean and va have the same number of components
+    K           = len(w)
+
+    if N.ndim(mu) < 2:
+        msg = "mu should be a K,d matrix, and a row vector if only 1 comp"
+        raise GmParamError(msg)
+    if N.ndim(va) < 2:
+        msg = """va should be a K,d / K *d, d matrix, and a row vector if
+        only 1 diag comp"""
+        raise GmParamError(msg)
+
+    (Km, d)     = mu.shape
+    (Ka, da)    = va.shape
+
+    if not K == Km:
+        msg = "not same number of component in mean and weights"
+        raise GmParamError(msg)
+
+    if not d == da:
+        msg = "not same number of dimensions in mean and variances"
+        raise GmParamError(msg)
+
+    if Km == Ka:
+        mode = 'diag'
+    else:
+        mode = 'full'
+        if not Ka == Km*d:
+            msg = "not same number of dimensions in mean and variances"
+            raise GmParamError(msg)
+        
+    return K, d, mode
+        
+if __name__ == '__main__':
+    # Meta parameters:
+    #   - k = number of components
+    #   - d = dimension
+    #   - mode : mode of covariance matrices
+    d       = 5
+    k       = 4
+
+    # Now, drawing a model
+    mode    = 'full'
+    nframes = 1e3
+
+    # Build a model with random parameters
+    w, mu, va   = GM.gen_param(d, k, mode, spread = 3)
+    gm          = GM.fromvalues(w, mu, va)
+
+    # Sample nframes frames  from the model
+    X   = gm.sample(nframes)
+
+    # Plot the data
+    import pylab as P
+    P.plot(X[:, 0], X[:, 1], '.', label = '_nolegend_')
+
+    # Real confidence ellipses with confidence level 
+    level       = 0.50
+    h           = gm.plot(level=level)
+
+    # set the first ellipse label, which will appear in the legend
+    h[0].set_label('confidence ell at level ' + str(level))
+
+    P.legend(loc = 0)
+    P.show()

Added: trunk/Lib/sandbox/pyem/gmm_em.py
===================================================================
--- trunk/Lib/sandbox/pyem/gmm_em.py	2006-10-12 13:48:48 UTC (rev 2284)
+++ trunk/Lib/sandbox/pyem/gmm_em.py	2006-10-12 13:48:58 UTC (rev 2285)
@@ -0,0 +1,394 @@
+# /usr/bin/python
+# Last Change: Fri Oct 06 05:00 PM 2006 J
+
+# TODO:
+#   - which methods to avoid va shrinking to 0 ? There are several options, 
+#   not sure which ones are appropriates
+#   - improve EM trainer
+#   - online EM
+
+import numpy as N
+import numpy.linalg as lin
+from numpy.random import randn
+#import _c_densities as densities
+import densities
+from kmean import kmean
+from gauss_mix import GM
+
+# Error classes
+class GmmError(Exception):
+    """Base class for exceptions in this module."""
+    pass
+
+class GmmParamError:
+    """Exception raised for errors in gmm params
+
+    Attributes:
+        expression -- input expression in which the error occurred
+        message -- explanation of the error
+    """
+    def __init__(self, message):
+        self.message    = message
+    
+    def __str__(self):
+        return self.message
+
+# Not sure yet about how to design different mixture models. Most of the code 
+# is different # (pdf, update part of EM, etc...) and I am not sure it makes 
+# sense to use inheritance for # interface specification in python, since its 
+# dynamic type systeme.
+
+# Anyway, a mixture class should encapsulates all details concerning a mixture model:
+#   - internal parameters for the pdfs
+#   - can compute sufficient statistics for EM
+#   - can sample a model
+#   - can generate random valid parameters for a new pdf (using class method)
+class MixtureModel:
+    pass
+
+class ExpMixtureModel(MixtureModel):
+    """Class to model mixture of exponential pdf (eg Gaussian, exponential, Laplace, 
+    etc..). This is a special case because some parts of EM are common for those
+    models..."""
+    pass
+
+class GMM(ExpMixtureModel):
+    """ A class to model a Gaussian Mixture Model (GMM). An instance of 
+    this class is created by giving weights, mean and variances in the ctor.
+    An instanciated object can be sampled, trained by EM. 
+    
+    The class method gen_model can be used without instanciation."""
+
+    def init_kmean(self, data, niter = 5):
+        """ Init the model with kmean."""
+        k       = self.gm.k
+        d       = self.gm.d
+        init    = data[0:k, :]
+
+        (code, label)   = kmean(data, init, niter)
+
+        w   = N.ones(k) / k
+        mu  = code.copy()
+        if self.gm.mode == 'diag':
+            va = N.zeros((k, d))
+            for i in range(k):
+                for j in range(d):
+                    va[i,j] = N.cov(data[N.where(label==i), j], rowvar = 0)
+        elif self.gm.mode == 'full':
+            va  = N.zeros((k*d, d))
+            for i in range(k):
+                va[i*d:i*d+d,:] = \
+                    N.cov(data[N.where(label==i)], rowvar = 0)
+        else:
+            raise GmmParamError("mode " + str(mode) + " not recognized")
+
+        self.gm.set_param(w, mu, va)
+
+    def init_random(self, data):
+        """ Init the model at random."""
+        k   = self.gm.k
+        d   = self.gm.d
+        if mode == 'diag':
+            w   = N.ones(k) / k
+            mu  = randn(k, d)
+            va  = N.fabs(randn(k, d))
+        else:
+            raise GmmParamError("""init_random not implemented for
+                    mode %s yet""", mode)
+
+        self.gm.set_param(w, mu, va)
+
+    # TODO: 
+    #   - format of parameters ? For variances, list of variances matrix,
+    #   keep the current format, have 3d matrices ?
+    #   - To handle the different modes, we could do something "fancy" such as
+    #   replacing methods, to avoid checking cases everywhere and unconsistency.
+    def __init__(self, gm, init = 'kmean'):
+        """ Initialize a GMM with weight w, mean mu and variances va, and initialization
+        method for training init (kmean by default)"""
+        self.gm = gm
+
+        # Possible init methods
+        init_methods = {'kmean': self.init_kmean, 'random' : self.init_random}
+
+        if init not in init_methods:
+            raise GmmParamError('init method %s not recognized' + str(init))
+
+        self.init   = init_methods[init]
+
+    def sufficient_statistics(self, data):
+        """ Return normalized and non-normalized sufficient statistics
+        from the model.
+        
+        Computes the latent variable distribution (a 
+        posteriori probability) knowing the explicit data 
+        for the Gaussian model (w, mu, var): gamma(t, i) = 
+            P[state = i | observation = data(t); w, mu, va]
+
+        This is basically the E step of EM for GMM."""
+        n   = data.shape[0]
+
+        # compute the gaussian pdf
+        tgd	= multiple_gauss_den(data, self.gm.mu, self.gm.va)
+        # multiply by the weight
+        tgd	*= self.gm.w
+        # Normalize to get a pdf
+        gd	= tgd  / N.sum(tgd, axis=1)[:, N.newaxis]
+
+        return gd, tgd
+
+    def update_em(self, data, gamma):
+        """Computes update of the Gaussian Mixture Model (M step)
+        from the a posteriori pdf, computed by gmm_posterior
+        (E step).
+        """
+        k       = self.gm.k
+        d       = self.gm.d
+        n       = data.shape[0]
+        invn    = 1.0/n
+        mGamma  = N.sum(gamma, axis = 0)
+
+        if self.gm.mode == 'diag':
+            mu      = N.zeros((k, d))
+            va      = N.zeros((k, d))
+            gamma   = gamma.T
+            for c in range(k):
+                x   = N.dot(gamma[c:c+1,:], data)[0,:]
+                xx  = N.dot(gamma[c:c+1,:], data ** 2)[0,:]
+
+                mu[c,:] = x / mGamma[c]
+                va[c,:] = xx  / mGamma[c] - mu[c,:] ** 2
+            w   = invn * mGamma
+
+        elif self.gm.mode == 'full':
+            # In full mode, this is the bottleneck: the triple loop
+            # kills performances. This is pretty straightforward
+            # algebra, so computing it in C should not be too difficult
+            mu  = N.zeros((k, d))
+            va  = N.zeros((k*d, d))
+
+            gamma   = gamma.transpose()
+            for c in range(k):
+                #x   = N.sum(N.outer(gamma[:, c], 
+                #            N.ones((1, d))) * data, axis = 0)
+                x   = N.dot(gamma[c:c+1,:], data)[0,:]
+                xx  = N.zeros((d, d))
+                
+                # This should be much faster than recursing on n...
+                for i in range(d):
+                    for j in range(d):
+                        xx[i,j] = N.sum(data[:,i] * data[:,j] * gamma[c,:], axis = 0)
+
+                mu[c,:] = x / mGamma[c]
+                va[c*d:c*d+d,:] = xx  / mGamma[c] - \
+                                    N.outer(mu[c,:], mu[c,:])
+            w   = invn * mGamma
+        else:
+            raise GmmParamError("varmode not recognized")
+
+        self.gm.set_param(w, mu, va)
+
+class EM:
+    """An EM trainer. An EM trainer
+    trains from data, with a model
+    
+    Not really useful yet"""
+    def __init__(self):
+        pass
+    
+    def train(self, data, model, maxiter = 10, thresh = 1e-5):
+        """
+        Train a model using data, and stops when the likelihood fails
+        behind a threshold, or when the number of iterations > niter, 
+        whichever comes first
+
+        Args:
+            - data:     contains the observed features, one row is one frame, ie one 
+            observation of dimension d
+            - model:    object of class Mixture
+            - maxiter:  maximum number of iterations
+
+        The model is trained, and its parameters updated accordingly.
+
+        Returns:
+            likelihood (one value per iteration).
+        """
+
+        # Initialize the data (may do nothing depending on the model)
+        model.init(data)
+
+        # Likelihood is kept
+        like    = N.zeros(maxiter)
+
+        # Em computation, with computation of the likelihood
+        g, tgd      = model.sufficient_statistics(data)
+        like[0]     = N.sum(N.log(N.sum(tgd, 1)), axis = 0)
+        model.update_em(data, g)
+        for i in range(1, maxiter):
+            g, tgd      = model.sufficient_statistics(data)
+            like[i]     = N.sum(N.log(N.sum(tgd, 1)), axis = 0)
+            model.update_em(data, g)
+            if has_em_converged(like[i], like[i-1], thresh):
+                return like[0:i]
+
+        return like
+    
+# Misc functions
+def has_em_converged(like, plike, thresh):
+    """ given likelihood of current iteration like and previous
+    iteration plike, returns true is converged: based on comparison
+    of the slope of the likehood with thresh"""
+    diff    = N.abs(like - plike)
+    avg     = 0.5 * (N.abs(like) + N.abs(plike))
+    if diff / avg < thresh:
+        return True
+    else:
+        return False
+
+def multiple_gauss_den(data, mu, va):
+    """Helper function to generate several Gaussian
+    pdf (different parameters) from the same data"""
+    mu  = N.atleast_2d(mu)
+    va  = N.atleast_2d(va)
+
+    K   = mu.shape[0]
+    n   = data.shape[0]
+    d   = data.shape[1]
+    
+    y   = N.zeros((K, n))
+    if mu.size == va.size:
+        for i in range(K):
+            y[i] = densities.gauss_den(data, mu[i, :], va[i, :])
+        return y.T
+    else:
+        for i in range(K):
+            y[:, i] = densities.gauss_den(data, mu[i, :], 
+                        va[d*i:d*i+d, :])
+
+    return y
+
+if __name__ == "__main__":
+    import copy
+    #=============================
+    # Simple GMM with 5 components
+    #=============================
+
+    #+++++++++++++++++++++++++++++
+    # Meta parameters of the model
+    #   - k: Number of components
+    #   - d: dimension of each Gaussian
+    #   - mode: Mode of covariance matrix: full or diag
+    #   - nframes: number of frames (frame = one data point = one
+    #   row of d elements
+    k       = 2 
+    d       = 1
+    mode    = 'full'
+    nframes = 1e3
+
+    #+++++++++++++++++++++++++++++++++++++++++++
+    # Create an artificial GMM model, samples it
+    #+++++++++++++++++++++++++++++++++++++++++++
+    print "Generating the mixture"
+    # Generate a model with k components, d dimensions
+    w, mu, va   = GM.gen_param(d, k, mode, spread = 3)
+    gm          = GM(d, k, mode)
+    gm.set_param(w, mu, va)
+
+    # Sample nframes frames  from the model
+    data    = gm.sample(nframes)
+
+    #++++++++++++++++++++++++
+    # Learn the model with EM
+    #++++++++++++++++++++++++
+
+    # Init the model
+    print "Init a model for learning, with kmean for initialization"
+    lgm = GM(d, k, mode)
+    gmm = GMM(lgm, 'kmean')
+    gmm.init(data)
+
+    # Keep the initialized model for drawing
+    gm0 = copy.copy(lgm)
+
+    # The actual EM, with likelihood computation
+    niter   = 10
+    like    = N.zeros(niter)
+
+    print "computing..."
+    for i in range(niter):
+        g, tgd  = gmm.sufficient_statistics(data)
+        like[i] = N.sum(N.log(N.sum(tgd, 1)), axis = 0)
+        gmm.update_em(data, g)
+    # # Alternative form, by using EM class: as the EM class
+    # # is quite rudimentary now, it is not very useful, just save
+    # # a few lines
+    # em      = EM()
+    # like    = em.train(data, gmm, niter)
+
+    #+++++++++++++++
+    # Draw the model
+    #+++++++++++++++
+    print "drawing..."
+    import pylab as P
+    P.subplot(2, 1, 1)
+
+    if not d == 1:
+        # Draw what is happening
+        P.plot(data[:, 0], data[:, 1], '.', label = '_nolegend_')
+
+        # Real confidence ellipses
+        Xre, Yre  = gm.conf_ellipses()
+        P.plot(Xre[0], Yre[0], 'g', label = 'true confidence ellipsoides')
+        for i in range(1,k):
+            P.plot(Xre[i], Yre[i], 'g', label = '_nolegend_')
+
+        # Initial confidence ellipses as found by kmean
+        X0e, Y0e  = gm0.conf_ellipses()
+        P.plot(X0e[0], Y0e[0], 'k', label = 'initial confidence ellipsoides')
+        for i in range(1,k):
+            P.plot(X0e[i], Y0e[i], 'k', label = '_nolegend_')
+
+        # Values found by EM
+        Xe, Ye  = lgm.conf_ellipses()
+        P.plot(Xe[0], Ye[0], 'r', label = 'confidence ellipsoides found by EM')
+        for i in range(1,k):
+            P.plot(Xe[i], Ye[i], 'r', label = '_nolegend_')
+        P.legend(loc = 0)
+    else:
+        # Real confidence ellipses
+        h   = gm.plot1d()
+        [i.set_color('g') for i in h['pdf']]
+        h['pdf'][0].set_label('true pdf')
+
+        # Initial confidence ellipses as found by kmean
+        h0  = gm0.plot1d()
+        [i.set_color('k') for i in h0['pdf']]
+        h0['pdf'][0].set_label('initial pdf')
+
+        # Values found by EM
+        hl  = lgm.plot1d(fill = 1, level = 0.66)
+        [i.set_color('r') for i in hl['pdf']]
+        hl['pdf'][0].set_label('pdf found by EM')
+
+        P.legend(loc = 0)
+
+    P.subplot(2, 1, 2)
+    P.plot(like)
+    P.title('log likelihood')
+
+    # #++++++++++++++++++
+    # # Export the figure
+    # #++++++++++++++++++
+    # F   = P.gcf()
+    # DPI = F.get_dpi()
+    # DefaultSize = F.get_size_inches()
+    # # the default is 100dpi for savefig:
+    # F.savefig("example1.png")
+
+    # # Now make the image twice as big, while keeping the fonts and all the
+    # # same size
+    # F.set_figsize_inches( (DefaultSize[0]*2, DefaultSize[1]*2) )
+    # Size = F.get_size_inches()
+    # print "Size in Inches", Size
+    # F.savefig("example2.png")
+    P.show()

Added: trunk/Lib/sandbox/pyem/info.py
===================================================================
--- trunk/Lib/sandbox/pyem/info.py	2006-10-12 13:48:48 UTC (rev 2284)
+++ trunk/Lib/sandbox/pyem/info.py	2006-10-12 13:48:58 UTC (rev 2285)
@@ -0,0 +1,35 @@
+"""
+Routines for Gaussian Mixture Models
+and learning with Expectation Maximization 
+==========================================
+
+This module contains classes and function to compute multivariate Gaussian densities
+(diagonal and full covariance matrices), Gaussian mixtures, Gaussian mixtures models
+and an Em trainer.
+
+More specifically, the module contains the following file:
+
+- densities.py: functions to compute multivariate Gaussian pdf and ellipsoides of
+confidence (gauss_den)
+- gauss_mix.py: defines the GM (Gaussian Mixture) class. A Gaussian Mixture can be
+created from its parameters weights, mean and variances, or from its meta parameters
+d (dimension of the Gaussian) and k (number of components in the mixture). A Gaussian
+Model can then be sampled or plot (if d>1, plot confidence ellipsoids projected on 
+2 chosen dimensions, if d == 1, plot the pdf of each component and fill the zone
+of confidence for a given level)
+- gmm_em.py: defines a class GMM (Gaussian Mixture Model). This class is constructed
+from a GM model gm, and can be used to train gm. The GMM can be initiated by
+kmean or at random, and can compute sufficient statistics, and update its parameters
+from the sufficient statistics.
+- kmean.py: implements a kmean algorithm. We cannot use scipy.cluster.vq kmeans, since
+its does not give membership of observations.
+
+Example of use: simply execute gmm_em.py for an example of training a GM and plotting
+the results compared to true values
+
+Copyright: David Cournapeau 2006
+License: BSD-style (see LICENSE.txt in main source directory)
+"""
+
+depends = ['linalg', 'stats']
+ignore  = False

Added: trunk/Lib/sandbox/pyem/kmean.py
===================================================================
--- trunk/Lib/sandbox/pyem/kmean.py	2006-10-12 13:48:48 UTC (rev 2284)
+++ trunk/Lib/sandbox/pyem/kmean.py	2006-10-12 13:48:58 UTC (rev 2285)
@@ -0,0 +1,76 @@
+# /usr/bin/python
+# Last Change: Thu Sep 28 01:00 PM 2006 J
+
+#TODO:
+#   - a demo for kmeans
+
+import numpy as N
+
+def _py_vq(data, code):
+    """ Please do not use directly. Use kmean instead"""
+    # No attempt to be efficient has been made...
+    (n, d)  = data.shape
+    (k, d)  = code.shape
+
+    label   = N.zeros(n, int)
+    for i in range(n):
+        d           = N.sum((data[i, :] - code) ** 2, 1)
+        label[i]    = N.argmin(d)
+
+    return label
+    
+# Try to import pyrex function for vector quantization. If not available,
+# falls back on pure python implementation.
+#%KMEANIMPORT%
+#try:
+#    from scipy.cluster.vq import kmeans as kmean
+#except ImportError:
+#    try:
+#        from c_gmm import _vq
+#    except:
+#        print """c_gmm._vq not found, using pure python implementation instead. 
+#        Kmean will be REALLY slow"""
+#        _vq = _py_vq
+try:
+    from scipy.cluster.vq import vq
+    print "using scipy.cluster.vq"
+    def _vq(*args, **kw): return vq(*args, **kw)[0]
+except ImportError:
+    try:
+        from c_gmm import _vq
+        print "using pyrex vq"
+    except ImportError:
+        print """c_gmm._vq not found, using pure python implementation instead. 
+        Kmean will be REALLY slow"""
+        _vq = _py_vq
+
+def kmean(data, init, iter = 10):
+    """Simple kmean implementation for EM. Runs iter iterations.
+    
+    returns a tuple (code, label), where code are the final
+    centroids, and label are the class label indec for each
+    frame (ie row) of data"""
+
+    data    = N.atleast_2d(data)
+    init    = N.atleast_2d(init)
+
+    (n, d)  = data.shape
+    (k, d1) = init.shape
+
+    if not d == d1:
+        msg = "data and init centers do not have same dimensions..."
+        raise GmmParamError(msg)
+    
+    code    = N.asarray(init.copy())
+    for i in range(iter):
+        # Compute the nearest neighbour for each obs
+        # using the current code book
+        label   = _vq(data, code)
+        # Update the code by computing centroids using the new code book
+        for j in range(k):
+            code[j,:] = N.mean(data[N.where(label==j)], axis=0) 
+
+    return code, label
+
+if __name__ == "__main__":
+    pass

Added: trunk/Lib/sandbox/pyem/online_em.py
===================================================================
--- trunk/Lib/sandbox/pyem/online_em.py	2006-10-12 13:48:48 UTC (rev 2284)
+++ trunk/Lib/sandbox/pyem/online_em.py	2006-10-12 13:48:58 UTC (rev 2285)
@@ -0,0 +1,369 @@
+# /usr/bin/python
+# Last Change: Thu Oct 12 09:00 PM 2006 J
+
+#---------------------------------------------
+# This is not meant to be used yet !!!! I am 
+# not sure how to integrate this stuff inside
+# the package yet. The cases are:
+#   - we have a set of data, and we want to test online EM 
+#   compared to normal EM 
+#   - we do not have all the data before putting them in online EM:
+#   eg current frame depends on previous frame in some way.
+
+import numpy as N
+from numpy import repmat, mean
+from numpy.testing import assert_array_almost_equal, assert_array_equal
+
+from gmm_em import ExpMixtureModel, GMM, EM, multiple_gauss_den
+from gauss_mix import GM
+from kmean import kmean
+
+print "======================================================"
+import traceback
+f = traceback.extract_stack()
+print f[0][0] + " This is beta code, don't use it !        "
+print "======================================================"
+
+# Error classes
+class OnGmmError(Exception):
+    """Base class for exceptions in this module."""
+    pass
+
+class OnGmmParamError:
+    """Exception raised for errors in gmm params
+
+    Attributes:
+        expression -- input expression in which the error occurred
+        message -- explanation of the error
+    """
+    def __init__(self, message):
+        self.message    = message
+    
+    def __str__(self):
+        return self.message
+
+class OnGMM(ExpMixtureModel):
+    def init_kmean(self, init_data, niter = 5):
+        """ Init the model at random."""
+        k   = self.gm.k
+        d   = self.gm.d
+        if mode == 'diag':
+            w           = N.ones(k) / k
+
+            # Init the internal state of EM
+            self.cx     = N.outer(w, mean(init_data, 0))
+            self.cxx    = N.outer(w, mean(init_data ** 2, 0))
+
+            # w, mu and va init is the same that in the standard case
+            (code, label)   = kmean(init_data, init_data[0:k, :], niter)
+            mu          = code.copy()
+            va          = N.zeros((k, d))
+            for i in range(k):
+                for j in range(d):
+                    va [i,j] = N.cov(init_data[N.where(label==i), j], rowvar = 0)
+        else:
+            raise OnGmmParamError("""init_online not implemented for
+                    mode %s yet""", mode)
+
+        self.gm.set_param(w, mu, va)
+        self.cw     = w
+        self.cmu    = mu
+        self.cva    = va
+
+        self.pw     = self.cw.copy()
+        self.pmu    = self.cmu.copy()
+        self.pva    = self.cva.copy()
+
+    def __init__(self, gm, init_data, init = 'kmean'):
+        self.gm = gm
+        
+        # Possible init methods
+        init_methods = {'kmean' : self.init_kmean}
+
+        self.init   = init_methods[init]
+
+    def sufficient_statistics(self, frame, nu):
+        """ sufficient_statistics(frame, nu)
+        
+        frame has to be rank 2 !"""
+        gamma   = multiple_gauss_den(frame, self.pmu, self.pva)[0]
+        gamma   *= self.pw
+        gamma   /= N.sum(gamma)
+        # <1>(t) = cw(t), each column is one component cw = (cw1, ..., cwK);
+        self.cw	= (1 - nu) * self.cw + nu * gamma
+
+        return gamma
+
+    def update_em(self, frame, gamma, nu):
+        cx  = self.cx
+        cxx = self.cxx
+        cmu = self.cmu
+        cva = self.cva
+        for k in range(self.gm.k):
+            cx[k]   = (1 - nu) * cx[k] + nu * frame * gamma[k]
+            cxx[k]  = (1 - nu) * cxx[k] + nu * frame ** 2 * gamma[k]
+
+            cmu[k]  = cx[k] / cw[k]
+            cva[k]  = cxx[k] / cw[k] - cmu[k] ** 2
+    
+if __name__ == "__main__":
+    import copy
+    #=============================
+    # Simple GMM with 2 components
+    #=============================
+
+    #+++++++++++++++++++++++++++++
+    # Meta parameters of the model
+    #   - k: Number of components
+    #   - d: dimension of each Gaussian
+    #   - mode: Mode of covariance matrix: full or diag
+    #   - nframes: number of frames (frame = one data point = one
+    #   row of d elements
+    k       = 2 
+    d       = 1
+    mode    = 'diag'
+    nframes = int(1e3)
+    emiter  = 2
+
+    #+++++++++++++++++++++++++++++++++++++++++++
+    # Create an artificial GMM model, samples it
+    #+++++++++++++++++++++++++++++++++++++++++++
+    print "Generating the mixture"
+    # Generate a model with k components, d dimensions
+    w, mu, va   = GM.gen_param(d, k, mode, spread = 1.5)
+    gm          = GM.fromvalues(w, mu, va)
+
+    # Sample nframes frames  from the model
+    data    = gm.sample(nframes, )
+
+    #++++++++++++++++++++++++
+    # Learn the model with EM
+    #++++++++++++++++++++++++
+
+    # Init the model
+    print "Init a model for learning, with kmean for initialization"
+    lgm = GM(d, k, mode)
+    gmm = GMM(lgm, 'kmean')
+    gmm.init(data)
+
+    # Keep the initialized model for drawing
+    gm0 = copy.copy(lgm)
+
+    # The actual EM, with likelihood computation
+    like    = N.zeros(emiter)
+    print "computing..."
+    for i in range(emiter):
+        g, tgd  = gmm.sufficient_statistics(data)
+        like[i] = N.sum(N.log(N.sum(tgd, 1)), axis = 0)
+        gmm.update_em(data, g)
+
+    #+++++++++++++++++++++++++++++++
+    # Learn the model with Online EM
+    #+++++++++++++++++++++++++++++++
+    ogm     = GM(d, k, mode)
+    ogmm    = OnGMM(ogm, 'kmean')
+    #init_data   = data[0:nframes / 20, :]
+    init_data   = data
+    ogmm.init(init_data)
+
+    ogm2    = GM(d, k, mode)
+    ogmm2   = OnGMM(ogm2, 'kmean')
+    #init_data   = data[0:nframes / 20, :]
+    init_data   = data
+    ogmm2.init(init_data)
+
+    ogm0    = copy.copy(ogm)
+    assert_array_equal(ogm0.w, gm0.w)
+    assert_array_equal(ogm0.mu, gm0.mu)
+    assert_array_equal(ogm0.va, gm0.va)
+
+    ogm02   = copy.copy(ogm2)
+    assert_array_equal(ogm02.w, gm0.w)
+    assert_array_equal(ogm02.mu, gm0.mu)
+    assert_array_equal(ogm02.va, gm0.va)
+
+    w0  = gm0.w.copy()
+    mu0 = gm0.mu.copy()
+    va0 = gm0.va.copy()
+
+    cx  = ogmm.cx
+    cxx = ogmm.cxx
+    
+    cw  = w0.copy()
+    cmu = mu0.copy()
+    cva = va0.copy()
+    
+    pw  = cw.copy()
+    pmu = cmu.copy()
+    pva = cva.copy()
+
+    # Forgetting param
+    ku		= 0.005
+    t0		= 200
+    lamb	= N.ones((nframes, 1))
+    lamb[0] = 0
+    nu0		= 1.0
+    #lamb	= 1 - 1/(N.arange(-1, nframes-1) * ku + t0)
+    #nu0		= 0.2
+    nu		= N.zeros((len(lamb), 1))
+    nu[0]	= nu0
+    for i in range(1, len(lamb)):
+        nu[i]	= 1./(1 + lamb[i] / nu[i-1])
+
+    gamma   = N.zeros((nframes, k))
+    agamma  = []
+    apw     = []
+    apmu    = []
+    apva    = []
+    print "========== Online Manual ==========="
+    for e in range(emiter):
+        print "online manual: estep %d, printing p* state " % e
+        apw.append(pw.copy())
+        apmu.append(pmu.copy())
+        apva.append(pva.copy())
+        for t in range(nframes):
+            gamma[t]    = multiple_gauss_den(data[t:t+1, :], pmu, pva)[0]
+            gamma[t]    *= pw
+            gamma[t]    /= N.sum(gamma[t])
+            # <1>(t) = cw(t), each column is one component cw = (cw1, ..., cwK);
+            # <x>(t) = cx(t)
+            cw	= (1 - nu[t]) * cw + nu[t] * gamma[t]
+            # loop through each component
+            for i in range(k):
+                cx[i]   = (1 - nu[t]) * cx[i] + nu[t] * data[t, :] * gamma[t, i]
+                cxx[i]  = (1 - nu[t]) * cxx[i] + nu[t] * data[t, :] ** 2 * gamma[t, i]
+
+                cmu[i]  = cx[i] / cw[i]
+                cva[i]  = cxx[i] / cw[i] - cmu[i] ** 2
+
+            #pw  = cw.copy()
+            #pmu = cmu.copy()
+            #pva = cva.copy()
+        print "gamma[end]: " + str(gamma[-1])
+        pw  = cw.copy()
+        pmu = cmu.copy()
+        pva = cva.copy()
+        agamma.append(gamma.copy())
+
+    gamma2  = N.zeros((nframes, k))
+    agamma2 = []
+    apw2    = []
+    apmu2   = []
+    apva2   = []
+    print "========== Online Automatic ==========="
+    for e in range(emiter):
+        print "online automatic: estep %d, printing p* state " % e
+        apw2.append(ogmm2.pw.copy())
+        apmu2.append(ogmm2.pmu.copy())
+        apva2.append(ogmm2.pva.copy())
+        for t in range(nframes):
+            gamma2[t]   = ogmm2.sufficient_statistics(data[t:t+1, :], nu[t])
+            #gamma2[t]   = multiple_gauss_den(data[t:t+1, :], ogmm2.pmu, ogmm2.pva)[0]
+            #gamma2[t]   *= ogmm2.pw
+            #gamma2[t]   /= N.sum(gamma2[t])
+            #try:
+            #    assert_array_equal(agamma, gamma2[t])
+            #except AssertionError:
+            #    print "error for estep %d, step %d" % (e, t)
+            #    print ogmm2.pw
+            #    print ogmm2.pmu
+            #    print ogmm2.pva
+            #    raise 
+            ogmm2.update_em(data[t, :], gamma2[t], nu[t])
+            #ogmm2.cw	= (1 - nu[t]) * ogmm2.cw + nu[t] * agamma
+            ## loop through each component
+            #for i in range(k):
+            #    ogmm2.cx[i]   = (1 - nu[t]) * ogmm2.cx[i] + nu[t] * data[t, :] * agamma[i]
+            #    ogmm2.cxx[i]  = (1 - nu[t]) * ogmm2.cxx[i] + nu[t] * data[t, :] ** 2 * agamma[i]
+
+            #    ogmm2.cmu[i]  = ogmm2.cx[i] / ogmm2.cw[i]
+            #    ogmm2.cva[i]  = ogmm2.cxx[i] / ogmm2.cw[i] - ogmm2.cmu[i] ** 2
+
+        print "gamma[end]: " + str(gamma2[-1])
+        agamma2.append(gamma2.copy())
+        ogmm2.pw  = ogmm2.cw.copy()
+        ogmm2.pmu = ogmm2.cmu.copy()
+        ogmm2.pva = ogmm2.cva.copy()
+
+    # #ogm.set_param(pw, pmu, pva)
+    # print "weights off vs on: \n" + str(lgm.w) + "\n " + str(cw)
+    # print "mean off vs on: \n" + str(lgm.mu) + "\n " + str(cmu)
+    # print "variances off vs on: \n" + str(lgm.va) + "\n " + str(cva)
+    # print "weights off vs on2: \n" + str(lgm.w) + "\n " + str(ogmm2.cw)
+    # print "mean off vs on2: \n" + str(lgm.mu) + "\n " + str(ogmm2.cmu)
+    # print "variances off vs on2: \n" + str(lgm.va) + "\n " + str(ogmm2.cva)
+    # assert_array_almost_equal(cw, lgm.w)
+    # assert_array_almost_equal(cmu, lgm.mu)
+    # assert_array_almost_equal(cva, lgm.va)
+    assert_array_equal(ogmm2.pw, pw)
+    assert_array_equal(ogmm2.pmu, pmu)
+    assert_array_equal(ogmm2.pva, pva)
+    assert_array_equal(agamma[0], agamma2[0])
+    #assert_array_almost_equal(ogmm2.cw, lgm.w)
+    #assert_array_almost_equal(ogmm2.cmu, lgm.mu)
+    #assert_array_almost_equal(ogmm2.cva, lgm.va)
+    # #+++++++++++++++
+    # # Draw the model
+    # #+++++++++++++++
+    # print "drawing..."
+    # import pylab as P
+    # P.subplot(2, 1, 1)
+
+    # if d == 1:
+    #     # Real confidence ellipses
+    #     h   = gm.plot1d()
+    #     [i.set_color('g') for i in h['pdf']]
+    #     h['pdf'][0].set_label('true pdf')
+
+    #     # Initial confidence ellipses as found by kmean
+    #     h0  = gm0.plot1d()
+    #     [i.set_color('k') for i in h0['pdf']]
+    #     h0['pdf'][0].set_label('initial pdf')
+
+    #     # Values found by EM
+    #     hl  = lgm.plot1d(fill = 1, level = 0.66)
+    #     [i.set_color('r') for i in hl['pdf']]
+    #     hl['pdf'][0].set_label('pdf found by EM')
+
+    #     # Values found by Online EM
+    #     ho  = ogm.plot1d(fill = 1, level = 0.66)
+    #     [i.set_color('b') for i in ho['pdf']]
+    #     ho['pdf'][0].set_label('pdf found by online EM')
+
+    #     P.legend(loc = 0)
+    # else:
+    #     # Draw what is happening
+    #     P.plot(data[:, 0], data[:, 1], '.', label = '_nolegend_')
+
+    #     # Real confidence ellipses
+    #     Xre, Yre  = gm.conf_ellipses()
+    #     P.plot(Xre[0], Yre[0], 'g', label = 'true confidence ellipsoides')
+    #     for i in range(1,k):
+    #         P.plot(Xre[i], Yre[i], 'g', label = '_nolegend_')
+
+    #     # Initial confidence ellipses as found by kmean
+    #     X0e, Y0e  = gm0.conf_ellipses()
+    #     P.plot(X0e[0], Y0e[0], 'k', label = 'initial confidence ellipsoides')
+    #     for i in range(1,k):
+    #         P.plot(X0e[i], Y0e[i], 'k', label = '_nolegend_')
+
+    #     # Values found by EM
+    #     Xe, Ye  = lgm.conf_ellipses()
+    #     P.plot(Xe[0], Ye[0], 'r', label = 'confidence ellipsoides found by EM')
+    #     for i in range(1,k):
+    #         P.plot(Xe[i], Ye[i], 'r', label = '_nolegend_')
+    #     P.legend(loc = 0)
+
+    #     # Values found by Online EM
+    #     Xe, Ye  = ogm.conf_ellipses()
+    #     P.plot(Xe[0], Ye[0], 'm', label = 'confidence ellipsoides found by Online EM')
+    #     for i in range(1,k):
+    #         P.plot(Xe[i], Ye[i], 'm', label = '_nolegend_')
+    #     P.legend(loc = 0)
+
+
+    # P.subplot(2, 1, 2)
+    # P.plot(like)
+    # P.title('log likelihood')
+
+    # P.show()

Added: trunk/Lib/sandbox/pyem/profile_data/profile_densities.py
===================================================================
--- trunk/Lib/sandbox/pyem/profile_data/profile_densities.py	2006-10-12 13:48:48 UTC (rev 2284)
+++ trunk/Lib/sandbox/pyem/profile_data/profile_densities.py	2006-10-12 13:48:58 UTC (rev 2285)
@@ -0,0 +1,54 @@
+import numpy as N
+from numpy.random import randn
+from pyem import densities as D
+from pyem import _c_densities as DC
+#import tables
+
+def bench(func, mode = 'diag'):
+    #===========================================
+    # Diag Gaussian of dimension 20
+    #===========================================
+    d       = 30
+    n       = 1e5
+    niter   = 10
+
+    print "Compute %d times densities, %d dimension, %d frames" % (niter, d, n)
+    # Generate a model with k components, d dimensions
+    mu  = randn(1, d)
+    if mode == 'diag':
+        va  = abs(randn(1, d))
+    elif mode == 'full':
+        va  = randn(d, d)
+        va  = N.dot(va, va.transpose())
+
+    X   = randn(n, d)
+    for i in range(niter):
+        Y   = func(X, mu, va)
+
+def benchpy():
+    bench(D.gauss_den)
+
+def benchc():
+    bench(DC.gauss_den)
+
+def benchpyfull():
+    bench(D.gauss_den, 'full')
+
+def benchcfull():
+    bench(DC.gauss_den, 'full')
+
+if __name__ == "__main__":
+    import hotshot, hotshot.stats
+    profile_file    = 'gdenpy.prof'
+    prof    = hotshot.Profile(profile_file, lineevents=1)
+    prof.runcall(benchpy)
+    p = hotshot.stats.load(profile_file)
+    print p.sort_stats('cumulative').print_stats(20)
+    prof.close()
+
+    profile_file    = 'gdenc.prof'
+    prof    = hotshot.Profile(profile_file, lineevents=1)
+    prof.runcall(benchc)
+    p = hotshot.stats.load(profile_file)
+    print p.sort_stats('cumulative').print_stats(20)
+    prof.close()

Added: trunk/Lib/sandbox/pyem/profile_data/profile_gmm.py
===================================================================
--- trunk/Lib/sandbox/pyem/profile_data/profile_gmm.py	2006-10-12 13:48:48 UTC (rev 2284)
+++ trunk/Lib/sandbox/pyem/profile_data/profile_gmm.py	2006-10-12 13:48:58 UTC (rev 2285)
@@ -0,0 +1,70 @@
+import numpy as N
+from pyem import GM, GMM
+import copy
+
+from pyem._c_densities import gauss_den
+
+def bench1(mode = 'diag'):
+    #===========================================
+    # GMM of 20 comp, 20 dimension, 1e4 frames
+    #===========================================
+    d       = 15
+    k       = 30
+    nframes = 1e4
+    niter   = 10
+    mode    = 'diag'
+
+    print "============================================================="
+    print "(%d dim, %d components) GMM with %d iterations, for %d frames" \
+            % (d, k, niter, nframes)
+
+    #+++++++++++++++++++++++++++++++++++++++++++
+    # Create an artificial GMM model, samples it
+    #+++++++++++++++++++++++++++++++++++++++++++
+    print "Generating the mixture"
+    # Generate a model with k components, d dimensions
+    w, mu, va   = GM.gen_param(d, k, mode, spread = 3)
+    # gm          = GM(d, k, mode)
+    # gm.set_param(w, mu, va)
+    gm          = GM.fromvalues(w, mu, va)
+
+    # Sample nframes frames  from the model
+    data    = gm.sample(nframes)
+
+    #++++++++++++++++++++++++
+    # Learn the model with EM
+    #++++++++++++++++++++++++
+
+    # Init the model
+    print "Init a model for learning, with kmean for initialization"
+    lgm = GM(d, k, mode)
+    gmm = GMM(lgm, 'kmean')
+    
+    gmm.init(data)
+    # Keep the initialized model for drawing
+    gm0 = copy.copy(lgm)
+
+    # The actual EM, with likelihood computation
+    like    = N.zeros(niter)
+
+    print "computing..."
+    for i in range(niter):
+        print "iteration %d" % i
+        g, tgd  = gmm.sufficient_statistics(data)
+        like[i] = N.sum(N.log(N.sum(tgd, 1)))
+        gmm.update_em(data, g)
+
+if __name__ == "__main__":
+    import hotshot, hotshot.stats
+    profile_file    = 'gmm.prof'
+    prof    = hotshot.Profile(profile_file, lineevents=1)
+    prof.runcall(bench1)
+    p = hotshot.stats.load(profile_file)
+    print p.sort_stats('cumulative').print_stats(20)
+    prof.close()
+    # import profile
+    # profile.run('bench1()', 'gmmprof')
+    # import pstats
+    # p = pstats.Stats('gmmprof')
+    # print p.sort_stats('cumulative').print_stats(20)
+

Deleted: trunk/Lib/sandbox/pyem/pyem/__init__.py
===================================================================
--- trunk/Lib/sandbox/pyem/pyem/__init__.py	2006-10-12 13:48:48 UTC (rev 2284)
+++ trunk/Lib/sandbox/pyem/pyem/__init__.py	2006-10-12 13:48:58 UTC (rev 2285)
@@ -1,12 +0,0 @@
-#! /usr/bin/env python
-# Last Change: Tue Oct 03 06:00 PM 2006 J
-
-from info import __doc__
-
-from numpy.testing import NumpyTest
-test = NumpyTest().test
-
-version = '0.5.3'
-
-from gauss_mix import GmParamError, GM
-from gmm_em import GmmParamError, GMM, EM

Deleted: trunk/Lib/sandbox/pyem/pyem/_c_densities.py
===================================================================
--- trunk/Lib/sandbox/pyem/pyem/_c_densities.py	2006-10-12 13:48:48 UTC (rev 2284)
+++ trunk/Lib/sandbox/pyem/pyem/_c_densities.py	2006-10-12 13:48:58 UTC (rev 2285)
@@ -1,217 +0,0 @@
-#! /usr/bin/python
-#
-# Copyrighted David Cournapeau
-# Last Change: Tue Oct 03 05:00 PM 2006 J
-
-# This module uses a C implementation through ctypes, for diagonal cases
-# TODO:
-#   - portable way to find/open the shared library
-#   - full cov matrice
-
-import numpy as N
-import numpy.linalg as lin
-from numpy.random import randn
-from scipy.stats import chi2
-import densities as D
-
-import ctypes
-from ctypes import cdll, c_uint, c_int, c_double, POINTER
-from numpy.ctypeslib import ndpointer, load_library
-
-ctypes_major    = int(ctypes.__version__.split('.')[0])
-if ctypes_major < 1:
-    msg =  "version of ctypes is %s, expected at least %s" \
-            % (ctypes.__version__, '1.0.0')
-    raise Exception(msg)
-
-# Requirements for diag gden
-_gden   = load_library('c_gden.so', __file__)
-arg1    = ndpointer(dtype='<f8')
-arg2    = c_uint
-arg3    = c_uint
-arg4    = ndpointer(dtype='<f8')
-arg5    = ndpointer(dtype='<f8')
-arg6    = ndpointer(dtype='<f8')
-_gden.gden_diag.argtypes    = [arg1, arg2, arg3, arg4, arg5, arg6]
-_gden.gden_diag.restype     = c_int
-
-# Error classes
-class DenError(Exception):
-    """Base class for exceptions in this module.
-    
-    Attributes:
-        expression -- input expression in which the error occurred
-        message -- explanation of the error"""
-    def __init__(self, message):
-        self.message    = message
-    
-    def __str__(self):
-        return self.message
-
-# The following function do all the fancy stuff to check that parameters
-# are Ok, and call the right implementation if args are OK.
-def gauss_den(x, mu, va, log = False):
-    """ Compute multivariate Gaussian density at points x for 
-    mean mu and variance va.
-    
-    Vector are row vectors, except va which can be a matrix
-    (row vector variance for diagonal variance)
-    
-    If log is True, than the log density is returned 
-    (useful for underflow ?)"""
-    mu  = N.atleast_2d(mu)
-    va  = N.atleast_2d(va)
-    x   = N.atleast_2d(x)
-    
-    #=======================#
-    # Checking parameters   #
-    #=======================#
-    if len(N.shape(mu)) != 2:
-        raise DenError("mu is not rank 2")
-        
-    if len(N.shape(va)) != 2:
-        raise DenError("va is not rank 2")
-        
-    if len(N.shape(x)) != 2:
-        raise DenError("x is not rank 2")
-        
-    (n, d)      = x.shape
-    (dm0, dm1)  = mu.shape
-    (dv0, dv1)  = va.shape
-    
-    # Check x and mu same dimension
-    if dm0 != 1:
-        msg = "mean must be a row vector!"
-        raise DenError(msg)
-    if dm1 != d:
-        msg = "x and mu not same dim"
-        raise DenError(msg)
-    # Check va and mu same size
-    if dv1 != d:
-        msg = "mu and va not same dim"
-        raise DenError(msg)
-    if dv0 != 1 and dv0 != d:
-        msg = "va not square"
-        raise DenError(msg)
-
-    #===============#
-    # Computation   #
-    #===============#
-    if d == 1:
-        # scalar case
-        return _scalar_gauss_den(x[:, 0], mu[0, 0], va[0, 0], log)
-    elif dv0 == 1:
-        # Diagonal matrix case
-        return _diag_gauss_den(x, mu, va, log)
-    elif dv1 == dv0:
-        # full case
-        return  _full_gauss_den(x, mu, va, log)
-    else:
-        raise DenError("variance mode not recognized, this is a bug")
-
-# Those 3 functions do almost all the actual computation
-def _scalar_gauss_den(x, mu, va, log):
-    """ This function is the actual implementation
-    of gaussian pdf in scalar case. It assumes all args
-    are conformant, so it should not be used directly
-    
-    ** Expect centered data (ie with mean removed) **
-
-    Call gauss_den instead"""
-    d       = mu.size
-    inva    = 1/va
-    fac     = (2*N.pi) ** (-d/2.0) * N.sqrt(inva)
-    y       = ((x-mu) ** 2) * -0.5 * inva
-    if not log:
-        y   = fac * N.exp(y)
-    else:
-        y   = y + log(fac)
-
-    return y
-    
-def _diag_gauss_den(x, mu, va, log):
-    """ This function is the actual implementation
-    of gaussian pdf in scalar case. It assumes all args
-    are conformant, so it should not be used directly
-    
-    ** Expect centered data (ie with mean removed) **
-
-    Call gauss_den instead"""
-    # Diagonal matrix case
-    d   = mu.size
-    n   = x.shape[0]
-    if not log:
-        y       = N.zeros(n)
-        vat     = va.copy()
-        # _gden.gden_diag(N.require(x, requirements = 'C'), n, d, 
-        #         N.require(mu, requirements = 'C'),
-        #         N.require(inva, requirements = 'C'),
-        #         N.require(y, requirements = 'C'))
-        x       = N.require(x, requirements = 'C')
-        mu      = N.require(mu, requirements = 'C')
-        vat     = N.require(vat, requirements = 'C')
-        y       = N.require(y, requirements = 'C')
-        _gden.gden_diag(x, n, d, mu, vat, y)
-        return y
-        # _gden.gden_diag.restype     = c_int
-        # _gden.gden_diag.argtypes    = [POINTER(c_double), c_uint, c_uint,
-        #         POINTER(c_double), POINTER(c_double), POINTER(c_double)]
-
-        # y   = N.zeros(n)
-        # inva= 1/va
-        # _gden.gden_diag(x.ctypes.data_as(POINTER(c_double)),
-        #     n, d,
-        #     mu.ctypes.data_as(POINTER(c_double)),
-        #     inva.ctypes.data_as(POINTER(c_double)),
-        #     y.ctypes.data_as(POINTER(c_double)))
-    else:
-        y   = _scalar_gauss_den(x[:,0], mu[0,0], va[0,0], log)
-        for i in range(1, d):
-            y    +=  _scalar_gauss_den(x[:,i], mu[0,i], va[0,i], log)
-        return y
-
-def _full_gauss_den(x, mu, va, log):
-    """ This function is the actual implementation
-    of gaussian pdf in full matrix case. 
-    
-    It assumes all args are conformant, so it should 
-    not be used directly Call gauss_den instead
-    
-    ** Expect centered data (ie with mean removed) **
-
-    Does not check if va is definite positive (on inversible 
-    for that matter), so the inverse computation and/or determinant
-    would throw an exception."""
-    d       = mu.size
-    inva    = lin.inv(va)
-    fac     = 1 / N.sqrt( (2*N.pi) ** d * N.fabs(lin.det(va)))
-
-    # we are using a trick with sum to "emulate" 
-    # the matrix multiplication inva * x without any explicit loop
-    y   = N.dot((x-mu), inva)
-    y   = -0.5 * N.sum(y * (x-mu), 1)
-
-    if not log:
-        y   = fac * N.exp(y)
-    else:
-        y   = y + N.log(fac)
- 
-    return y
-
-if __name__ == "__main__":
-    #=========================================
-    # Test accuracy between pure and C python
-    #=========================================
-    mu  = N.array([2.0, 3])
-    va  = N.array([5.0, 3])
-
-    # Generate a multivariate gaussian of mean mu and covariance va
-    nframes = 1e4
-    X       = randn(nframes, 2)
-    Yc      = N.dot(N.diag(N.sqrt(va)), X.transpose())
-    Yc      = Yc.transpose() + mu
-
-    Y   = D.gauss_den(Yc, mu, va)
-    Yt  = gauss_den(Yc, mu, va)
-
-    print "Diff is " + str(N.sqrt(N.sum((Y-Yt) ** 2))/nframes/2)

Deleted: trunk/Lib/sandbox/pyem/pyem/densities.py
===================================================================
--- trunk/Lib/sandbox/pyem/pyem/densities.py	2006-10-12 13:48:48 UTC (rev 2284)
+++ trunk/Lib/sandbox/pyem/pyem/densities.py	2006-10-12 13:48:58 UTC (rev 2285)
@@ -1,257 +0,0 @@
-#! /usr/bin/python
-#
-# Copyrighted David Cournapeau
-# Last Change: Thu Oct 05 07:00 PM 2006 J
-
-import numpy as N
-import numpy.linalg as lin
-from numpy.random import randn
-from scipy.stats import chi2
-
-# Error classes
-class DenError(Exception):
-    """Base class for exceptions in this module.
-    
-    Attributes:
-        expression -- input expression in which the error occurred
-        message -- explanation of the error"""
-    def __init__(self, message):
-        self.message    = message
-    
-    def __str__(self):
-        return self.message
-
-# The following function do all the fancy stuff to check that parameters
-# are Ok, and call the right implementation if args are OK.
-def gauss_den(x, mu, va, log = False):
-    """ Compute multivariate Gaussian density at points x for 
-    mean mu and variance va.
-    
-    Vector are row vectors, except va which can be a matrix
-    (row vector variance for diagonal variance)
-    
-    If log is True, than the log density is returned 
-    (useful for underflow ?)"""
-    mu  = N.atleast_2d(mu)
-    va  = N.atleast_2d(va)
-    x   = N.atleast_2d(x)
-    
-    #=======================#
-    # Checking parameters   #
-    #=======================#
-    if len(N.shape(mu)) != 2:
-        raise DenError("mu is not rank 2")
-        
-    if len(N.shape(va)) != 2:
-        raise DenError("va is not rank 2")
-        
-    if len(N.shape(x)) != 2:
-        raise DenError("x is not rank 2")
-        
-    (n, d)      = x.shape
-    (dm0, dm1)  = mu.shape
-    (dv0, dv1)  = va.shape
-    
-    # Check x and mu same dimension
-    if dm0 != 1:
-        msg = "mean must be a row vector!"
-        raise DenError(msg)
-    if dm1 != d:
-        msg = "x and mu not same dim"
-        raise DenError(msg)
-    # Check va and mu same size
-    if dv1 != d:
-        msg = "mu and va not same dim"
-        raise DenError(msg)
-    if dv0 != 1 and dv0 != d:
-        msg = "va not square"
-        raise DenError(msg)
-
-    #===============#
-    # Computation   #
-    #===============#
-    if d == 1:
-        # scalar case
-        return _scalar_gauss_den(x[:, 0], mu[0, 0], va[0, 0], log)
-    elif dv0 == 1:
-        # Diagonal matrix case
-        return _diag_gauss_den(x, mu, va, log)
-    elif dv1 == dv0:
-        # full case
-        return  _full_gauss_den(x, mu, va, log)
-    else:
-        raise DenError("variance mode not recognized, this is a bug")
-
-# Those 3 functions do almost all the actual computation
-def _scalar_gauss_den(x, mu, va, log):
-    """ This function is the actual implementation
-    of gaussian pdf in scalar case. It assumes all args
-    are conformant, so it should not be used directly
-    
-    Call gauss_den instead"""
-    d       = mu.size
-    inva    = 1/va
-    fac     = (2*N.pi) ** (-d/2.0) * N.sqrt(inva)
-    y       = ((x-mu) ** 2) * -0.5 * inva
-    if not log:
-        y   = fac * N.exp(y)
-    else:
-        y   = y + log(fac)
-
-    return y
-    
-#from ctypes import cdll, c_uint, c_int, c_double, POINTER
-#_gden   = cdll.LoadLibrary('src/libgden.so')
-#_gden.gden_diag.restype     = c_int
-#_gden.gden_diag.argtypes    = [POINTER(c_double), c_uint, c_uint,
-#        POINTER(c_double), POINTER(c_double), POINTER(c_double)]
-
-def _diag_gauss_den(x, mu, va, log):
-    """ This function is the actual implementation
-    of gaussian pdf in scalar case. It assumes all args
-    are conformant, so it should not be used directly
-    
-    Call gauss_den instead"""
-    # Diagonal matrix case
-    d   = mu.size
-    n   = x.shape[0]
-    if not log:
-        inva    = 1/va[0,0]
-        fac     = (2*N.pi) ** (-d/2.0) * N.sqrt(inva)
-        y       =  (x[:,0] - mu[0,0]) ** 2 * inva * -0.5
-        for i in range(1, d):
-            inva    = 1/va[0,i]
-            fac     *= N.sqrt(inva)
-            y       += (x[:,i] - mu[0,i]) ** 2 * inva * -0.5
-        y   = fac * N.exp(y)
-    else:
-        y   = _scalar_gauss_den(x[:,0], mu[0,0], va[0,0], log)
-        for i in range(1, d):
-            y    +=  _scalar_gauss_den(x[:,i], mu[0,i], va[0,i], log)
-    return y
-
-def _full_gauss_den(x, mu, va, log):
-    """ This function is the actual implementation
-    of gaussian pdf in full matrix case. 
-    
-    It assumes all args are conformant, so it should 
-    not be used directly Call gauss_den instead
-    
-    Does not check if va is definite positive (on inversible 
-    for that matter), so the inverse computation and/or determinant
-    would throw an exception."""
-    d       = mu.size
-    inva    = lin.inv(va)
-    fac     = 1 / N.sqrt( (2*N.pi) ** d * N.fabs(lin.det(va)))
-
-    # # Slow version
-    # n       = N.size(x, 0)
-    # y       = N.zeros(n)
-    # for i in range(n):
-    #     y[i] = N.dot(x[i,:],
-    #              N.dot(inva, N.transpose(x[i,:])))
-    # y *= -0.5
-
-    # we are using a trick with sum to "emulate" 
-    # the matrix multiplication inva * x without any explicit loop
-    y   = N.dot((x-mu), inva)
-    y   = -0.5 * N.sum(y * (x-mu), 1)
-
-    if not log:
-        y   = fac * N.exp(y)
-    else:
-        y   = y + N.log(fac)
- 
-    return y
-
-# To plot a confidence ellipse from multi-variate gaussian pdf
-def gauss_ell(mu, va, dim = [0, 1], npoints = 100, level = 0.39):
-    """ Given a mean and covariance for multi-variate
-    gaussian, returns npoints points for the ellipse
-    of confidence given by level (all points will be inside
-    the ellipsoides with a probability equal to level)
-    
-    Returns the coordinate x and y of the ellipse"""
-    
-    mu      = N.atleast_1d(mu)
-    va      = N.atleast_1d(va)
-    c       = N.array(dim)
-
-    if mu.size == va.size:
-        mode    = 'diag'
-    else:
-        if va.ndim == 2:
-            if va.shape[0] == va.shape[1]:
-                mode    = 'full'
-            else:
-                raise DenError("variance not square")
-        else:
-            raise DenError("mean and variance are not dim conformant")
-
-    chi22d  = chi2(2)
-    mahal   = N.sqrt(chi22d.ppf(level))
-    
-    # Generates a circle of npoints
-    theta   = N.linspace(0, 2 * N.pi, npoints)
-    circle  = mahal * N.array([N.cos(theta), N.sin(theta)])
-
-    # Get the dimension which we are interested in:
-    mu  = mu[dim]
-    if mode == 'diag':
-        va      = va[dim]
-        elps    = N.outer(mu, N.ones(npoints))
-        elps    += N.dot(N.diag(N.sqrt(va)), circle)
-    elif mode == 'full':
-        va  = va[c,:][:,c]
-        # Method: compute the cholesky decomp of each cov matrix, that is
-        # compute cova such as va = cova * cova' 
-        # WARN: scipy is different than matlab here, as scipy computes a lower
-        # triangular cholesky decomp: 
-        #   - va = cova * cova' (scipy)
-        #   - va = cova' * cova (matlab)
-        # So take care when comparing results with matlab !
-        cova    = lin.cholesky(va)
-        elps    = N.outer(mu, N.ones(npoints))
-        elps    += N.dot(cova, circle)
-    else:
-        raise DenParam("var mode not recognized")
-
-    return elps[0, :], elps[1, :]
-
-if __name__ == "__main__":
-    import pylab
-
-    #=========================================
-    # Test plotting a simple diag 2d variance:
-    #=========================================
-    va  = N.array([5, 3])
-    mu  = N.array([2, 3])
-
-    # Generate a multivariate gaussian of mean mu and covariance va
-    X       = randn(1e3, 2)
-    Yc      = N.dot(N.diag(N.sqrt(va)), X.transpose())
-    Yc      = Yc.transpose() + mu
-
-    # Plotting
-    Xe, Ye  = gauss_ell(mu, va, npoints = 100)
-    pylab.figure()
-    pylab.plot(Yc[:, 0], Yc[:, 1], '.')
-    pylab.plot(Xe, Ye, 'r')
-
-    #=========================================
-    # Test plotting a simple full 2d variance:
-    #=========================================
-    va  = N.array([[0.2, 0.1],[0.1, 0.5]])
-    mu  = N.array([0, 3])
-
-    # Generate a multivariate gaussian of mean mu and covariance va
-    X       = randn(1e3, 2)
-    Yc      = N.dot(lin.cholesky(va), X.transpose())
-    Yc      = Yc.transpose() + mu
-
-    # Plotting
-    Xe, Ye  = gauss_ell(mu, va, npoints = 100, level=0.95)
-    pylab.figure()
-    pylab.plot(Yc[:, 0], Yc[:, 1], '.')
-    pylab.plot(Xe, Ye, 'r')
-    pylab.show()

Deleted: trunk/Lib/sandbox/pyem/pyem/gauss_mix.py
===================================================================
--- trunk/Lib/sandbox/pyem/pyem/gauss_mix.py	2006-10-12 13:48:48 UTC (rev 2284)
+++ trunk/Lib/sandbox/pyem/pyem/gauss_mix.py	2006-10-12 13:48:58 UTC (rev 2285)
@@ -1,438 +0,0 @@
-# /usr/bin/python
-# Last Change: Tue Oct 03 06:00 PM 2006 J
-
-# Module to implement GaussianMixture class.
-
-import numpy as N
-from numpy.random import randn, rand
-import numpy.linalg as lin
-import densities
-
-MAX_DEV     = 1e-10
-MAX_COND    = 1e10
-
-# Right now, two main usages of a Gaussian Model are possible
-#   - init a Gaussian Model with meta-parameters, and trains it
-#   - set-up a Gaussian Model to sample it, draw ellipsoides 
-#   of confidences. In this case, we would like to init it with
-#   known values of parameters. This can be done with the class method 
-#   fromval
-#
-#   For now, we have to init with meta-parameters, and set 
-#   the parameters afterward. There should be a better way ?
-
-# TODO:
-#   - change bounds methods of GM class instanciations so that it cannot 
-#   be used as long as w, mu and va are not set
-#   - We have to use scipy now for chisquare pdf, so there may be other
-#   methods to be used, ie for implementing random index.
-#   - there is no check on internal state of the GM, that is does w, mu and va values
-#   make sense (eg singular values)
-#   - plot1d is still very rhough. There should be a sensible way to 
-#   modify the result plot (maybe returns a dic with global pdf, component pdf and
-#   fill matplotlib handles). Should be coherent with plot
-class GmParamError:
-    """Exception raised for errors in gmm params
-
-    Attributes:
-        expression -- input expression in which the error occurred
-        message -- explanation of the error
-    """
-    def __init__(self, message):
-        self.message    = message
-    
-    def __str__(self):
-        return self.message
-
-class GM:
-    """Gaussian Mixture class. This is a simple container class
-    to hold Gaussian Mixture parameters (weights, mean, etc...).
-    It can also draw itself (confidence ellipses) and samples itself.
-
-    Is initiated by giving dimension, number of components and 
-    covariance mode"""
-
-    # I am not sure it is useful to have a spherical mode...
-    _cov_mod    = ['diag', 'full']
-
-    def __init__(self, d, k, mode = 'diag'):
-        """Init a Gaussian model of k components, each component being a 
-        d multi-variate Gaussian, with covariance matrix of style mode"""
-        if mode not in self._cov_mod:
-            raise GmParamError("mode %s not recognized" + str(mode))
-
-        self.d      = d
-        self.k      = k
-        self.mode   = mode
-
-        # Init to 0 all parameters, with the right dimensions.
-        # Not sure this is useful in python from an efficiency POV ?
-        self.w   = N.zeros(k)
-        self.mu  = N.zeros((k, d))
-        if mode == 'diag':
-            self.va  = N.zeros((k, d))
-        elif mode == 'full':
-            self.va  = N.zeros((k * d, d))
-
-        self.is_valid   = False
-
-    def set_param(self, weights, mu, sigma):
-        """Set parameters of the model. Args should
-        be conformant with metparameters d and k given during
-        initialisation"""
-        k, d, mode  = check_gmm_param(weights, mu, sigma)
-        if not k == self.k:
-            raise GmParamError("Number of given components is %d, expected %d" 
-                    % (shape(k), shape(self.k)))
-        if not d == self.d:
-            raise GmParamError("Dimension of the given model is %d, expected %d" 
-                    % (shape(d), shape(self.d)))
-        if not mode == self.mode and not d == 1:
-            raise GmParamError("Given covariance mode is %s, expected %s"
-                    % (mode, self.mode))
-        self.w  = weights
-        self.mu = mu
-        self.va = sigma
-
-        self.is_valid   = True
-
-    @classmethod
-    def fromvalues(cls, weights, mu, sigma):
-        """This class method can be used to create a GM model
-        directly from its parameters weights, mean and variance
-        
-        w, mu, va   = GM.gen_param(d, k)
-        gm  = GM(d, k)
-        gm.set_param(w, mu, va)
-
-        and
-        
-        w, mu, va   = GM.gen_param(d, k)
-        gm  = GM.fromvalue(w, mu, va)
-
-        Are equivalent """
-        k, d, mode  = check_gmm_param(weights, mu, sigma)
-        res = cls(d, k, mode)
-        res.set_param(weights, mu, sigma)
-        return res
-        
-    def sample(self, nframes):
-        """ Sample nframes frames from the model """
-        if not self.is_valid:
-            raise GmParamError("""Parameters of the model has not been 
-                set yet, please set them using self.set_param()""")
-
-        # State index (ie hidden var)
-        S   = gen_rand_index(self.w, nframes)
-        # standard gaussian
-        X   = randn(nframes, self.d)        
-
-        if self.mode == 'diag':
-            X   = self.mu[S, :]  + X * N.sqrt(self.va[S,:])
-        elif self.mode == 'full':
-            # Faster:
-            cho = N.zeros((self.k, self.va.shape[1], self.va.shape[1]))
-            for i in range(self.k):
-                # Using cholesky looks more stable than sqrtm; sqrtm is not
-                # available in numpy anyway, only in scipy...
-                cho[i]  = lin.cholesky(self.va[i*self.d:i*self.d+self.d,:])
-
-            for s in range(self.k):
-                tmpind      = N.where(S == s)[0]
-                X[tmpind]   = N.dot(X[tmpind], cho[s].transpose()) + self.mu[s]
-        else:
-            raise GmParamError('cov matrix mode not recognized, this is a bug !')
-
-        return X
-
-    def conf_ellipses(self, *args, **kargs):
-        """Returns a list of confidence ellipsoids describing the Gmm
-        defined by mu and va. Check densities.gauss_ell for details
-
-        Returns:
-            -Xe:    a list of x coordinates for the ellipses (Xe[i] is
-            the array containing x coordinates of the ith Gaussian)
-            -Ye:    a list of y coordinates for the ellipses
-
-        Example:
-            Suppose we have w, mu and va as parameters for a mixture, then:
-            
-            gm      = GM(d, k)
-            gm.set_param(w, mu, va)
-            X       = gm.sample(1000)
-            Xe, Ye  = gm.conf_ellipsoids()
-            pylab.plot(X[:,0], X[:, 1], '.')
-            for k in len(w):
-                pylab.plot(Xe[k], Ye[k], 'r')
-                
-            Will plot samples X draw from the mixture model, and
-            plot the ellipses of equi-probability from the mean with
-            fixed level of confidence 0.39.  """
-        if not self.is_valid:
-            raise GmParamError("""Parameters of the model has not been 
-                set yet, please set them using self.set_param()""")
-
-        Xe  = []
-        Ye  = []   
-        if self.mode == 'diag':
-            for i in range(self.k):
-                xe, ye  = densities.gauss_ell(self.mu[i,:], self.va[i,:], 
-                        *args, **kargs)
-                Xe.append(xe)
-                Ye.append(ye)
-        elif self.mode == 'full':
-            for i in range(self.k):
-                xe, ye  = densities.gauss_ell(self.mu[i,:], 
-                        self.va[i*self.d:i*self.d+self.d,:], 
-                        *args, **kargs)
-                Xe.append(xe)
-                Ye.append(ye)
-
-        return Xe, Ye
-    
-    def check_state(self):
-        """
-        """
-        if not self.is_valid:
-            raise GmParamError("""Parameters of the model has not been 
-                set yet, please set them using self.set_param()""")
-
-        if self.mode == 'full':
-            raise NotImplementedError, "not implemented for full mode yet"
-        
-        # # How to check w: if one component is negligeable, what shall
-        # # we do ?
-        # M   = N.max(self.w)
-        # m   = N.min(self.w)
-
-        # maxc    = m / M
-
-        # Check condition number for cov matrix
-        cond    = N.zeros(self.k)
-        ava     = N.absolute(self.va)
-        for c in range(self.k):
-            cond[c] = N.amax(ava[c,:]) / N.amin(ava[c,:])
-
-        print cond
-
-    def gen_param(self, d, nc, varmode = 'diag', spread = 1):
-        """Generate valid parameters for a gaussian mixture model.
-        d is the dimension, nc the number of components, and varmode
-        the mode for cov matrices.
-
-        This is a class method.
-
-        Returns: w, mu, va
-        """
-        w   = abs(randn(nc))
-        w   = w / sum(w, 0)
-
-        mu  = spread * randn(nc, d)
-        if varmode == 'diag':
-            va  = abs(randn(nc, d))
-        elif varmode == 'full':
-            va  = randn(nc * d, d)
-            for k in range(nc):
-                va[k*d:k*d+d]   = N.dot( va[k*d:k*d+d], 
-                    va[k*d:k*d+d].transpose())
-        else:
-            raise GmParamError('cov matrix mode not recognized')
-
-        return w, mu, va
-
-    gen_param = classmethod(gen_param)
-
-    def plot(self, *args, **kargs):
-        """Plot the ellipsoides directly for the model
-        
-        Returns a list of lines, so that their style can be modified. By default,
-        the style is red color, and nolegend for all of them.
-        
-        Does not work for 1d"""
-        if not self.is_valid:
-            raise GmParamError("""Parameters of the model has not been 
-                set yet, please set them using self.set_param()""")
-
-        k       = self.k
-        Xe, Ye  = self.conf_ellipses(*args, **kargs)
-        try:
-            import pylab as P
-            return [P.plot(Xe[i], Ye[i], 'r', label='_nolegend_')[0] for i in range(k)]
-            #for i in range(k):
-            #    P.plot(Xe[i], Ye[i], 'r')
-        except ImportError:
-            raise GmParamError("matplotlib not found, cannot plot...")
-
-    def plot1d(self, level = 0.5, fill = 0, gpdf = 0):
-        """This function plots the pdfs of each component of the model. 
-        If gpdf is 1, also plots the global pdf. If fill is 1, fill confidence
-        areas using level argument as a level value
-        
-        Returns a dictionary h of plot handles so that their properties can
-        be modified (eg color, label, etc...):
-            - h['pdf'] is a list of lines, one line per component pdf
-            - h['gpdf'] is the line for the global pdf
-            - h['conf'] is a list of filling area
-        """
-        # This is not optimized at all, may be slow. Should not be
-        # difficult to make much faster, but it is late, and I am lazy
-        if not self.d == 1:
-            raise GmParamError("the model is not one dimensional model")
-        from scipy.stats import norm
-        nrm     = norm(0, 1)
-        pval    = N.sqrt(self.va[:,0]) * nrm.ppf((1+level)/2)
-
-        # Compute reasonable min/max for the normal pdf
-        mc  = 3
-        std = N.sqrt(self.va[:,0])
-        m   = N.amin(self.mu[:, 0] - mc * std)
-        M   = N.amax(self.mu[:, 0] + mc * std)
-
-        np  = 500
-        x   = N.linspace(m, M, np)
-        Yf  = N.zeros(np)
-        Yt  = N.zeros(np)
-
-        # Prepare the dic of plot handles to return
-        ks  = ['pdf', 'conf', 'gpdf']
-        hp  = dict((i,[]) for i in ks)
-        try:
-            import pylab as P
-            for c in range(self.k):
-                y   = self.w[c]/(N.sqrt(2*N.pi) * std[c]) * \
-                        N.exp(-(x-self.mu[c][0])**2/(2*std[c]**2))
-                Yt  += y
-                h   = P.plot(x, y, 'r', label ='_nolegend_')
-                hp['pdf'].extend(h)
-                if fill:
-                    #P.axvspan(-pval[c] + self.mu[c][0], pval[c] + self.mu[c][0], 
-                    #        facecolor = 'b', alpha = 0.2)
-                    id1 = -pval[c] + self.mu[c]
-                    id2 = pval[c] + self.mu[c]
-                    xc  = x[:, N.where(x>id1)[0]]
-                    xc  = xc[:, N.where(xc<id2)[0]]
-                    Yf  = self.w[c]/(N.sqrt(2*N.pi) * std[c]) * \
-                            N.exp(-(xc-self.mu[c][0])**2/(2*std[c]**2))
-                    xc  = N.concatenate(([xc[0]], xc, [xc[-1]]))
-                    Yf  = N.concatenate(([0], Yf, [0]))
-                    h   = P.fill(xc, Yf, 
-                            facecolor = 'b', alpha = 0.1, label='_nolegend_')
-                    hp['conf'].extend(h)
-                    #P.fill([xc[0], xc[0], xc[-1], xc[-1]], 
-                    #        [0, Yf[0], Yf[-1], 0], facecolor = 'b', alpha = 0.2)
-            if gpdf:
-                h           = P.plot(x, Yt, 'r:', label='_nolegend_')
-                hp['gpdf']  = h
-            return hp
-        except ImportError:
-            raise GmParamError("matplotlib not found, cannot plot...")
-
-# Function to generate a random index: this is kept outside any class,
-# as the function can be useful for other
-def gen_rand_index(p, n):
-    """Generate a N samples vector containing random index between 1 
-    and length(p), each index i with probability p(i)"""
-    # TODO Check args here
-    
-    # TODO: check each value of inverse distribution is
-    # different
-    invcdf  = N.cumsum(p)
-    uni     = rand(n)
-    index   = N.zeros(n, dtype=int)
-
-    # This one should be a bit faster
-    for k in range(len(p)-1, 0, -1):
-        blop        = N.where(N.logical_and(invcdf[k-1] <= uni, 
-                    uni < invcdf[k]))
-        index[blop] = k
-        
-    return index
-
-def check_gmm_param(w, mu, va):
-    """Check that w, mu and va are valid parameters for
-    a mixture of gaussian: w should sum to 1, there should
-    be the same number of component in each param, the variances
-    should be positive definite, etc... 
-    
-    Params:
-        w   = vector or list of weigths of the mixture (K elements)
-        mu  = matrix: K * d
-        va  = list of variances (vector K * d or square matrices Kd * d)
-
-    returns:
-        K   = number of components
-        d   = dimension
-        mode    = 'diag' if diagonal covariance, 'full' of full matrices
-    """
-        
-    # Check that w is valid
-    if N.fabs(N.sum(w, 0)  - 1) > MAX_DEV:
-        raise GmParamError('weight does not sum to 1')
-    
-    if not len(w.shape) == 1:
-        raise GmParamError('weight is not a vector')
-
-    # Check that mean and va have the same number of components
-    K           = len(w)
-
-    if N.ndim(mu) < 2:
-        msg = "mu should be a K,d matrix, and a row vector if only 1 comp"
-        raise GmParamError(msg)
-    if N.ndim(va) < 2:
-        msg = """va should be a K,d / K *d, d matrix, and a row vector if
-        only 1 diag comp"""
-        raise GmParamError(msg)
-
-    (Km, d)     = mu.shape
-    (Ka, da)    = va.shape
-
-    if not K == Km:
-        msg = "not same number of component in mean and weights"
-        raise GmParamError(msg)
-
-    if not d == da:
-        msg = "not same number of dimensions in mean and variances"
-        raise GmParamError(msg)
-
-    if Km == Ka:
-        mode = 'diag'
-    else:
-        mode = 'full'
-        if not Ka == Km*d:
-            msg = "not same number of dimensions in mean and variances"
-            raise GmParamError(msg)
-        
-    return K, d, mode
-        
-if __name__ == '__main__':
-    # Meta parameters:
-    #   - k = number of components
-    #   - d = dimension
-    #   - mode : mode of covariance matrices
-    d       = 5
-    k       = 4
-
-    # Now, drawing a model
-    mode    = 'full'
-    nframes = 1e3
-
-    # Build a model with random parameters
-    w, mu, va   = GM.gen_param(d, k, mode, spread = 3)
-    gm          = GM.fromvalues(w, mu, va)
-
-    # Sample nframes frames  from the model
-    X   = gm.sample(nframes)
-
-    # Plot the data
-    import pylab as P
-    P.plot(X[:, 0], X[:, 1], '.', label = '_nolegend_')
-
-    # Real confidence ellipses with confidence level 
-    level       = 0.50
-    h           = gm.plot(level=level)
-
-    # set the first ellipse label, which will appear in the legend
-    h[0].set_label('confidence ell at level ' + str(level))
-
-    P.legend(loc = 0)
-    P.show()

Deleted: trunk/Lib/sandbox/pyem/pyem/gmm_em.py
===================================================================
--- trunk/Lib/sandbox/pyem/pyem/gmm_em.py	2006-10-12 13:48:48 UTC (rev 2284)
+++ trunk/Lib/sandbox/pyem/pyem/gmm_em.py	2006-10-12 13:48:58 UTC (rev 2285)
@@ -1,394 +0,0 @@
-# /usr/bin/python
-# Last Change: Fri Oct 06 05:00 PM 2006 J
-
-# TODO:
-#   - which methods to avoid va shrinking to 0 ? There are several options, 
-#   not sure which ones are appropriates
-#   - improve EM trainer
-#   - online EM
-
-import numpy as N
-import numpy.linalg as lin
-from numpy.random import randn
-#import _c_densities as densities
-import densities
-from kmean import kmean
-from gauss_mix import GM
-
-# Error classes
-class GmmError(Exception):
-    """Base class for exceptions in this module."""
-    pass
-
-class GmmParamError:
-    """Exception raised for errors in gmm params
-
-    Attributes:
-        expression -- input expression in which the error occurred
-        message -- explanation of the error
-    """
-    def __init__(self, message):
-        self.message    = message
-    
-    def __str__(self):
-        return self.message
-
-# Not sure yet about how to design different mixture models. Most of the code 
-# is different # (pdf, update part of EM, etc...) and I am not sure it makes 
-# sense to use inheritance for # interface specification in python, since its 
-# dynamic type systeme.
-
-# Anyway, a mixture class should encapsulates all details concerning a mixture model:
-#   - internal parameters for the pdfs
-#   - can compute sufficient statistics for EM
-#   - can sample a model
-#   - can generate random valid parameters for a new pdf (using class method)
-class MixtureModel:
-    pass
-
-class ExpMixtureModel(MixtureModel):
-    """Class to model mixture of exponential pdf (eg Gaussian, exponential, Laplace, 
-    etc..). This is a special case because some parts of EM are common for those
-    models..."""
-    pass
-
-class GMM(ExpMixtureModel):
-    """ A class to model a Gaussian Mixture Model (GMM). An instance of 
-    this class is created by giving weights, mean and variances in the ctor.
-    An instanciated object can be sampled, trained by EM. 
-    
-    The class method gen_model can be used without instanciation."""
-
-    def init_kmean(self, data, niter = 5):
-        """ Init the model with kmean."""
-        k       = self.gm.k
-        d       = self.gm.d
-        init    = data[0:k, :]
-
-        (code, label)   = kmean(data, init, niter)
-
-        w   = N.ones(k) / k
-        mu  = code.copy()
-        if self.gm.mode == 'diag':
-            va = N.zeros((k, d))
-            for i in range(k):
-                for j in range(d):
-                    va[i,j] = N.cov(data[N.where(label==i), j], rowvar = 0)
-        elif self.gm.mode == 'full':
-            va  = N.zeros((k*d, d))
-            for i in range(k):
-                va[i*d:i*d+d,:] = \
-                    N.cov(data[N.where(label==i)], rowvar = 0)
-        else:
-            raise GmmParamError("mode " + str(mode) + " not recognized")
-
-        self.gm.set_param(w, mu, va)
-
-    def init_random(self, data):
-        """ Init the model at random."""
-        k   = self.gm.k
-        d   = self.gm.d
-        if mode == 'diag':
-            w   = N.ones(k) / k
-            mu  = randn(k, d)
-            va  = N.fabs(randn(k, d))
-        else:
-            raise GmmParamError("""init_random not implemented for
-                    mode %s yet""", mode)
-
-        self.gm.set_param(w, mu, va)
-
-    # TODO: 
-    #   - format of parameters ? For variances, list of variances matrix,
-    #   keep the current format, have 3d matrices ?
-    #   - To handle the different modes, we could do something "fancy" such as
-    #   replacing methods, to avoid checking cases everywhere and unconsistency.
-    def __init__(self, gm, init = 'kmean'):
-        """ Initialize a GMM with weight w, mean mu and variances va, and initialization
-        method for training init (kmean by default)"""
-        self.gm = gm
-
-        # Possible init methods
-        init_methods = {'kmean': self.init_kmean, 'random' : self.init_random}
-
-        if init not in init_methods:
-            raise GmmParamError('init method %s not recognized' + str(init))
-
-        self.init   = init_methods[init]
-
-    def sufficient_statistics(self, data):
-        """ Return normalized and non-normalized sufficient statistics
-        from the model.
-        
-        Computes the latent variable distribution (a 
-        posteriori probability) knowing the explicit data 
-        for the Gaussian model (w, mu, var): gamma(t, i) = 
-            P[state = i | observation = data(t); w, mu, va]
-
-        This is basically the E step of EM for GMM."""
-        n   = data.shape[0]
-
-        # compute the gaussian pdf
-        tgd	= multiple_gauss_den(data, self.gm.mu, self.gm.va)
-        # multiply by the weight
-        tgd	*= self.gm.w
-        # Normalize to get a pdf
-        gd	= tgd  / N.sum(tgd, axis=1)[:, N.newaxis]
-
-        return gd, tgd
-
-    def update_em(self, data, gamma):
-        """Computes update of the Gaussian Mixture Model (M step)
-        from the a posteriori pdf, computed by gmm_posterior
-        (E step).
-        """
-        k       = self.gm.k
-        d       = self.gm.d
-        n       = data.shape[0]
-        invn    = 1.0/n
-        mGamma  = N.sum(gamma, axis = 0)
-
-        if self.gm.mode == 'diag':
-            mu      = N.zeros((k, d))
-            va      = N.zeros((k, d))
-            gamma   = gamma.T
-            for c in range(k):
-                x   = N.dot(gamma[c:c+1,:], data)[0,:]
-                xx  = N.dot(gamma[c:c+1,:], data ** 2)[0,:]
-
-                mu[c,:] = x / mGamma[c]
-                va[c,:] = xx  / mGamma[c] - mu[c,:] ** 2
-            w   = invn * mGamma
-
-        elif self.gm.mode == 'full':
-            # In full mode, this is the bottleneck: the triple loop
-            # kills performances. This is pretty straightforward
-            # algebra, so computing it in C should not be too difficult
-            mu  = N.zeros((k, d))
-            va  = N.zeros((k*d, d))
-
-            gamma   = gamma.transpose()
-            for c in range(k):
-                #x   = N.sum(N.outer(gamma[:, c], 
-                #            N.ones((1, d))) * data, axis = 0)
-                x   = N.dot(gamma[c:c+1,:], data)[0,:]
-                xx  = N.zeros((d, d))
-                
-                # This should be much faster than recursing on n...
-                for i in range(d):
-                    for j in range(d):
-                        xx[i,j] = N.sum(data[:,i] * data[:,j] * gamma[c,:], axis = 0)
-
-                mu[c,:] = x / mGamma[c]
-                va[c*d:c*d+d,:] = xx  / mGamma[c] - \
-                                    N.outer(mu[c,:], mu[c,:])
-            w   = invn * mGamma
-        else:
-            raise GmmParamError("varmode not recognized")
-
-        self.gm.set_param(w, mu, va)
-
-class EM:
-    """An EM trainer. An EM trainer
-    trains from data, with a model
-    
-    Not really useful yet"""
-    def __init__(self):
-        pass
-    
-    def train(self, data, model, maxiter = 10, thresh = 1e-5):
-        """
-        Train a model using data, and stops when the likelihood fails
-        behind a threshold, or when the number of iterations > niter, 
-        whichever comes first
-
-        Args:
-            - data:     contains the observed features, one row is one frame, ie one 
-            observation of dimension d
-            - model:    object of class Mixture
-            - maxiter:  maximum number of iterations
-
-        The model is trained, and its parameters updated accordingly.
-
-        Returns:
-            likelihood (one value per iteration).
-        """
-
-        # Initialize the data (may do nothing depending on the model)
-        model.init(data)
-
-        # Likelihood is kept
-        like    = N.zeros(maxiter)
-
-        # Em computation, with computation of the likelihood
-        g, tgd      = model.sufficient_statistics(data)
-        like[0]     = N.sum(N.log(N.sum(tgd, 1)), axis = 0)
-        model.update_em(data, g)
-        for i in range(1, maxiter):
-            g, tgd      = model.sufficient_statistics(data)
-            like[i]     = N.sum(N.log(N.sum(tgd, 1)), axis = 0)
-            model.update_em(data, g)
-            if has_em_converged(like[i], like[i-1], thresh):
-                return like[0:i]
-
-        return like
-    
-# Misc functions
-def has_em_converged(like, plike, thresh):
-    """ given likelihood of current iteration like and previous
-    iteration plike, returns true is converged: based on comparison
-    of the slope of the likehood with thresh"""
-    diff    = N.abs(like - plike)
-    avg     = 0.5 * (N.abs(like) + N.abs(plike))
-    if diff / avg < thresh:
-        return True
-    else:
-        return False
-
-def multiple_gauss_den(data, mu, va):
-    """Helper function to generate several Gaussian
-    pdf (different parameters) from the same data"""
-    mu  = N.atleast_2d(mu)
-    va  = N.atleast_2d(va)
-
-    K   = mu.shape[0]
-    n   = data.shape[0]
-    d   = data.shape[1]
-    
-    y   = N.zeros((K, n))
-    if mu.size == va.size:
-        for i in range(K):
-            y[i] = densities.gauss_den(data, mu[i, :], va[i, :])
-        return y.T
-    else:
-        for i in range(K):
-            y[:, i] = densities.gauss_den(data, mu[i, :], 
-                        va[d*i:d*i+d, :])
-
-    return y
-
-if __name__ == "__main__":
-    import copy
-    #=============================
-    # Simple GMM with 5 components
-    #=============================
-
-    #+++++++++++++++++++++++++++++
-    # Meta parameters of the model
-    #   - k: Number of components
-    #   - d: dimension of each Gaussian
-    #   - mode: Mode of covariance matrix: full or diag
-    #   - nframes: number of frames (frame = one data point = one
-    #   row of d elements
-    k       = 2 
-    d       = 1
-    mode    = 'full'
-    nframes = 1e3
-
-    #+++++++++++++++++++++++++++++++++++++++++++
-    # Create an artificial GMM model, samples it
-    #+++++++++++++++++++++++++++++++++++++++++++
-    print "Generating the mixture"
-    # Generate a model with k components, d dimensions
-    w, mu, va   = GM.gen_param(d, k, mode, spread = 3)
-    gm          = GM(d, k, mode)
-    gm.set_param(w, mu, va)
-
-    # Sample nframes frames  from the model
-    data    = gm.sample(nframes)
-
-    #++++++++++++++++++++++++
-    # Learn the model with EM
-    #++++++++++++++++++++++++
-
-    # Init the model
-    print "Init a model for learning, with kmean for initialization"
-    lgm = GM(d, k, mode)
-    gmm = GMM(lgm, 'kmean')
-    gmm.init(data)
-
-    # Keep the initialized model for drawing
-    gm0 = copy.copy(lgm)
-
-    # The actual EM, with likelihood computation
-    niter   = 10
-    like    = N.zeros(niter)
-
-    print "computing..."
-    for i in range(niter):
-        g, tgd  = gmm.sufficient_statistics(data)
-        like[i] = N.sum(N.log(N.sum(tgd, 1)), axis = 0)
-        gmm.update_em(data, g)
-    # # Alternative form, by using EM class: as the EM class
-    # # is quite rudimentary now, it is not very useful, just save
-    # # a few lines
-    # em      = EM()
-    # like    = em.train(data, gmm, niter)
-
-    #+++++++++++++++
-    # Draw the model
-    #+++++++++++++++
-    print "drawing..."
-    import pylab as P
-    P.subplot(2, 1, 1)
-
-    if not d == 1:
-        # Draw what is happening
-        P.plot(data[:, 0], data[:, 1], '.', label = '_nolegend_')
-
-        # Real confidence ellipses
-        Xre, Yre  = gm.conf_ellipses()
-        P.plot(Xre[0], Yre[0], 'g', label = 'true confidence ellipsoides')
-        for i in range(1,k):
-            P.plot(Xre[i], Yre[i], 'g', label = '_nolegend_')
-
-        # Initial confidence ellipses as found by kmean
-        X0e, Y0e  = gm0.conf_ellipses()
-        P.plot(X0e[0], Y0e[0], 'k', label = 'initial confidence ellipsoides')
-        for i in range(1,k):
-            P.plot(X0e[i], Y0e[i], 'k', label = '_nolegend_')
-
-        # Values found by EM
-        Xe, Ye  = lgm.conf_ellipses()
-        P.plot(Xe[0], Ye[0], 'r', label = 'confidence ellipsoides found by EM')
-        for i in range(1,k):
-            P.plot(Xe[i], Ye[i], 'r', label = '_nolegend_')
-        P.legend(loc = 0)
-    else:
-        # Real confidence ellipses
-        h   = gm.plot1d()
-        [i.set_color('g') for i in h['pdf']]
-        h['pdf'][0].set_label('true pdf')
-
-        # Initial confidence ellipses as found by kmean
-        h0  = gm0.plot1d()
-        [i.set_color('k') for i in h0['pdf']]
-        h0['pdf'][0].set_label('initial pdf')
-
-        # Values found by EM
-        hl  = lgm.plot1d(fill = 1, level = 0.66)
-        [i.set_color('r') for i in hl['pdf']]
-        hl['pdf'][0].set_label('pdf found by EM')
-
-        P.legend(loc = 0)
-
-    P.subplot(2, 1, 2)
-    P.plot(like)
-    P.title('log likelihood')
-
-    # #++++++++++++++++++
-    # # Export the figure
-    # #++++++++++++++++++
-    # F   = P.gcf()
-    # DPI = F.get_dpi()
-    # DefaultSize = F.get_size_inches()
-    # # the default is 100dpi for savefig:
-    # F.savefig("example1.png")
-
-    # # Now make the image twice as big, while keeping the fonts and all the
-    # # same size
-    # F.set_figsize_inches( (DefaultSize[0]*2, DefaultSize[1]*2) )
-    # Size = F.get_size_inches()
-    # print "Size in Inches", Size
-    # F.savefig("example2.png")
-    P.show()

Deleted: trunk/Lib/sandbox/pyem/pyem/info.py
===================================================================
--- trunk/Lib/sandbox/pyem/pyem/info.py	2006-10-12 13:48:48 UTC (rev 2284)
+++ trunk/Lib/sandbox/pyem/pyem/info.py	2006-10-12 13:48:58 UTC (rev 2285)
@@ -1,35 +0,0 @@
-"""
-Routines for Gaussian Mixture Models
-and learning with Expectation Maximization 
-==========================================
-
-This module contains classes and function to compute multivariate Gaussian densities
-(diagonal and full covariance matrices), Gaussian mixtures, Gaussian mixtures models
-and an Em trainer.
-
-More specifically, the module contains the following file:
-
-- densities.py: functions to compute multivariate Gaussian pdf and ellipsoides of
-confidence (gauss_den)
-- gauss_mix.py: defines the GM (Gaussian Mixture) class. A Gaussian Mixture can be
-created from its parameters weights, mean and variances, or from its meta parameters
-d (dimension of the Gaussian) and k (number of components in the mixture). A Gaussian
-Model can then be sampled or plot (if d>1, plot confidence ellipsoids projected on 
-2 chosen dimensions, if d == 1, plot the pdf of each component and fill the zone
-of confidence for a given level)
-- gmm_em.py: defines a class GMM (Gaussian Mixture Model). This class is constructed
-from a GM model gm, and can be used to train gm. The GMM can be initiated by
-kmean or at random, and can compute sufficient statistics, and update its parameters
-from the sufficient statistics.
-- kmean.py: implements a kmean algorithm. We cannot use scipy.cluster.vq kmeans, since
-its does not give membership of observations.
-
-Example of use: simply execute gmm_em.py for an example of training a GM and plotting
-the results compared to true values
-
-Copyright: David Cournapeau 2006
-License: BSD-style (see LICENSE.txt in main source directory)
-"""
-
-depends = ['linalg', 'stats']
-ignore  = False

Deleted: trunk/Lib/sandbox/pyem/pyem/kmean.py
===================================================================
--- trunk/Lib/sandbox/pyem/pyem/kmean.py	2006-10-12 13:48:48 UTC (rev 2284)
+++ trunk/Lib/sandbox/pyem/pyem/kmean.py	2006-10-12 13:48:58 UTC (rev 2285)
@@ -1,76 +0,0 @@
-# /usr/bin/python
-# Last Change: Thu Sep 28 01:00 PM 2006 J
-
-#TODO:
-#   - a demo for kmeans
-
-import numpy as N
-
-def _py_vq(data, code):
-    """ Please do not use directly. Use kmean instead"""
-    # No attempt to be efficient has been made...
-    (n, d)  = data.shape
-    (k, d)  = code.shape
-
-    label   = N.zeros(n, int)
-    for i in range(n):
-        d           = N.sum((data[i, :] - code) ** 2, 1)
-        label[i]    = N.argmin(d)
-
-    return label
-    
-# Try to import pyrex function for vector quantization. If not available,
-# falls back on pure python implementation.
-#%KMEANIMPORT%
-#try:
-#    from scipy.cluster.vq import kmeans as kmean
-#except ImportError:
-#    try:
-#        from c_gmm import _vq
-#    except:
-#        print """c_gmm._vq not found, using pure python implementation instead. 
-#        Kmean will be REALLY slow"""
-#        _vq = _py_vq
-try:
-    from scipy.cluster.vq import vq
-    print "using scipy.cluster.vq"
-    def _vq(*args, **kw): return vq(*args, **kw)[0]
-except ImportError:
-    try:
-        from c_gmm import _vq
-        print "using pyrex vq"
-    except ImportError:
-        print """c_gmm._vq not found, using pure python implementation instead. 
-        Kmean will be REALLY slow"""
-        _vq = _py_vq
-
-def kmean(data, init, iter = 10):
-    """Simple kmean implementation for EM. Runs iter iterations.
-    
-    returns a tuple (code, label), where code are the final
-    centroids, and label are the class label indec for each
-    frame (ie row) of data"""
-
-    data    = N.atleast_2d(data)
-    init    = N.atleast_2d(init)
-
-    (n, d)  = data.shape
-    (k, d1) = init.shape
-
-    if not d == d1:
-        msg = "data and init centers do not have same dimensions..."
-        raise GmmParamError(msg)
-    
-    code    = N.asarray(init.copy())
-    for i in range(iter):
-        # Compute the nearest neighbour for each obs
-        # using the current code book
-        label   = _vq(data, code)
-        # Update the code by computing centroids using the new code book
-        for j in range(k):
-            code[j,:] = N.mean(data[N.where(label==j)], axis=0) 
-
-    return code, label
-
-if __name__ == "__main__":
-    pass

Deleted: trunk/Lib/sandbox/pyem/pyem/online_em.py
===================================================================
--- trunk/Lib/sandbox/pyem/pyem/online_em.py	2006-10-12 13:48:48 UTC (rev 2284)
+++ trunk/Lib/sandbox/pyem/pyem/online_em.py	2006-10-12 13:48:58 UTC (rev 2285)
@@ -1,369 +0,0 @@
-# /usr/bin/python
-# Last Change: Thu Oct 12 09:00 PM 2006 J
-
-#---------------------------------------------
-# This is not meant to be used yet !!!! I am 
-# not sure how to integrate this stuff inside
-# the package yet. The cases are:
-#   - we have a set of data, and we want to test online EM 
-#   compared to normal EM 
-#   - we do not have all the data before putting them in online EM:
-#   eg current frame depends on previous frame in some way.
-
-import numpy as N
-from numpy import repmat, mean
-from numpy.testing import assert_array_almost_equal, assert_array_equal
-
-from gmm_em import ExpMixtureModel, GMM, EM, multiple_gauss_den
-from gauss_mix import GM
-from kmean import kmean
-
-print "======================================================"
-import traceback
-f = traceback.extract_stack()
-print f[0][0] + " This is beta code, don't use it !        "
-print "======================================================"
-
-# Error classes
-class OnGmmError(Exception):
-    """Base class for exceptions in this module."""
-    pass
-
-class OnGmmParamError:
-    """Exception raised for errors in gmm params
-
-    Attributes:
-        expression -- input expression in which the error occurred
-        message -- explanation of the error
-    """
-    def __init__(self, message):
-        self.message    = message
-    
-    def __str__(self):
-        return self.message
-
-class OnGMM(ExpMixtureModel):
-    def init_kmean(self, init_data, niter = 5):
-        """ Init the model at random."""
-        k   = self.gm.k
-        d   = self.gm.d
-        if mode == 'diag':
-            w           = N.ones(k) / k
-
-            # Init the internal state of EM
-            self.cx     = N.outer(w, mean(init_data, 0))
-            self.cxx    = N.outer(w, mean(init_data ** 2, 0))
-
-            # w, mu and va init is the same that in the standard case
-            (code, label)   = kmean(init_data, init_data[0:k, :], niter)
-            mu          = code.copy()
-            va          = N.zeros((k, d))
-            for i in range(k):
-                for j in range(d):
-                    va [i,j] = N.cov(init_data[N.where(label==i), j], rowvar = 0)
-        else:
-            raise OnGmmParamError("""init_online not implemented for
-                    mode %s yet""", mode)
-
-        self.gm.set_param(w, mu, va)
-        self.cw     = w
-        self.cmu    = mu
-        self.cva    = va
-
-        self.pw     = self.cw.copy()
-        self.pmu    = self.cmu.copy()
-        self.pva    = self.cva.copy()
-
-    def __init__(self, gm, init_data, init = 'kmean'):
-        self.gm = gm
-        
-        # Possible init methods
-        init_methods = {'kmean' : self.init_kmean}
-
-        self.init   = init_methods[init]
-
-    def sufficient_statistics(self, frame, nu):
-        """ sufficient_statistics(frame, nu)
-        
-        frame has to be rank 2 !"""
-        gamma   = multiple_gauss_den(frame, self.pmu, self.pva)[0]
-        gamma   *= self.pw
-        gamma   /= N.sum(gamma)
-        # <1>(t) = cw(t), each column is one component cw = (cw1, ..., cwK);
-        self.cw	= (1 - nu) * self.cw + nu * gamma
-
-        return gamma
-
-    def update_em(self, frame, gamma, nu):
-        cx  = self.cx
-        cxx = self.cxx
-        cmu = self.cmu
-        cva = self.cva
-        for k in range(self.gm.k):
-            cx[k]   = (1 - nu) * cx[k] + nu * frame * gamma[k]
-            cxx[k]  = (1 - nu) * cxx[k] + nu * frame ** 2 * gamma[k]
-
-            cmu[k]  = cx[k] / cw[k]
-            cva[k]  = cxx[k] / cw[k] - cmu[k] ** 2
-    
-if __name__ == "__main__":
-    import copy
-    #=============================
-    # Simple GMM with 2 components
-    #=============================
-
-    #+++++++++++++++++++++++++++++
-    # Meta parameters of the model
-    #   - k: Number of components
-    #   - d: dimension of each Gaussian
-    #   - mode: Mode of covariance matrix: full or diag
-    #   - nframes: number of frames (frame = one data point = one
-    #   row of d elements
-    k       = 2 
-    d       = 1
-    mode    = 'diag'
-    nframes = int(1e3)
-    emiter  = 2
-
-    #+++++++++++++++++++++++++++++++++++++++++++
-    # Create an artificial GMM model, samples it
-    #+++++++++++++++++++++++++++++++++++++++++++
-    print "Generating the mixture"
-    # Generate a model with k components, d dimensions
-    w, mu, va   = GM.gen_param(d, k, mode, spread = 1.5)
-    gm          = GM.fromvalues(w, mu, va)
-
-    # Sample nframes frames  from the model
-    data    = gm.sample(nframes, )
-
-    #++++++++++++++++++++++++
-    # Learn the model with EM
-    #++++++++++++++++++++++++
-
-    # Init the model
-    print "Init a model for learning, with kmean for initialization"
-    lgm = GM(d, k, mode)
-    gmm = GMM(lgm, 'kmean')
-    gmm.init(data)
-
-    # Keep the initialized model for drawing
-    gm0 = copy.copy(lgm)
-
-    # The actual EM, with likelihood computation
-    like    = N.zeros(emiter)
-    print "computing..."
-    for i in range(emiter):
-        g, tgd  = gmm.sufficient_statistics(data)
-        like[i] = N.sum(N.log(N.sum(tgd, 1)), axis = 0)
-        gmm.update_em(data, g)
-
-    #+++++++++++++++++++++++++++++++
-    # Learn the model with Online EM
-    #+++++++++++++++++++++++++++++++
-    ogm     = GM(d, k, mode)
-    ogmm    = OnGMM(ogm, 'kmean')
-    #init_data   = data[0:nframes / 20, :]
-    init_data   = data
-    ogmm.init(init_data)
-
-    ogm2    = GM(d, k, mode)
-    ogmm2   = OnGMM(ogm2, 'kmean')
-    #init_data   = data[0:nframes / 20, :]
-    init_data   = data
-    ogmm2.init(init_data)
-
-    ogm0    = copy.copy(ogm)
-    assert_array_equal(ogm0.w, gm0.w)
-    assert_array_equal(ogm0.mu, gm0.mu)
-    assert_array_equal(ogm0.va, gm0.va)
-
-    ogm02   = copy.copy(ogm2)
-    assert_array_equal(ogm02.w, gm0.w)
-    assert_array_equal(ogm02.mu, gm0.mu)
-    assert_array_equal(ogm02.va, gm0.va)
-
-    w0  = gm0.w.copy()
-    mu0 = gm0.mu.copy()
-    va0 = gm0.va.copy()
-
-    cx  = ogmm.cx
-    cxx = ogmm.cxx
-    
-    cw  = w0.copy()
-    cmu = mu0.copy()
-    cva = va0.copy()
-    
-    pw  = cw.copy()
-    pmu = cmu.copy()
-    pva = cva.copy()
-
-    # Forgetting param
-    ku		= 0.005
-    t0		= 200
-    lamb	= N.ones((nframes, 1))
-    lamb[0] = 0
-    nu0		= 1.0
-    #lamb	= 1 - 1/(N.arange(-1, nframes-1) * ku + t0)
-    #nu0		= 0.2
-    nu		= N.zeros((len(lamb), 1))
-    nu[0]	= nu0
-    for i in range(1, len(lamb)):
-        nu[i]	= 1./(1 + lamb[i] / nu[i-1])
-
-    gamma   = N.zeros((nframes, k))
-    agamma  = []
-    apw     = []
-    apmu    = []
-    apva    = []
-    print "========== Online Manual ==========="
-    for e in range(emiter):
-        print "online manual: estep %d, printing p* state " % e
-        apw.append(pw.copy())
-        apmu.append(pmu.copy())
-        apva.append(pva.copy())
-        for t in range(nframes):
-            gamma[t]    = multiple_gauss_den(data[t:t+1, :], pmu, pva)[0]
-            gamma[t]    *= pw
-            gamma[t]    /= N.sum(gamma[t])
-            # <1>(t) = cw(t), each column is one component cw = (cw1, ..., cwK);
-            # <x>(t) = cx(t)
-            cw	= (1 - nu[t]) * cw + nu[t] * gamma[t]
-            # loop through each component
-            for i in range(k):
-                cx[i]   = (1 - nu[t]) * cx[i] + nu[t] * data[t, :] * gamma[t, i]
-                cxx[i]  = (1 - nu[t]) * cxx[i] + nu[t] * data[t, :] ** 2 * gamma[t, i]
-
-                cmu[i]  = cx[i] / cw[i]
-                cva[i]  = cxx[i] / cw[i] - cmu[i] ** 2
-
-            #pw  = cw.copy()
-            #pmu = cmu.copy()
-            #pva = cva.copy()
-        print "gamma[end]: " + str(gamma[-1])
-        pw  = cw.copy()
-        pmu = cmu.copy()
-        pva = cva.copy()
-        agamma.append(gamma.copy())
-
-    gamma2  = N.zeros((nframes, k))
-    agamma2 = []
-    apw2    = []
-    apmu2   = []
-    apva2   = []
-    print "========== Online Automatic ==========="
-    for e in range(emiter):
-        print "online automatic: estep %d, printing p* state " % e
-        apw2.append(ogmm2.pw.copy())
-        apmu2.append(ogmm2.pmu.copy())
-        apva2.append(ogmm2.pva.copy())
-        for t in range(nframes):
-            gamma2[t]   = ogmm2.sufficient_statistics(data[t:t+1, :], nu[t])
-            #gamma2[t]   = multiple_gauss_den(data[t:t+1, :], ogmm2.pmu, ogmm2.pva)[0]
-            #gamma2[t]   *= ogmm2.pw
-            #gamma2[t]   /= N.sum(gamma2[t])
-            #try:
-            #    assert_array_equal(agamma, gamma2[t])
-            #except AssertionError:
-            #    print "error for estep %d, step %d" % (e, t)
-            #    print ogmm2.pw
-            #    print ogmm2.pmu
-            #    print ogmm2.pva
-            #    raise 
-            ogmm2.update_em(data[t, :], gamma2[t], nu[t])
-            #ogmm2.cw	= (1 - nu[t]) * ogmm2.cw + nu[t] * agamma
-            ## loop through each component
-            #for i in range(k):
-            #    ogmm2.cx[i]   = (1 - nu[t]) * ogmm2.cx[i] + nu[t] * data[t, :] * agamma[i]
-            #    ogmm2.cxx[i]  = (1 - nu[t]) * ogmm2.cxx[i] + nu[t] * data[t, :] ** 2 * agamma[i]
-
-            #    ogmm2.cmu[i]  = ogmm2.cx[i] / ogmm2.cw[i]
-            #    ogmm2.cva[i]  = ogmm2.cxx[i] / ogmm2.cw[i] - ogmm2.cmu[i] ** 2
-
-        print "gamma[end]: " + str(gamma2[-1])
-        agamma2.append(gamma2.copy())
-        ogmm2.pw  = ogmm2.cw.copy()
-        ogmm2.pmu = ogmm2.cmu.copy()
-        ogmm2.pva = ogmm2.cva.copy()
-
-    # #ogm.set_param(pw, pmu, pva)
-    # print "weights off vs on: \n" + str(lgm.w) + "\n " + str(cw)
-    # print "mean off vs on: \n" + str(lgm.mu) + "\n " + str(cmu)
-    # print "variances off vs on: \n" + str(lgm.va) + "\n " + str(cva)
-    # print "weights off vs on2: \n" + str(lgm.w) + "\n " + str(ogmm2.cw)
-    # print "mean off vs on2: \n" + str(lgm.mu) + "\n " + str(ogmm2.cmu)
-    # print "variances off vs on2: \n" + str(lgm.va) + "\n " + str(ogmm2.cva)
-    # assert_array_almost_equal(cw, lgm.w)
-    # assert_array_almost_equal(cmu, lgm.mu)
-    # assert_array_almost_equal(cva, lgm.va)
-    assert_array_equal(ogmm2.pw, pw)
-    assert_array_equal(ogmm2.pmu, pmu)
-    assert_array_equal(ogmm2.pva, pva)
-    assert_array_equal(agamma[0], agamma2[0])
-    #assert_array_almost_equal(ogmm2.cw, lgm.w)
-    #assert_array_almost_equal(ogmm2.cmu, lgm.mu)
-    #assert_array_almost_equal(ogmm2.cva, lgm.va)
-    # #+++++++++++++++
-    # # Draw the model
-    # #+++++++++++++++
-    # print "drawing..."
-    # import pylab as P
-    # P.subplot(2, 1, 1)
-
-    # if d == 1:
-    #     # Real confidence ellipses
-    #     h   = gm.plot1d()
-    #     [i.set_color('g') for i in h['pdf']]
-    #     h['pdf'][0].set_label('true pdf')
-
-    #     # Initial confidence ellipses as found by kmean
-    #     h0  = gm0.plot1d()
-    #     [i.set_color('k') for i in h0['pdf']]
-    #     h0['pdf'][0].set_label('initial pdf')
-
-    #     # Values found by EM
-    #     hl  = lgm.plot1d(fill = 1, level = 0.66)
-    #     [i.set_color('r') for i in hl['pdf']]
-    #     hl['pdf'][0].set_label('pdf found by EM')
-
-    #     # Values found by Online EM
-    #     ho  = ogm.plot1d(fill = 1, level = 0.66)
-    #     [i.set_color('b') for i in ho['pdf']]
-    #     ho['pdf'][0].set_label('pdf found by online EM')
-
-    #     P.legend(loc = 0)
-    # else:
-    #     # Draw what is happening
-    #     P.plot(data[:, 0], data[:, 1], '.', label = '_nolegend_')
-
-    #     # Real confidence ellipses
-    #     Xre, Yre  = gm.conf_ellipses()
-    #     P.plot(Xre[0], Yre[0], 'g', label = 'true confidence ellipsoides')
-    #     for i in range(1,k):
-    #         P.plot(Xre[i], Yre[i], 'g', label = '_nolegend_')
-
-    #     # Initial confidence ellipses as found by kmean
-    #     X0e, Y0e  = gm0.conf_ellipses()
-    #     P.plot(X0e[0], Y0e[0], 'k', label = 'initial confidence ellipsoides')
-    #     for i in range(1,k):
-    #         P.plot(X0e[i], Y0e[i], 'k', label = '_nolegend_')
-
-    #     # Values found by EM
-    #     Xe, Ye  = lgm.conf_ellipses()
-    #     P.plot(Xe[0], Ye[0], 'r', label = 'confidence ellipsoides found by EM')
-    #     for i in range(1,k):
-    #         P.plot(Xe[i], Ye[i], 'r', label = '_nolegend_')
-    #     P.legend(loc = 0)
-
-    #     # Values found by Online EM
-    #     Xe, Ye  = ogm.conf_ellipses()
-    #     P.plot(Xe[0], Ye[0], 'm', label = 'confidence ellipsoides found by Online EM')
-    #     for i in range(1,k):
-    #         P.plot(Xe[i], Ye[i], 'm', label = '_nolegend_')
-    #     P.legend(loc = 0)
-
-
-    # P.subplot(2, 1, 2)
-    # P.plot(like)
-    # P.title('log likelihood')
-
-    # P.show()

Deleted: trunk/Lib/sandbox/pyem/pyem/profile_data/profile_densities.py
===================================================================
--- trunk/Lib/sandbox/pyem/pyem/profile_data/profile_densities.py	2006-10-12 13:48:48 UTC (rev 2284)
+++ trunk/Lib/sandbox/pyem/pyem/profile_data/profile_densities.py	2006-10-12 13:48:58 UTC (rev 2285)
@@ -1,54 +0,0 @@
-import numpy as N
-from numpy.random import randn
-from pyem import densities as D
-from pyem import _c_densities as DC
-#import tables
-
-def bench(func, mode = 'diag'):
-    #===========================================
-    # Diag Gaussian of dimension 20
-    #===========================================
-    d       = 30
-    n       = 1e5
-    niter   = 10
-
-    print "Compute %d times densities, %d dimension, %d frames" % (niter, d, n)
-    # Generate a model with k components, d dimensions
-    mu  = randn(1, d)
-    if mode == 'diag':
-        va  = abs(randn(1, d))
-    elif mode == 'full':
-        va  = randn(d, d)
-        va  = N.dot(va, va.transpose())
-
-    X   = randn(n, d)
-    for i in range(niter):
-        Y   = func(X, mu, va)
-
-def benchpy():
-    bench(D.gauss_den)
-
-def benchc():
-    bench(DC.gauss_den)
-
-def benchpyfull():
-    bench(D.gauss_den, 'full')
-
-def benchcfull():
-    bench(DC.gauss_den, 'full')
-
-if __name__ == "__main__":
-    import hotshot, hotshot.stats
-    profile_file    = 'gdenpy.prof'
-    prof    = hotshot.Profile(profile_file, lineevents=1)
-    prof.runcall(benchpy)
-    p = hotshot.stats.load(profile_file)
-    print p.sort_stats('cumulative').print_stats(20)
-    prof.close()
-
-    profile_file    = 'gdenc.prof'
-    prof    = hotshot.Profile(profile_file, lineevents=1)
-    prof.runcall(benchc)
-    p = hotshot.stats.load(profile_file)
-    print p.sort_stats('cumulative').print_stats(20)
-    prof.close()

Deleted: trunk/Lib/sandbox/pyem/pyem/profile_data/profile_gmm.py
===================================================================
--- trunk/Lib/sandbox/pyem/pyem/profile_data/profile_gmm.py	2006-10-12 13:48:48 UTC (rev 2284)
+++ trunk/Lib/sandbox/pyem/pyem/profile_data/profile_gmm.py	2006-10-12 13:48:58 UTC (rev 2285)
@@ -1,70 +0,0 @@
-import numpy as N
-from pyem import GM, GMM
-import copy
-
-from pyem._c_densities import gauss_den
-
-def bench1(mode = 'diag'):
-    #===========================================
-    # GMM of 20 comp, 20 dimension, 1e4 frames
-    #===========================================
-    d       = 15
-    k       = 30
-    nframes = 1e4
-    niter   = 10
-    mode    = 'diag'
-
-    print "============================================================="
-    print "(%d dim, %d components) GMM with %d iterations, for %d frames" \
-            % (d, k, niter, nframes)
-
-    #+++++++++++++++++++++++++++++++++++++++++++
-    # Create an artificial GMM model, samples it
-    #+++++++++++++++++++++++++++++++++++++++++++
-    print "Generating the mixture"
-    # Generate a model with k components, d dimensions
-    w, mu, va   = GM.gen_param(d, k, mode, spread = 3)
-    # gm          = GM(d, k, mode)
-    # gm.set_param(w, mu, va)
-    gm          = GM.fromvalues(w, mu, va)
-
-    # Sample nframes frames  from the model
-    data    = gm.sample(nframes)
-
-    #++++++++++++++++++++++++
-    # Learn the model with EM
-    #++++++++++++++++++++++++
-
-    # Init the model
-    print "Init a model for learning, with kmean for initialization"
-    lgm = GM(d, k, mode)
-    gmm = GMM(lgm, 'kmean')
-    
-    gmm.init(data)
-    # Keep the initialized model for drawing
-    gm0 = copy.copy(lgm)
-
-    # The actual EM, with likelihood computation
-    like    = N.zeros(niter)
-
-    print "computing..."
-    for i in range(niter):
-        print "iteration %d" % i
-        g, tgd  = gmm.sufficient_statistics(data)
-        like[i] = N.sum(N.log(N.sum(tgd, 1)))
-        gmm.update_em(data, g)
-
-if __name__ == "__main__":
-    import hotshot, hotshot.stats
-    profile_file    = 'gmm.prof'
-    prof    = hotshot.Profile(profile_file, lineevents=1)
-    prof.runcall(bench1)
-    p = hotshot.stats.load(profile_file)
-    print p.sort_stats('cumulative').print_stats(20)
-    prof.close()
-    # import profile
-    # profile.run('bench1()', 'gmmprof')
-    # import pstats
-    # p = pstats.Stats('gmmprof')
-    # print p.sort_stats('cumulative').print_stats(20)
-

Deleted: trunk/Lib/sandbox/pyem/pyem/setup.py
===================================================================
--- trunk/Lib/sandbox/pyem/pyem/setup.py	2006-10-12 13:48:48 UTC (rev 2284)
+++ trunk/Lib/sandbox/pyem/pyem/setup.py	2006-10-12 13:48:58 UTC (rev 2285)
@@ -1,142 +0,0 @@
-#! /usr/bin/env python
-# Last Change: Fri Oct 06 09:00 PM 2006 J
-# TODO:
-#   - check how to handle cmd line build options with distutils and use
-#   it in the building process
-
-""" pyem is a small python package to estimate Gaussian Mixtures Models
-from data, using Expectation Maximization"""
-
-from os.path import join
-from info import version as pyem_version
-
-DISTNAME    = 'pyem' 
-VERSION     = pyem_version
-DESCRIPTION ='A python module for Expectation Maximization learning of mixtures pdf',
-AUTHOR      ='David Cournapeau',
-AUTHOR_EMAIL='david at ar.media.kyoto-u.ac.jp',
-URL         ='http://ar.media.kyoto-u.ac.jp/members/david',
-
-def configuration(parent_package='',top_path=None, package_name='pyem'):
-    from numpy.distutils.misc_util import Configuration
-    config = Configuration(package_name,parent_package,top_path,
-             version     = VERSION)
-    config.add_data_dir('tests')
-    config.add_subpackage('profile_data')
-    config.add_extension('c_gden',
-                         #define_macros=[('LIBSVM_EXPORTS', None),
-                         #               ('LIBSVM_DLL', None)],
-                         sources=[join('src', 'c_gden.c')])
-
-    return config
-
-if __name__ == "__main__":
-    from numpy.distutils.core import setup
-    #setup(**configuration(top_path='').todict())
-    setup(**configuration(top_path='',
-                          package_name='scipy.sandbox.pyem').todict())
-# from distutils.core import setup, Extension
-# from pyem import version as pyem_version
-# 
-# # distutils does not update MANIFEST correctly, removes it
-# import os
-# if os.path.exists('MANIFEST'): os.remove('MANIFEST')
-# from os.path import join
-# 
-# import re
-# 
-# from numpy.distutils.misc_util import get_numpy_include_dirs
-# NUMPYINC    = get_numpy_include_dirs()[0]
-# 
-# # General variables:
-# #   - DISTNAME: name of the distributed package
-# #   - VERSION: the version reference is in pyem/__init__.py file
-# #   - other upper cased variables are the same than the corresponding 
-# #   keywords in setup call
-# DISTNAME    = 'pyem' 
-# VERSION     = pyem_version
-# DESCRIPTION ='A python module for Expectation Maximization learning of mixtures pdf',
-# AUTHOR      ='David Cournapeau',
-# AUTHOR_EMAIL='david at ar.media.kyoto-u.ac.jp',
-# URL         ='http://ar.media.kyoto-u.ac.jp/members/david',
-# 
-# # Source files for extensions
-# 
-# # Functions used to substitute values in File.
-# # Mainly use to replace config.h capabilities
-# def do_subst_in_file(sourcefile, targetfile, dict):
-#     """Replace all instances of the keys of dict with their values.
-#     For example, if dict is {'%VERSION%': '1.2345', '%BASE%': 'MyProg'},
-#     then all instances of %VERSION% in the file will be replaced with 1.2345 etc.
-#     """
-#     try:
-#         f = open(sourcefile, 'rb')
-#         contents = f.read()
-#         f.close()
-#     except:
-#         raise IOError, "Can't read source file %s"%sourcefile
-# 
-#     for (k,v) in dict.items():
-#         contents = re.sub(k, v, contents)
-#     try:
-#         f = open(targetfile, 'wb')
-#         f.write(contents)
-#         f.close()
-#     except:
-#         raise IOError, "Can't read source file %s"%sourcefile
-#     return 0 # success
-#  
-# class SetupOption:
-#     def __init__(self):
-#         self.kmean      = 'py'
-#         self.ext_modules= [Extension(join('pyem', 'c_gden'),
-#                               sources=[join('pyem', 'src', 'c_gden.c')]) ]
-#         self.cmdclass   = {}
-#         self.subsdic     = {'%KMEANIMPORT%': []}
-# 
-#     def _config_kmean(self):
-#         # Check in this order:
-#         #   - kmean in scipy.cluster,
-#         #   - custom vq with pyrex 
-#         #   - custom pure python vq
-#         #try:
-#         #    from scipy.cluster.vq import kmeans
-#         #    self.kmean  = 'scipy'
-#         #    #self.subsdic['%KMEANIMPORT%']   = scipy_kmean
-#         #except ImportError:
-#         #    try:
-#         #        from Pyrex.Distutils import build_ext
-#         #        self.kmean  = 'pyrex'
-#         #        self.ext_modules.append(Extension('pyem/c_gmm', 
-#         #            ['pyem/src/c_gmm.pyx'], include_dirs=[NUMPYINC]))
-#         #        self.cmdclass['build_ext']  = build_ext
-#         #        #self.subsdic['%KMEANIMPORT%']   = pyrex_kmean
-#         #    except ImportError:
-#         #        self.kmean  = 'py'
-#         #        #self.subsdic['%KMEANIMPORT%']   = pyrex_kmean
-#         try:
-#             from Pyrex.Distutils import build_ext
-#             self.kmean  = 'pyrex'
-#             self.ext_modules.append(Extension('pyem/c_gmm', 
-#                 ['pyem/src/c_gmm.pyx'], include_dirs=[NUMPYINC]))
-#             self.cmdclass['build_ext']  = build_ext
-#             #self.subsdic['%KMEANIMPORT%']   = pyrex_kmean
-#         except ImportError:
-#             self.kmean  = 'py'
-#             #self.subsdic['%KMEANIMPORT%']   = pyrex_kmean
-#     def setup(self):
-#         self._config_kmean()
-#         #import time
-#         #do_subst_in_file('pyem/kmean.py.in', 'pyem/kmean.py', self.subsdic)
-#         setup(name      = DISTNAME,
-#             version     = VERSION,
-#             description = DESCRIPTION,
-#             author      = AUTHOR,
-#             author_email= AUTHOR_EMAIL,
-#             url         = URL,
-#             packages    = ['pyem', 'pyem.tests', 'pyem.profile_data'],
-#             ext_modules = self.ext_modules,
-#             cmdclass    = self.cmdclass)
-# 
-# stpobj  = SetupOption()
-# stpobj.setup()

Deleted: trunk/Lib/sandbox/pyem/pyem/src/Makefile
===================================================================
--- trunk/Lib/sandbox/pyem/pyem/src/Makefile	2006-10-12 13:48:48 UTC (rev 2284)
+++ trunk/Lib/sandbox/pyem/pyem/src/Makefile	2006-10-12 13:48:58 UTC (rev 2285)
@@ -1,32 +0,0 @@
-CC	= colorgcc
-LD	= gcc
-
-PYREX	= python2.4-pyrexc
-
-PYTHONINC	= -I/usr/include/python2.4
-NUMPYINC	= -I/usr/lib/python2.4/site-packages/numpy/core/include
-OPTIMS		= -O3 -funroll-all-loops -march=pentium4 -msse2
-WARN		= -W -Wall
-
-CFLAGS	= $(PYTHONINC) $(NUMPYINC) $(OPTIMS) $(WARN)
-
-c_gden.so: c_gden.o
-	$(LD) -shared -o $@ $< -Wl,-soname,$@
-
-c_gden.o: c_gden.c
-	$(CC) -c $(CFLAGS) -fPIC $<
-
-c_gmm.so: c_gmm.o
-	$(LD) -shared -o $@ $< -Wl,-soname,$@
-
-c_gmm.o: c_gmm.c
-	$(CC) -c $(CFLAGS) -fPIC $<
-
-c_gmm.c: c_gmm.pyx c_numpy.pxd c_python.pxd
-	$(PYREX) $<
-
-clean:
-	rm -f c_gmm.c
-	rm -f *.o
-	rm -f *.so
-	rm -f *.pyc

Deleted: trunk/Lib/sandbox/pyem/pyem/src/c_gden.c
===================================================================
--- trunk/Lib/sandbox/pyem/pyem/src/c_gden.c	2006-10-12 13:48:48 UTC (rev 2284)
+++ trunk/Lib/sandbox/pyem/pyem/src/c_gden.c	2006-10-12 13:48:58 UTC (rev 2285)
@@ -1,37 +0,0 @@
-#include <stddef.h>
-#include <stdio.h>
-#include <math.h>
-
-/*
- * use va for caching, so its content is modified
- */
-int gden_diag(const double *in, const size_t n, const size_t d, 
-        const double* mu, double* va, double* out)
-{
-    size_t  j, i;
-    double  fac = 1.0;
-    double  acc, tmp;
-
-    /*
-     * Cache some precomputing
-     */
-    for(j = 0; j < d; ++j) {
-        va[j]   = 0.5/va[j];
-        fac     *= sqrt(va[j] / (M_PI) );
-        va[j]   *= -1.0;
-    }
-
-    /*
-     * actual computing
-     */
-    for(i = 0; i < n; ++i) {
-        acc = 0;
-        for(j = 0; j < d; ++j) {
-            tmp = (in[i *d + j] - mu[j]);
-            acc += tmp * tmp * va[j];
-        }
-        out[i]  = exp(acc) * fac;
-    }
- 
-    return 0;
-}

Deleted: trunk/Lib/sandbox/pyem/pyem/src/c_gmm.pyx
===================================================================
--- trunk/Lib/sandbox/pyem/pyem/src/c_gmm.pyx	2006-10-12 13:48:48 UTC (rev 2284)
+++ trunk/Lib/sandbox/pyem/pyem/src/c_gmm.pyx	2006-10-12 13:48:58 UTC (rev 2285)
@@ -1,66 +0,0 @@
-# Last Change: Thu Jul 13 04:00 PM 2006 J
-
-cimport c_numpy
-cimport c_python
-import numpy
-
-c_numpy.import_array()
-
-# pyrex version of _vq. Much faster in high dimension/high number of K 
-# (ie K > 3-4)
-def _vq(data, init):
-    (n, d)  = data.shape
-    label   = numpy.zeros(n, int)
-    _imp_vq(data, init, label)
-
-    return label
-
-def _imp_vq(c_numpy.ndarray data, c_numpy.ndarray init, c_numpy.ndarray label):
-    cdef int n
-    cdef int d
-    cdef int nc
-    cdef int i
-    cdef int j
-    cdef int k
-    cdef int *labeld
-    cdef double *da, *code
-    cdef double dist
-    cdef double acc
-
-    n   = data.dimensions[0]
-    d   = data.dimensions[1]
-    nc  = init.dimensions[0]
-
-    if not data.dtype == numpy.dtype(numpy.float64):
-        print '_vq not (yet) implemented for dtype %s'%dtype.name
-        return
-    da  = (<double*>data.data)
-
-    if not init.dtype == numpy.dtype(numpy.float64):
-        print '_vq not (yet) implemented for dtype %s'%dtype.name
-        return
-    code    = (<double*>init.data)
-
-    if not label.dtype == numpy.dtype(numpy.int32):
-        print '_vq not (yet) implemented for dtype %s'%dtype.name
-        return
-    labeld  = (<int*>label.data)
-
-    for i from 0<=i<n:
-        acc = 0
-        lab = 0
-        for j from 0<=j<d:
-            acc = acc + (da[i * d + j] - code[j]) * \
-                (da[i * d + j] - code[j])
-        dist    = acc
-        for k from 1<=k<nc:
-            acc     = 0
-            for j from 0<=j<d:
-                acc = acc + (da[i * d + j] - code[k * d + j]) * \
-                    (da[i * d + j] - code[k * d + j])
-            if acc < dist:
-                dist    = acc
-                lab     = k
-        labeld[i]   = lab
-
-    return lab

Deleted: trunk/Lib/sandbox/pyem/pyem/src/c_numpy.pxd
===================================================================
--- trunk/Lib/sandbox/pyem/pyem/src/c_numpy.pxd	2006-10-12 13:48:48 UTC (rev 2284)
+++ trunk/Lib/sandbox/pyem/pyem/src/c_numpy.pxd	2006-10-12 13:48:58 UTC (rev 2285)
@@ -1,59 +0,0 @@
-# :Author:    Robert Kern
-# :Copyright: 2004, Enthought, Inc.
-# :License:   BSD Style
-
-
-cdef extern from "numpy/arrayobject.h":
-    ctypedef enum PyArray_TYPES:
-        PyArray_BOOL
-        PyArray_BYTE
-        PyArray_UBYTE
-        PyArray_SHORT
-        PyArray_USHORT 
-        PyArray_INT
-        PyArray_UINT 
-        PyArray_LONG
-        PyArray_ULONG
-        PyArray_LONGLONG
-        PyArray_ULONGLONG
-        PyArray_FLOAT
-        PyArray_DOUBLE 
-        PyArray_LONGDOUBLE
-        PyArray_CFLOAT
-        PyArray_CDOUBLE
-        PyArray_CLONGDOUBLE
-        PyArray_OBJECT
-        PyArray_STRING
-        PyArray_UNICODE
-        PyArray_VOID
-        PyArray_NTYPES
-        PyArray_NOTYPE
-
-    ctypedef int intp 
-
-    ctypedef extern class numpy.dtype [object PyArray_Descr]:
-        cdef int type_num, elsize, alignment
-        cdef char type, kind, byteorder, hasobject
-        cdef object fields, typeobj
-
-    ctypedef extern class numpy.ndarray [object PyArrayObject]:
-        cdef char *data
-        cdef int nd
-        cdef intp *dimensions
-        cdef intp *strides
-        cdef object base
-        cdef dtype descr
-        cdef int flags
-
-    ndarray PyArray_SimpleNew(int ndims, intp* dims, int item_type)
-    int PyArray_Check(object obj)
-    ndarray PyArray_ContiguousFromObject(object obj, PyArray_TYPES type, 
-        int mindim, int maxdim)
-    intp PyArray_SIZE(ndarray arr)
-    void *PyArray_DATA(ndarray arr)
-    ndarray PyArray_FromAny(object obj, dtype newtype, int mindim, int maxdim,
-		    int requirements, object context)
-    ndarray PyArray_NewFromDescr(object subtype, dtype newtype, int nd, intp* dims,
-		    intp* strides, void* data, int flags, object parent)
-
-    void import_array()

Deleted: trunk/Lib/sandbox/pyem/pyem/src/c_python.pxd
===================================================================
--- trunk/Lib/sandbox/pyem/pyem/src/c_python.pxd	2006-10-12 13:48:48 UTC (rev 2284)
+++ trunk/Lib/sandbox/pyem/pyem/src/c_python.pxd	2006-10-12 13:48:58 UTC (rev 2285)
@@ -1,20 +0,0 @@
-# -*- Mode: Python -*-  Not really, but close enough
-
-# Expose as much of the Python C API as we need here
-
-cdef extern from "stdlib.h":
-    ctypedef int size_t
-
-cdef extern from "Python.h":
-    ctypedef int Py_intptr_t
-    void*  PyMem_Malloc(size_t)
-    void*  PyMem_Realloc(void *p, size_t n)
-    void   PyMem_Free(void *p)
-    char*  PyString_AsString(object string)
-    object PyString_FromString(char *v)
-    object PyString_InternFromString(char *v)
-    int    PyErr_CheckSignals()
-    object PyFloat_FromDouble(double v)
-    void   Py_XINCREF(object o)
-    void   Py_XDECREF(object o)
-    void   Py_CLEAR(object o) # use instead of decref

Deleted: trunk/Lib/sandbox/pyem/pyem/tests/test_densities.py
===================================================================
--- trunk/Lib/sandbox/pyem/pyem/tests/test_densities.py	2006-10-12 13:48:48 UTC (rev 2284)
+++ trunk/Lib/sandbox/pyem/pyem/tests/test_densities.py	2006-10-12 13:48:58 UTC (rev 2285)
@@ -1,181 +0,0 @@
-#! /usr/bin/env python
-# Last Change: Fri Sep 29 06:00 PM 2006 J
-
-import sys
-from numpy.testing import *
-
-import numpy as N
-
-set_package_path()
-from pyem.densities import gauss_den
-from pyem._c_densities import gauss_den as c_gauss_den
-
-restore_path()
-
-#Optional:
-set_local_path()
-# import modules that are located in the same directory as this file.
-restore_path()
-
-class test_densities(NumpyTestCase):
-    def _generate_test_data_1d(self):
-        self.va     = 2.0
-        self.mu     = 1.0
-        self.X      = N.linspace(-2, 2, 10)[:, N.newaxis]
-
-        self.Yt     = N.array([0.02973257230591, 0.05512079811082, 0.09257745306945, 
-                0.14086453882683,
-                0.19418015562214, 0.24250166773127, 0.27436665745048, 0.28122547107069,
-                0.26114678964743, 0.21969564473386])
-
-    def _generate_test_data_2d_diag(self):
-        #============================
-        # Small test in 2d (diagonal)
-        #============================
-        self.mu  = N.atleast_2d([-1.0, 2.0])
-        self.va  = N.atleast_2d([2.0, 3.0])
-        
-        self.X  = N.zeros((10, 2))
-        self.X[:,0] = N.linspace(-2, 2, 10)
-        self.X[:,1] = N.linspace(-1, 3, 10)
-
-        self.Yt  = N.array([0.01129091565384, 0.02025416837152, 0.03081845516786, 
-                0.03977576221540, 0.04354490552910, 0.04043592581117, 
-                0.03184994053539, 0.02127948225225, 0.01205937178755, 
-                0.00579694938623 ])
-
-
-    def _generate_test_data_2d_full(self):
-        #============================
-        # Small test in 2d (full mat)
-        #============================
-        self.mu = N.array([[0.2, -1.0]])
-        self.va = N.array([[1.2, 0.1], [0.1, 0.5]])
-        X1      = N.linspace(-2, 2, 10)[:, N.newaxis]
-        X2      = N.linspace(-3, 3, 10)[:, N.newaxis]
-        self.X  = N.concatenate(([X1, X2]), 1)
-        
-        self.Yt = N.array([0.00096157109751, 0.01368908714856,
-            0.07380823191162, 0.15072050533842, 
-            0.11656739937861, 0.03414436965525,
-            0.00378789836599, 0.00015915297541, 
-            0.00000253261067, 0.00000001526368])
-
-    def _check_py(self, level, decimal = 12):
-        Y   = gauss_den(self.X, self.mu, self.va)
-        assert_array_almost_equal(Y, self.Yt, decimal)
-
-    def _check_c(self, level, decimal = 12):
-        Y   = c_gauss_den(self.X, self.mu, self.va)
-        assert_array_almost_equal(Y, self.Yt, decimal)
-
-    def check_py_1d(self, level = 1):
-        self._generate_test_data_1d()
-        self._check_py(level)
-
-    def check_py_2d_diag(self, level = 1):
-        self._generate_test_data_2d_diag()
-        self._check_py(level)
-
-    def check_py_2d_full(self, level = 1):
-        self._generate_test_data_2d_full()
-        self._check_py(level)
-
-    def check_c_1d(self, level = 1):
-        self._generate_test_data_1d()
-        self._check_c(level)
-
-    def check_c_2d_diag(self, level = 1):
-        self._generate_test_data_2d_diag()
-        self._check_c(level)
-
-    def check_c_2d_full(self, level = 1):
-        self._generate_test_data_2d_full()
-        self._check_c(level)
-
-if __name__ == "__main__":
-    NumpyTest().run()
-
-# def generate_test_data(n, d, mode = 'diag', file='test.dat'):
-#     """Generate a set of data of dimension d, with n frames,
-#     that is input data, mean, var and output of gden, so that
-#     other implementations can be tested against"""
-#     mu  = randn(1, d)
-#     if mode == 'diag':
-#         va  = abs(randn(1, d))
-#     elif mode == 'full':
-#         va  = randn(d, d)
-#         va  = dot(va, va.transpose())
-# 
-#     input   = randn(n, d)
-#     output  = gauss_den(input, mu, va)
-# 
-#     import tables
-#     h5file  = tables.openFile(file, "w")
-# 
-#     h5file.createArray(h5file.root, 'input', input)
-#     h5file.createArray(h5file.root, 'mu', mu)
-#     h5file.createArray(h5file.root, 'va', va)
-#     h5file.createArray(h5file.root, 'output', output)
-# 
-#     h5file.close()
-# 
-# def test_gauss_den():
-#     """"""
-#     # import tables
-#     # import numpy as N
-#     # 
-#     # filename    = 'dendata.h5'
-# 
-#     # # # Dimension 1
-#     # # d   = 1
-#     # # mu  = 1.0
-#     # # va  = 2.0
-# 
-#     # # X   = randn(1e3, 1)
-# 
-#     # # Y   = gauss_den(X, mu, va)
-# 
-#     # # h5file      = tables.openFile(filename, "w")
-# 
-#     # # h5file.createArray(h5file.root, 'X', X)
-#     # # h5file.createArray(h5file.root, 'mu', mu)
-#     # # h5file.createArray(h5file.root, 'va', va)
-#     # # h5file.createArray(h5file.root, 'Y', Y)
-# 
-#     # # h5file.close()
-# 
-#     # # # Dimension 2, diag
-#     # # d   = 2
-#     # # mu  = N.array([1.0, -2.0])
-#     # # va  = N.array([1.0, 2.0])
-# 
-#     # # X   = randn(1e3, 2)
-# 
-#     # # Y   = gauss_den(X, mu, va)
-# 
-#     # # h5file      = tables.openFile(filename, "w")
-# 
-#     # # h5file.createArray(h5file.root, 'X', X)
-#     # # h5file.createArray(h5file.root, 'mu', mu)
-#     # # h5file.createArray(h5file.root, 'va', va)
-#     # # h5file.createArray(h5file.root, 'Y', Y)
-# 
-#     # # Dimension 2, full
-#     # d   = 2
-#     # mu  = N.array([[0.2, -1.0]])
-#     # va  = N.array([[1.2, 0.1], [0.1, 0.5]])
-# 
-#     # X   = randn(1e3, 2)
-# 
-#     # Y   = gauss_den(X, mu, va)
-# 
-#     # h5file      = tables.openFile(filename, "w")
-# 
-#     # h5file.createArray(h5file.root, 'X', X)
-#     # h5file.createArray(h5file.root, 'mu', mu)
-#     # h5file.createArray(h5file.root, 'va', va)
-#     # h5file.createArray(h5file.root, 'Y', Y)
-# 
-#     # h5file.close()
-# 

Deleted: trunk/Lib/sandbox/pyem/pyem/tests/test_kmean.py
===================================================================
--- trunk/Lib/sandbox/pyem/pyem/tests/test_kmean.py	2006-10-12 13:48:48 UTC (rev 2284)
+++ trunk/Lib/sandbox/pyem/pyem/tests/test_kmean.py	2006-10-12 13:48:58 UTC (rev 2285)
@@ -1,46 +0,0 @@
-#! /usr/bin/env python
-# Last Change: Thu Sep 28 01:00 PM 2006 J
-
-import sys
-from numpy.testing import *
-
-import numpy as N
-
-set_package_path()
-from pyem.kmean import kmean
-restore_path()
-
-#Optional:
-set_local_path()
-# import modules that are located in the same directory as this file.
-restore_path()
-
-# Global data
-X   = N.array([[3.0, 3], [4, 3], [4, 2],
-        [9, 2], [5, 1], [6, 2], [9, 4], 
-        [5, 2], [5, 4], [7, 4], [6, 5]])
-
-codet1  = N.array([[3.0000, 3.0000],
-        [6.2000, 4.0000], 
-        [5.8000, 1.8000]])
-        
-codet2  = N.array([[11.0/3, 8.0/3], 
-        [6.7500, 4.2500],
-        [6.2500, 1.7500]])
-
-class test_kmean(NumpyTestCase):
-    def check_iter1(self, level=1):
-        initc   = N.concatenate(([[X[0]], [X[1]], [X[2]]])) 
-        code    = initc.copy()
-        code1   = kmean(X, code, 1)[0]
-
-        assert_array_almost_equal(code1, codet1)
-    def check_iter2(self, level=1):
-        initc   = N.concatenate(([[X[0]], [X[1]], [X[2]]])) 
-        code    = initc.copy()
-        code2   = kmean(X, code, 2)[0]
-
-        assert_array_almost_equal(code2, codet2)
-
-if __name__ == "__main__":
-    NumpyTest().run()

Added: trunk/Lib/sandbox/pyem/setup.py
===================================================================
--- trunk/Lib/sandbox/pyem/setup.py	2006-10-12 13:48:48 UTC (rev 2284)
+++ trunk/Lib/sandbox/pyem/setup.py	2006-10-12 13:48:58 UTC (rev 2285)
@@ -0,0 +1,142 @@
+#! /usr/bin/env python
+# Last Change: Fri Oct 06 09:00 PM 2006 J
+# TODO:
+#   - check how to handle cmd line build options with distutils and use
+#   it in the building process
+
+""" pyem is a small python package to estimate Gaussian Mixtures Models
+from data, using Expectation Maximization"""
+
+from os.path import join
+from info import version as pyem_version
+
+DISTNAME    = 'pyem' 
+VERSION     = pyem_version
+DESCRIPTION ='A python module for Expectation Maximization learning of mixtures pdf',
+AUTHOR      ='David Cournapeau',
+AUTHOR_EMAIL='david at ar.media.kyoto-u.ac.jp',
+URL         ='http://ar.media.kyoto-u.ac.jp/members/david',
+
+def configuration(parent_package='',top_path=None, package_name='pyem'):
+    from numpy.distutils.misc_util import Configuration
+    config = Configuration(package_name,parent_package,top_path,
+             version     = VERSION)
+    config.add_data_dir('tests')
+    config.add_subpackage('profile_data')
+    config.add_extension('c_gden',
+                         #define_macros=[('LIBSVM_EXPORTS', None),
+                         #               ('LIBSVM_DLL', None)],
+                         sources=[join('src', 'c_gden.c')])
+
+    return config
+
+if __name__ == "__main__":
+    from numpy.distutils.core import setup
+    #setup(**configuration(top_path='').todict())
+    setup(**configuration(top_path='',
+                          package_name='scipy.sandbox.pyem').todict())
+# from distutils.core import setup, Extension
+# from pyem import version as pyem_version
+# 
+# # distutils does not update MANIFEST correctly, removes it
+# import os
+# if os.path.exists('MANIFEST'): os.remove('MANIFEST')
+# from os.path import join
+# 
+# import re
+# 
+# from numpy.distutils.misc_util import get_numpy_include_dirs
+# NUMPYINC    = get_numpy_include_dirs()[0]
+# 
+# # General variables:
+# #   - DISTNAME: name of the distributed package
+# #   - VERSION: the version reference is in pyem/__init__.py file
+# #   - other upper cased variables are the same than the corresponding 
+# #   keywords in setup call
+# DISTNAME    = 'pyem' 
+# VERSION     = pyem_version
+# DESCRIPTION ='A python module for Expectation Maximization learning of mixtures pdf',
+# AUTHOR      ='David Cournapeau',
+# AUTHOR_EMAIL='david at ar.media.kyoto-u.ac.jp',
+# URL         ='http://ar.media.kyoto-u.ac.jp/members/david',
+# 
+# # Source files for extensions
+# 
+# # Functions used to substitute values in File.
+# # Mainly use to replace config.h capabilities
+# def do_subst_in_file(sourcefile, targetfile, dict):
+#     """Replace all instances of the keys of dict with their values.
+#     For example, if dict is {'%VERSION%': '1.2345', '%BASE%': 'MyProg'},
+#     then all instances of %VERSION% in the file will be replaced with 1.2345 etc.
+#     """
+#     try:
+#         f = open(sourcefile, 'rb')
+#         contents = f.read()
+#         f.close()
+#     except:
+#         raise IOError, "Can't read source file %s"%sourcefile
+# 
+#     for (k,v) in dict.items():
+#         contents = re.sub(k, v, contents)
+#     try:
+#         f = open(targetfile, 'wb')
+#         f.write(contents)
+#         f.close()
+#     except:
+#         raise IOError, "Can't read source file %s"%sourcefile
+#     return 0 # success
+#  
+# class SetupOption:
+#     def __init__(self):
+#         self.kmean      = 'py'
+#         self.ext_modules= [Extension(join('pyem', 'c_gden'),
+#                               sources=[join('pyem', 'src', 'c_gden.c')]) ]
+#         self.cmdclass   = {}
+#         self.subsdic     = {'%KMEANIMPORT%': []}
+# 
+#     def _config_kmean(self):
+#         # Check in this order:
+#         #   - kmean in scipy.cluster,
+#         #   - custom vq with pyrex 
+#         #   - custom pure python vq
+#         #try:
+#         #    from scipy.cluster.vq import kmeans
+#         #    self.kmean  = 'scipy'
+#         #    #self.subsdic['%KMEANIMPORT%']   = scipy_kmean
+#         #except ImportError:
+#         #    try:
+#         #        from Pyrex.Distutils import build_ext
+#         #        self.kmean  = 'pyrex'
+#         #        self.ext_modules.append(Extension('pyem/c_gmm', 
+#         #            ['pyem/src/c_gmm.pyx'], include_dirs=[NUMPYINC]))
+#         #        self.cmdclass['build_ext']  = build_ext
+#         #        #self.subsdic['%KMEANIMPORT%']   = pyrex_kmean
+#         #    except ImportError:
+#         #        self.kmean  = 'py'
+#         #        #self.subsdic['%KMEANIMPORT%']   = pyrex_kmean
+#         try:
+#             from Pyrex.Distutils import build_ext
+#             self.kmean  = 'pyrex'
+#             self.ext_modules.append(Extension('pyem/c_gmm', 
+#                 ['pyem/src/c_gmm.pyx'], include_dirs=[NUMPYINC]))
+#             self.cmdclass['build_ext']  = build_ext
+#             #self.subsdic['%KMEANIMPORT%']   = pyrex_kmean
+#         except ImportError:
+#             self.kmean  = 'py'
+#             #self.subsdic['%KMEANIMPORT%']   = pyrex_kmean
+#     def setup(self):
+#         self._config_kmean()
+#         #import time
+#         #do_subst_in_file('pyem/kmean.py.in', 'pyem/kmean.py', self.subsdic)
+#         setup(name      = DISTNAME,
+#             version     = VERSION,
+#             description = DESCRIPTION,
+#             author      = AUTHOR,
+#             author_email= AUTHOR_EMAIL,
+#             url         = URL,
+#             packages    = ['pyem', 'pyem.tests', 'pyem.profile_data'],
+#             ext_modules = self.ext_modules,
+#             cmdclass    = self.cmdclass)
+# 
+# stpobj  = SetupOption()
+# stpobj.setup()

Added: trunk/Lib/sandbox/pyem/src/Makefile
===================================================================
--- trunk/Lib/sandbox/pyem/src/Makefile	2006-10-12 13:48:48 UTC (rev 2284)
+++ trunk/Lib/sandbox/pyem/src/Makefile	2006-10-12 13:48:58 UTC (rev 2285)
@@ -0,0 +1,32 @@
+CC	= colorgcc
+LD	= gcc
+
+PYREX	= python2.4-pyrexc
+
+PYTHONINC	= -I/usr/include/python2.4
+NUMPYINC	= -I/usr/lib/python2.4/site-packages/numpy/core/include
+OPTIMS		= -O3 -funroll-all-loops -march=pentium4 -msse2
+WARN		= -W -Wall
+
+CFLAGS	= $(PYTHONINC) $(NUMPYINC) $(OPTIMS) $(WARN)
+
+c_gden.so: c_gden.o
+	$(LD) -shared -o $@ $< -Wl,-soname,$@
+
+c_gden.o: c_gden.c
+	$(CC) -c $(CFLAGS) -fPIC $<
+
+c_gmm.so: c_gmm.o
+	$(LD) -shared -o $@ $< -Wl,-soname,$@
+
+c_gmm.o: c_gmm.c
+	$(CC) -c $(CFLAGS) -fPIC $<
+
+c_gmm.c: c_gmm.pyx c_numpy.pxd c_python.pxd
+	$(PYREX) $<
+
+clean:
+	rm -f c_gmm.c
+	rm -f *.o
+	rm -f *.so
+	rm -f *.pyc

Added: trunk/Lib/sandbox/pyem/src/c_gden.c
===================================================================
--- trunk/Lib/sandbox/pyem/src/c_gden.c	2006-10-12 13:48:48 UTC (rev 2284)
+++ trunk/Lib/sandbox/pyem/src/c_gden.c	2006-10-12 13:48:58 UTC (rev 2285)
@@ -0,0 +1,37 @@
+#include <stddef.h>
+#include <stdio.h>
+#include <math.h>
+
+/*
+ * use va for caching, so its content is modified
+ */
+int gden_diag(const double *in, const size_t n, const size_t d, 
+        const double* mu, double* va, double* out)
+{
+    size_t  j, i;
+    double  fac = 1.0;
+    double  acc, tmp;
+
+    /*
+     * Cache some precomputing
+     */
+    for(j = 0; j < d; ++j) {
+        va[j]   = 0.5/va[j];
+        fac     *= sqrt(va[j] / (M_PI) );
+        va[j]   *= -1.0;
+    }
+
+    /*
+     * actual computing
+     */
+    for(i = 0; i < n; ++i) {
+        acc = 0;
+        for(j = 0; j < d; ++j) {
+            tmp = (in[i *d + j] - mu[j]);
+            acc += tmp * tmp * va[j];
+        }
+        out[i]  = exp(acc) * fac;
+    }
+ 
+    return 0;
+}

Added: trunk/Lib/sandbox/pyem/src/c_gmm.pyx
===================================================================
--- trunk/Lib/sandbox/pyem/src/c_gmm.pyx	2006-10-12 13:48:48 UTC (rev 2284)
+++ trunk/Lib/sandbox/pyem/src/c_gmm.pyx	2006-10-12 13:48:58 UTC (rev 2285)
@@ -0,0 +1,66 @@
+# Last Change: Thu Jul 13 04:00 PM 2006 J
+
+cimport c_numpy
+cimport c_python
+import numpy
+
+c_numpy.import_array()
+
+# pyrex version of _vq. Much faster in high dimension/high number of K 
+# (ie K > 3-4)
+def _vq(data, init):
+    (n, d)  = data.shape
+    label   = numpy.zeros(n, int)
+    _imp_vq(data, init, label)
+
+    return label
+
+def _imp_vq(c_numpy.ndarray data, c_numpy.ndarray init, c_numpy.ndarray label):
+    cdef int n
+    cdef int d
+    cdef int nc
+    cdef int i
+    cdef int j
+    cdef int k
+    cdef int *labeld
+    cdef double *da, *code
+    cdef double dist
+    cdef double acc
+
+    n   = data.dimensions[0]
+    d   = data.dimensions[1]
+    nc  = init.dimensions[0]
+
+    if not data.dtype == numpy.dtype(numpy.float64):
+        print '_vq not (yet) implemented for dtype %s'%dtype.name
+        return
+    da  = (<double*>data.data)
+
+    if not init.dtype == numpy.dtype(numpy.float64):
+        print '_vq not (yet) implemented for dtype %s'%dtype.name
+        return
+    code    = (<double*>init.data)
+
+    if not label.dtype == numpy.dtype(numpy.int32):
+        print '_vq not (yet) implemented for dtype %s'%dtype.name
+        return
+    labeld  = (<int*>label.data)
+
+    for i from 0<=i<n:
+        acc = 0
+        lab = 0
+        for j from 0<=j<d:
+            acc = acc + (da[i * d + j] - code[j]) * \
+                (da[i * d + j] - code[j])
+        dist    = acc
+        for k from 1<=k<nc:
+            acc     = 0
+            for j from 0<=j<d:
+                acc = acc + (da[i * d + j] - code[k * d + j]) * \
+                    (da[i * d + j] - code[k * d + j])
+            if acc < dist:
+                dist    = acc
+                lab     = k
+        labeld[i]   = lab
+
+    return lab

Added: trunk/Lib/sandbox/pyem/src/c_numpy.pxd
===================================================================
--- trunk/Lib/sandbox/pyem/src/c_numpy.pxd	2006-10-12 13:48:48 UTC (rev 2284)
+++ trunk/Lib/sandbox/pyem/src/c_numpy.pxd	2006-10-12 13:48:58 UTC (rev 2285)
@@ -0,0 +1,59 @@
+# :Author:    Robert Kern
+# :Copyright: 2004, Enthought, Inc.
+# :License:   BSD Style
+
+
+cdef extern from "numpy/arrayobject.h":
+    ctypedef enum PyArray_TYPES:
+        PyArray_BOOL
+        PyArray_BYTE
+        PyArray_UBYTE
+        PyArray_SHORT
+        PyArray_USHORT 
+        PyArray_INT
+        PyArray_UINT 
+        PyArray_LONG
+        PyArray_ULONG
+        PyArray_LONGLONG
+        PyArray_ULONGLONG
+        PyArray_FLOAT
+        PyArray_DOUBLE 
+        PyArray_LONGDOUBLE
+        PyArray_CFLOAT
+        PyArray_CDOUBLE
+        PyArray_CLONGDOUBLE
+        PyArray_OBJECT
+        PyArray_STRING
+        PyArray_UNICODE
+        PyArray_VOID
+        PyArray_NTYPES
+        PyArray_NOTYPE
+
+    ctypedef int intp 
+
+    ctypedef extern class numpy.dtype [object PyArray_Descr]:
+        cdef int type_num, elsize, alignment
+        cdef char type, kind, byteorder, hasobject
+        cdef object fields, typeobj
+
+    ctypedef extern class numpy.ndarray [object PyArrayObject]:
+        cdef char *data
+        cdef int nd
+        cdef intp *dimensions
+        cdef intp *strides
+        cdef object base
+        cdef dtype descr
+        cdef int flags
+
+    ndarray PyArray_SimpleNew(int ndims, intp* dims, int item_type)
+    int PyArray_Check(object obj)
+    ndarray PyArray_ContiguousFromObject(object obj, PyArray_TYPES type, 
+        int mindim, int maxdim)
+    intp PyArray_SIZE(ndarray arr)
+    void *PyArray_DATA(ndarray arr)
+    ndarray PyArray_FromAny(object obj, dtype newtype, int mindim, int maxdim,
+		    int requirements, object context)
+    ndarray PyArray_NewFromDescr(object subtype, dtype newtype, int nd, intp* dims,
+		    intp* strides, void* data, int flags, object parent)
+
+    void import_array()

Added: trunk/Lib/sandbox/pyem/src/c_python.pxd
===================================================================
--- trunk/Lib/sandbox/pyem/src/c_python.pxd	2006-10-12 13:48:48 UTC (rev 2284)
+++ trunk/Lib/sandbox/pyem/src/c_python.pxd	2006-10-12 13:48:58 UTC (rev 2285)
@@ -0,0 +1,20 @@
+# -*- Mode: Python -*-  Not really, but close enough
+
+# Expose as much of the Python C API as we need here
+
+cdef extern from "stdlib.h":
+    ctypedef int size_t
+
+cdef extern from "Python.h":
+    ctypedef int Py_intptr_t
+    void*  PyMem_Malloc(size_t)
+    void*  PyMem_Realloc(void *p, size_t n)
+    void   PyMem_Free(void *p)
+    char*  PyString_AsString(object string)
+    object PyString_FromString(char *v)
+    object PyString_InternFromString(char *v)
+    int    PyErr_CheckSignals()
+    object PyFloat_FromDouble(double v)
+    void   Py_XINCREF(object o)
+    void   Py_XDECREF(object o)
+    void   Py_CLEAR(object o) # use instead of decref

Added: trunk/Lib/sandbox/pyem/tests/test_densities.py
===================================================================
--- trunk/Lib/sandbox/pyem/tests/test_densities.py	2006-10-12 13:48:48 UTC (rev 2284)
+++ trunk/Lib/sandbox/pyem/tests/test_densities.py	2006-10-12 13:48:58 UTC (rev 2285)
@@ -0,0 +1,181 @@
+#! /usr/bin/env python
+# Last Change: Fri Sep 29 06:00 PM 2006 J
+
+import sys
+from numpy.testing import *
+
+import numpy as N
+
+set_package_path()
+from pyem.densities import gauss_den
+from pyem._c_densities import gauss_den as c_gauss_den
+
+restore_path()
+
+#Optional:
+set_local_path()
+# import modules that are located in the same directory as this file.
+restore_path()
+
+class test_densities(NumpyTestCase):
+    def _generate_test_data_1d(self):
+        self.va     = 2.0
+        self.mu     = 1.0
+        self.X      = N.linspace(-2, 2, 10)[:, N.newaxis]
+
+        self.Yt     = N.array([0.02973257230591, 0.05512079811082, 0.09257745306945, 
+                0.14086453882683,
+                0.19418015562214, 0.24250166773127, 0.27436665745048, 0.28122547107069,
+                0.26114678964743, 0.21969564473386])
+
+    def _generate_test_data_2d_diag(self):
+        #============================
+        # Small test in 2d (diagonal)
+        #============================
+        self.mu  = N.atleast_2d([-1.0, 2.0])
+        self.va  = N.atleast_2d([2.0, 3.0])
+        
+        self.X  = N.zeros((10, 2))
+        self.X[:,0] = N.linspace(-2, 2, 10)
+        self.X[:,1] = N.linspace(-1, 3, 10)
+
+        self.Yt  = N.array([0.01129091565384, 0.02025416837152, 0.03081845516786, 
+                0.03977576221540, 0.04354490552910, 0.04043592581117, 
+                0.03184994053539, 0.02127948225225, 0.01205937178755, 
+                0.00579694938623 ])
+
+
+    def _generate_test_data_2d_full(self):
+        #============================
+        # Small test in 2d (full mat)
+        #============================
+        self.mu = N.array([[0.2, -1.0]])
+        self.va = N.array([[1.2, 0.1], [0.1, 0.5]])
+        X1      = N.linspace(-2, 2, 10)[:, N.newaxis]
+        X2      = N.linspace(-3, 3, 10)[:, N.newaxis]
+        self.X  = N.concatenate(([X1, X2]), 1)
+        
+        self.Yt = N.array([0.00096157109751, 0.01368908714856,
+            0.07380823191162, 0.15072050533842, 
+            0.11656739937861, 0.03414436965525,
+            0.00378789836599, 0.00015915297541, 
+            0.00000253261067, 0.00000001526368])
+
+    def _check_py(self, level, decimal = 12):
+        Y   = gauss_den(self.X, self.mu, self.va)
+        assert_array_almost_equal(Y, self.Yt, decimal)
+
+    def _check_c(self, level, decimal = 12):
+        Y   = c_gauss_den(self.X, self.mu, self.va)
+        assert_array_almost_equal(Y, self.Yt, decimal)
+
+    def check_py_1d(self, level = 1):
+        self._generate_test_data_1d()
+        self._check_py(level)
+
+    def check_py_2d_diag(self, level = 1):
+        self._generate_test_data_2d_diag()
+        self._check_py(level)
+
+    def check_py_2d_full(self, level = 1):
+        self._generate_test_data_2d_full()
+        self._check_py(level)
+
+    def check_c_1d(self, level = 1):
+        self._generate_test_data_1d()
+        self._check_c(level)
+
+    def check_c_2d_diag(self, level = 1):
+        self._generate_test_data_2d_diag()
+        self._check_c(level)
+
+    def check_c_2d_full(self, level = 1):
+        self._generate_test_data_2d_full()
+        self._check_c(level)
+
+if __name__ == "__main__":
+    NumpyTest().run()
+
+# def generate_test_data(n, d, mode = 'diag', file='test.dat'):
+#     """Generate a set of data of dimension d, with n frames,
+#     that is input data, mean, var and output of gden, so that
+#     other implementations can be tested against"""
+#     mu  = randn(1, d)
+#     if mode == 'diag':
+#         va  = abs(randn(1, d))
+#     elif mode == 'full':
+#         va  = randn(d, d)
+#         va  = dot(va, va.transpose())
+# 
+#     input   = randn(n, d)
+#     output  = gauss_den(input, mu, va)
+# 
+#     import tables
+#     h5file  = tables.openFile(file, "w")
+# 
+#     h5file.createArray(h5file.root, 'input', input)
+#     h5file.createArray(h5file.root, 'mu', mu)
+#     h5file.createArray(h5file.root, 'va', va)
+#     h5file.createArray(h5file.root, 'output', output)
+# 
+#     h5file.close()
+# 
+# def test_gauss_den():
+#     """"""
+#     # import tables
+#     # import numpy as N
+#     # 
+#     # filename    = 'dendata.h5'
+# 
+#     # # # Dimension 1
+#     # # d   = 1
+#     # # mu  = 1.0
+#     # # va  = 2.0
+# 
+#     # # X   = randn(1e3, 1)
+# 
+#     # # Y   = gauss_den(X, mu, va)
+# 
+#     # # h5file      = tables.openFile(filename, "w")
+# 
+#     # # h5file.createArray(h5file.root, 'X', X)
+#     # # h5file.createArray(h5file.root, 'mu', mu)
+#     # # h5file.createArray(h5file.root, 'va', va)
+#     # # h5file.createArray(h5file.root, 'Y', Y)
+# 
+#     # # h5file.close()
+# 
+#     # # # Dimension 2, diag
+#     # # d   = 2
+#     # # mu  = N.array([1.0, -2.0])
+#     # # va  = N.array([1.0, 2.0])
+# 
+#     # # X   = randn(1e3, 2)
+# 
+#     # # Y   = gauss_den(X, mu, va)
+# 
+#     # # h5file      = tables.openFile(filename, "w")
+# 
+#     # # h5file.createArray(h5file.root, 'X', X)
+#     # # h5file.createArray(h5file.root, 'mu', mu)
+#     # # h5file.createArray(h5file.root, 'va', va)
+#     # # h5file.createArray(h5file.root, 'Y', Y)
+# 
+#     # # Dimension 2, full
+#     # d   = 2
+#     # mu  = N.array([[0.2, -1.0]])
+#     # va  = N.array([[1.2, 0.1], [0.1, 0.5]])
+# 
+#     # X   = randn(1e3, 2)
+# 
+#     # Y   = gauss_den(X, mu, va)
+# 
+#     # h5file      = tables.openFile(filename, "w")
+# 
+#     # h5file.createArray(h5file.root, 'X', X)
+#     # h5file.createArray(h5file.root, 'mu', mu)
+#     # h5file.createArray(h5file.root, 'va', va)
+#     # h5file.createArray(h5file.root, 'Y', Y)
+# 
+#     # h5file.close()
+# 

Added: trunk/Lib/sandbox/pyem/tests/test_kmean.py
===================================================================
--- trunk/Lib/sandbox/pyem/tests/test_kmean.py	2006-10-12 13:48:48 UTC (rev 2284)
+++ trunk/Lib/sandbox/pyem/tests/test_kmean.py	2006-10-12 13:48:58 UTC (rev 2285)
@@ -0,0 +1,46 @@
+#! /usr/bin/env python
+# Last Change: Thu Sep 28 01:00 PM 2006 J
+
+import sys
+from numpy.testing import *
+
+import numpy as N
+
+set_package_path()
+from pyem.kmean import kmean
+restore_path()
+
+#Optional:
+set_local_path()
+# import modules that are located in the same directory as this file.
+restore_path()
+
+# Global data
+X   = N.array([[3.0, 3], [4, 3], [4, 2],
+        [9, 2], [5, 1], [6, 2], [9, 4], 
+        [5, 2], [5, 4], [7, 4], [6, 5]])
+
+codet1  = N.array([[3.0000, 3.0000],
+        [6.2000, 4.0000], 
+        [5.8000, 1.8000]])
+        
+codet2  = N.array([[11.0/3, 8.0/3], 
+        [6.7500, 4.2500],
+        [6.2500, 1.7500]])
+
+class test_kmean(NumpyTestCase):
+    def check_iter1(self, level=1):
+        initc   = N.concatenate(([[X[0]], [X[1]], [X[2]]])) 
+        code    = initc.copy()
+        code1   = kmean(X, code, 1)[0]
+
+        assert_array_almost_equal(code1, codet1)
+    def check_iter2(self, level=1):
+        initc   = N.concatenate(([[X[0]], [X[1]], [X[2]]])) 
+        code    = initc.copy()
+        code2   = kmean(X, code, 2)[0]
+
+        assert_array_almost_equal(code2, codet2)
+
+if __name__ == "__main__":
+    NumpyTest().run()