[Scipy-svn] r2285 - in trunk/Lib/sandbox/pyem: . profile_data pyem pyem/profile_data pyem/src pyem/tests src tests
scipy-svn at scipy.org
scipy-svn at scipy.org
Thu Oct 12 09:49:18 EDT 2006
Author: cdavid
Date: 2006-10-12 08:48:58 -0500 (Thu, 12 Oct 2006)
New Revision: 2285
Added:
trunk/Lib/sandbox/pyem/__init__.py
trunk/Lib/sandbox/pyem/_c_densities.py
trunk/Lib/sandbox/pyem/densities.py
trunk/Lib/sandbox/pyem/gauss_mix.py
trunk/Lib/sandbox/pyem/gmm_em.py
trunk/Lib/sandbox/pyem/info.py
trunk/Lib/sandbox/pyem/kmean.py
trunk/Lib/sandbox/pyem/online_em.py
trunk/Lib/sandbox/pyem/profile_data/
trunk/Lib/sandbox/pyem/profile_data/profile_densities.py
trunk/Lib/sandbox/pyem/profile_data/profile_gmm.py
trunk/Lib/sandbox/pyem/setup.py
trunk/Lib/sandbox/pyem/src/
trunk/Lib/sandbox/pyem/src/Makefile
trunk/Lib/sandbox/pyem/src/c_gden.c
trunk/Lib/sandbox/pyem/src/c_gmm.pyx
trunk/Lib/sandbox/pyem/src/c_numpy.pxd
trunk/Lib/sandbox/pyem/src/c_python.pxd
trunk/Lib/sandbox/pyem/tests/
trunk/Lib/sandbox/pyem/tests/test_densities.py
trunk/Lib/sandbox/pyem/tests/test_kmean.py
Removed:
trunk/Lib/sandbox/pyem/count.sh
trunk/Lib/sandbox/pyem/pyem/__init__.py
trunk/Lib/sandbox/pyem/pyem/_c_densities.py
trunk/Lib/sandbox/pyem/pyem/densities.py
trunk/Lib/sandbox/pyem/pyem/gauss_mix.py
trunk/Lib/sandbox/pyem/pyem/gmm_em.py
trunk/Lib/sandbox/pyem/pyem/info.py
trunk/Lib/sandbox/pyem/pyem/kmean.py
trunk/Lib/sandbox/pyem/pyem/online_em.py
trunk/Lib/sandbox/pyem/pyem/profile_data/profile_densities.py
trunk/Lib/sandbox/pyem/pyem/profile_data/profile_gmm.py
trunk/Lib/sandbox/pyem/pyem/setup.py
trunk/Lib/sandbox/pyem/pyem/src/Makefile
trunk/Lib/sandbox/pyem/pyem/src/c_gden.c
trunk/Lib/sandbox/pyem/pyem/src/c_gmm.pyx
trunk/Lib/sandbox/pyem/pyem/src/c_numpy.pxd
trunk/Lib/sandbox/pyem/pyem/src/c_python.pxd
trunk/Lib/sandbox/pyem/pyem/tests/test_densities.py
trunk/Lib/sandbox/pyem/pyem/tests/test_kmean.py
Log:
[pyem @ david at ar.media.kyoto-u.ac.jp-20061012122133-98999563270b6770]
Change of layout for scipy (2)
David Cournapeau <david at ar.media.kyoto-u.ac.jp> | 2006-10-12 21:21:33 +0900 (Thu, 12 Oct 2006)
Added: trunk/Lib/sandbox/pyem/__init__.py
===================================================================
--- trunk/Lib/sandbox/pyem/__init__.py 2006-10-12 13:48:48 UTC (rev 2284)
+++ trunk/Lib/sandbox/pyem/__init__.py 2006-10-12 13:48:58 UTC (rev 2285)
@@ -0,0 +1,12 @@
+#! /usr/bin/env python
+# Last Change: Tue Oct 03 06:00 PM 2006 J
+
+from info import __doc__
+
+from numpy.testing import NumpyTest
+test = NumpyTest().test
+
+version = '0.5.3'
+
+from gauss_mix import GmParamError, GM
+from gmm_em import GmmParamError, GMM, EM
Added: trunk/Lib/sandbox/pyem/_c_densities.py
===================================================================
--- trunk/Lib/sandbox/pyem/_c_densities.py 2006-10-12 13:48:48 UTC (rev 2284)
+++ trunk/Lib/sandbox/pyem/_c_densities.py 2006-10-12 13:48:58 UTC (rev 2285)
@@ -0,0 +1,217 @@
+#! /usr/bin/python
+#
+# Copyrighted David Cournapeau
+# Last Change: Tue Oct 03 05:00 PM 2006 J
+
+# This module uses a C implementation through ctypes, for diagonal cases
+# TODO:
+# - portable way to find/open the shared library
+# - full cov matrice
+
+import numpy as N
+import numpy.linalg as lin
+from numpy.random import randn
+from scipy.stats import chi2
+import densities as D
+
+import ctypes
+from ctypes import cdll, c_uint, c_int, c_double, POINTER
+from numpy.ctypeslib import ndpointer, load_library
+
+ctypes_major = int(ctypes.__version__.split('.')[0])
+if ctypes_major < 1:
+ msg = "version of ctypes is %s, expected at least %s" \
+ % (ctypes.__version__, '1.0.0')
+ raise Exception(msg)
+
+# Requirements for diag gden
+_gden = load_library('c_gden.so', __file__)
+arg1 = ndpointer(dtype='<f8')
+arg2 = c_uint
+arg3 = c_uint
+arg4 = ndpointer(dtype='<f8')
+arg5 = ndpointer(dtype='<f8')
+arg6 = ndpointer(dtype='<f8')
+_gden.gden_diag.argtypes = [arg1, arg2, arg3, arg4, arg5, arg6]
+_gden.gden_diag.restype = c_int
+
+# Error classes
+class DenError(Exception):
+ """Base class for exceptions in this module.
+
+ Attributes:
+ expression -- input expression in which the error occurred
+ message -- explanation of the error"""
+ def __init__(self, message):
+ self.message = message
+
+ def __str__(self):
+ return self.message
+
+# The following function do all the fancy stuff to check that parameters
+# are Ok, and call the right implementation if args are OK.
+def gauss_den(x, mu, va, log = False):
+ """ Compute multivariate Gaussian density at points x for
+ mean mu and variance va.
+
+ Vector are row vectors, except va which can be a matrix
+ (row vector variance for diagonal variance)
+
+ If log is True, than the log density is returned
+ (useful for underflow ?)"""
+ mu = N.atleast_2d(mu)
+ va = N.atleast_2d(va)
+ x = N.atleast_2d(x)
+
+ #=======================#
+ # Checking parameters #
+ #=======================#
+ if len(N.shape(mu)) != 2:
+ raise DenError("mu is not rank 2")
+
+ if len(N.shape(va)) != 2:
+ raise DenError("va is not rank 2")
+
+ if len(N.shape(x)) != 2:
+ raise DenError("x is not rank 2")
+
+ (n, d) = x.shape
+ (dm0, dm1) = mu.shape
+ (dv0, dv1) = va.shape
+
+ # Check x and mu same dimension
+ if dm0 != 1:
+ msg = "mean must be a row vector!"
+ raise DenError(msg)
+ if dm1 != d:
+ msg = "x and mu not same dim"
+ raise DenError(msg)
+ # Check va and mu same size
+ if dv1 != d:
+ msg = "mu and va not same dim"
+ raise DenError(msg)
+ if dv0 != 1 and dv0 != d:
+ msg = "va not square"
+ raise DenError(msg)
+
+ #===============#
+ # Computation #
+ #===============#
+ if d == 1:
+ # scalar case
+ return _scalar_gauss_den(x[:, 0], mu[0, 0], va[0, 0], log)
+ elif dv0 == 1:
+ # Diagonal matrix case
+ return _diag_gauss_den(x, mu, va, log)
+ elif dv1 == dv0:
+ # full case
+ return _full_gauss_den(x, mu, va, log)
+ else:
+ raise DenError("variance mode not recognized, this is a bug")
+
+# Those 3 functions do almost all the actual computation
+def _scalar_gauss_den(x, mu, va, log):
+ """ This function is the actual implementation
+ of gaussian pdf in scalar case. It assumes all args
+ are conformant, so it should not be used directly
+
+ ** Expect centered data (ie with mean removed) **
+
+ Call gauss_den instead"""
+ d = mu.size
+ inva = 1/va
+ fac = (2*N.pi) ** (-d/2.0) * N.sqrt(inva)
+ y = ((x-mu) ** 2) * -0.5 * inva
+ if not log:
+ y = fac * N.exp(y)
+ else:
+ y = y + log(fac)
+
+ return y
+
+def _diag_gauss_den(x, mu, va, log):
+ """ This function is the actual implementation
+ of gaussian pdf in scalar case. It assumes all args
+ are conformant, so it should not be used directly
+
+ ** Expect centered data (ie with mean removed) **
+
+ Call gauss_den instead"""
+ # Diagonal matrix case
+ d = mu.size
+ n = x.shape[0]
+ if not log:
+ y = N.zeros(n)
+ vat = va.copy()
+ # _gden.gden_diag(N.require(x, requirements = 'C'), n, d,
+ # N.require(mu, requirements = 'C'),
+ # N.require(inva, requirements = 'C'),
+ # N.require(y, requirements = 'C'))
+ x = N.require(x, requirements = 'C')
+ mu = N.require(mu, requirements = 'C')
+ vat = N.require(vat, requirements = 'C')
+ y = N.require(y, requirements = 'C')
+ _gden.gden_diag(x, n, d, mu, vat, y)
+ return y
+ # _gden.gden_diag.restype = c_int
+ # _gden.gden_diag.argtypes = [POINTER(c_double), c_uint, c_uint,
+ # POINTER(c_double), POINTER(c_double), POINTER(c_double)]
+
+ # y = N.zeros(n)
+ # inva= 1/va
+ # _gden.gden_diag(x.ctypes.data_as(POINTER(c_double)),
+ # n, d,
+ # mu.ctypes.data_as(POINTER(c_double)),
+ # inva.ctypes.data_as(POINTER(c_double)),
+ # y.ctypes.data_as(POINTER(c_double)))
+ else:
+ y = _scalar_gauss_den(x[:,0], mu[0,0], va[0,0], log)
+ for i in range(1, d):
+ y += _scalar_gauss_den(x[:,i], mu[0,i], va[0,i], log)
+ return y
+
+def _full_gauss_den(x, mu, va, log):
+ """ This function is the actual implementation
+ of gaussian pdf in full matrix case.
+
+ It assumes all args are conformant, so it should
+ not be used directly Call gauss_den instead
+
+ ** Expect centered data (ie with mean removed) **
+
+ Does not check if va is definite positive (on inversible
+ for that matter), so the inverse computation and/or determinant
+ would throw an exception."""
+ d = mu.size
+ inva = lin.inv(va)
+ fac = 1 / N.sqrt( (2*N.pi) ** d * N.fabs(lin.det(va)))
+
+ # we are using a trick with sum to "emulate"
+ # the matrix multiplication inva * x without any explicit loop
+ y = N.dot((x-mu), inva)
+ y = -0.5 * N.sum(y * (x-mu), 1)
+
+ if not log:
+ y = fac * N.exp(y)
+ else:
+ y = y + N.log(fac)
+
+ return y
+
+if __name__ == "__main__":
+ #=========================================
+ # Test accuracy between pure and C python
+ #=========================================
+ mu = N.array([2.0, 3])
+ va = N.array([5.0, 3])
+
+ # Generate a multivariate gaussian of mean mu and covariance va
+ nframes = 1e4
+ X = randn(nframes, 2)
+ Yc = N.dot(N.diag(N.sqrt(va)), X.transpose())
+ Yc = Yc.transpose() + mu
+
+ Y = D.gauss_den(Yc, mu, va)
+ Yt = gauss_den(Yc, mu, va)
+
+ print "Diff is " + str(N.sqrt(N.sum((Y-Yt) ** 2))/nframes/2)
Deleted: trunk/Lib/sandbox/pyem/count.sh
===================================================================
--- trunk/Lib/sandbox/pyem/count.sh 2006-10-12 13:48:48 UTC (rev 2284)
+++ trunk/Lib/sandbox/pyem/count.sh 2006-10-12 13:48:58 UTC (rev 2285)
@@ -1,23 +0,0 @@
-#! /bin/sh
-# Last Change: Fri Oct 06 08:00 PM 2006 J
-
-n=0
-np=0
-nc=0
-
-files=`find . -name '*.py'`
-
-for i in $files; do
- tp=`wc "$i" | tr -s " " | cut -f 2 -d" "`
- let np="$np + $tp"
-done
-
-files=`find . -name '*.[ch]'`
-
-for i in $files; do
- tp=`wc "$i" | tr -s " " | cut -f 2 -d" "`
- let nc="$nc + $tp"
-done
-
-echo "$nc lines of C code"
-echo "$np lines of python code"
Added: trunk/Lib/sandbox/pyem/densities.py
===================================================================
--- trunk/Lib/sandbox/pyem/densities.py 2006-10-12 13:48:48 UTC (rev 2284)
+++ trunk/Lib/sandbox/pyem/densities.py 2006-10-12 13:48:58 UTC (rev 2285)
@@ -0,0 +1,257 @@
+#! /usr/bin/python
+#
+# Copyrighted David Cournapeau
+# Last Change: Thu Oct 05 07:00 PM 2006 J
+
+import numpy as N
+import numpy.linalg as lin
+from numpy.random import randn
+from scipy.stats import chi2
+
+# Error classes
+class DenError(Exception):
+ """Base class for exceptions in this module.
+
+ Attributes:
+ expression -- input expression in which the error occurred
+ message -- explanation of the error"""
+ def __init__(self, message):
+ self.message = message
+
+ def __str__(self):
+ return self.message
+
+# The following function do all the fancy stuff to check that parameters
+# are Ok, and call the right implementation if args are OK.
+def gauss_den(x, mu, va, log = False):
+ """ Compute multivariate Gaussian density at points x for
+ mean mu and variance va.
+
+ Vector are row vectors, except va which can be a matrix
+ (row vector variance for diagonal variance)
+
+ If log is True, than the log density is returned
+ (useful for underflow ?)"""
+ mu = N.atleast_2d(mu)
+ va = N.atleast_2d(va)
+ x = N.atleast_2d(x)
+
+ #=======================#
+ # Checking parameters #
+ #=======================#
+ if len(N.shape(mu)) != 2:
+ raise DenError("mu is not rank 2")
+
+ if len(N.shape(va)) != 2:
+ raise DenError("va is not rank 2")
+
+ if len(N.shape(x)) != 2:
+ raise DenError("x is not rank 2")
+
+ (n, d) = x.shape
+ (dm0, dm1) = mu.shape
+ (dv0, dv1) = va.shape
+
+ # Check x and mu same dimension
+ if dm0 != 1:
+ msg = "mean must be a row vector!"
+ raise DenError(msg)
+ if dm1 != d:
+ msg = "x and mu not same dim"
+ raise DenError(msg)
+ # Check va and mu same size
+ if dv1 != d:
+ msg = "mu and va not same dim"
+ raise DenError(msg)
+ if dv0 != 1 and dv0 != d:
+ msg = "va not square"
+ raise DenError(msg)
+
+ #===============#
+ # Computation #
+ #===============#
+ if d == 1:
+ # scalar case
+ return _scalar_gauss_den(x[:, 0], mu[0, 0], va[0, 0], log)
+ elif dv0 == 1:
+ # Diagonal matrix case
+ return _diag_gauss_den(x, mu, va, log)
+ elif dv1 == dv0:
+ # full case
+ return _full_gauss_den(x, mu, va, log)
+ else:
+ raise DenError("variance mode not recognized, this is a bug")
+
+# Those 3 functions do almost all the actual computation
+def _scalar_gauss_den(x, mu, va, log):
+ """ This function is the actual implementation
+ of gaussian pdf in scalar case. It assumes all args
+ are conformant, so it should not be used directly
+
+ Call gauss_den instead"""
+ d = mu.size
+ inva = 1/va
+ fac = (2*N.pi) ** (-d/2.0) * N.sqrt(inva)
+ y = ((x-mu) ** 2) * -0.5 * inva
+ if not log:
+ y = fac * N.exp(y)
+ else:
+ y = y + log(fac)
+
+ return y
+
+#from ctypes import cdll, c_uint, c_int, c_double, POINTER
+#_gden = cdll.LoadLibrary('src/libgden.so')
+#_gden.gden_diag.restype = c_int
+#_gden.gden_diag.argtypes = [POINTER(c_double), c_uint, c_uint,
+# POINTER(c_double), POINTER(c_double), POINTER(c_double)]
+
+def _diag_gauss_den(x, mu, va, log):
+ """ This function is the actual implementation
+ of gaussian pdf in scalar case. It assumes all args
+ are conformant, so it should not be used directly
+
+ Call gauss_den instead"""
+ # Diagonal matrix case
+ d = mu.size
+ n = x.shape[0]
+ if not log:
+ inva = 1/va[0,0]
+ fac = (2*N.pi) ** (-d/2.0) * N.sqrt(inva)
+ y = (x[:,0] - mu[0,0]) ** 2 * inva * -0.5
+ for i in range(1, d):
+ inva = 1/va[0,i]
+ fac *= N.sqrt(inva)
+ y += (x[:,i] - mu[0,i]) ** 2 * inva * -0.5
+ y = fac * N.exp(y)
+ else:
+ y = _scalar_gauss_den(x[:,0], mu[0,0], va[0,0], log)
+ for i in range(1, d):
+ y += _scalar_gauss_den(x[:,i], mu[0,i], va[0,i], log)
+ return y
+
+def _full_gauss_den(x, mu, va, log):
+ """ This function is the actual implementation
+ of gaussian pdf in full matrix case.
+
+ It assumes all args are conformant, so it should
+ not be used directly Call gauss_den instead
+
+ Does not check if va is definite positive (on inversible
+ for that matter), so the inverse computation and/or determinant
+ would throw an exception."""
+ d = mu.size
+ inva = lin.inv(va)
+ fac = 1 / N.sqrt( (2*N.pi) ** d * N.fabs(lin.det(va)))
+
+ # # Slow version
+ # n = N.size(x, 0)
+ # y = N.zeros(n)
+ # for i in range(n):
+ # y[i] = N.dot(x[i,:],
+ # N.dot(inva, N.transpose(x[i,:])))
+ # y *= -0.5
+
+ # we are using a trick with sum to "emulate"
+ # the matrix multiplication inva * x without any explicit loop
+ y = N.dot((x-mu), inva)
+ y = -0.5 * N.sum(y * (x-mu), 1)
+
+ if not log:
+ y = fac * N.exp(y)
+ else:
+ y = y + N.log(fac)
+
+ return y
+
+# To plot a confidence ellipse from multi-variate gaussian pdf
+def gauss_ell(mu, va, dim = [0, 1], npoints = 100, level = 0.39):
+ """ Given a mean and covariance for multi-variate
+ gaussian, returns npoints points for the ellipse
+ of confidence given by level (all points will be inside
+ the ellipsoides with a probability equal to level)
+
+ Returns the coordinate x and y of the ellipse"""
+
+ mu = N.atleast_1d(mu)
+ va = N.atleast_1d(va)
+ c = N.array(dim)
+
+ if mu.size == va.size:
+ mode = 'diag'
+ else:
+ if va.ndim == 2:
+ if va.shape[0] == va.shape[1]:
+ mode = 'full'
+ else:
+ raise DenError("variance not square")
+ else:
+ raise DenError("mean and variance are not dim conformant")
+
+ chi22d = chi2(2)
+ mahal = N.sqrt(chi22d.ppf(level))
+
+ # Generates a circle of npoints
+ theta = N.linspace(0, 2 * N.pi, npoints)
+ circle = mahal * N.array([N.cos(theta), N.sin(theta)])
+
+ # Get the dimension which we are interested in:
+ mu = mu[dim]
+ if mode == 'diag':
+ va = va[dim]
+ elps = N.outer(mu, N.ones(npoints))
+ elps += N.dot(N.diag(N.sqrt(va)), circle)
+ elif mode == 'full':
+ va = va[c,:][:,c]
+ # Method: compute the cholesky decomp of each cov matrix, that is
+ # compute cova such as va = cova * cova'
+ # WARN: scipy is different than matlab here, as scipy computes a lower
+ # triangular cholesky decomp:
+ # - va = cova * cova' (scipy)
+ # - va = cova' * cova (matlab)
+ # So take care when comparing results with matlab !
+ cova = lin.cholesky(va)
+ elps = N.outer(mu, N.ones(npoints))
+ elps += N.dot(cova, circle)
+ else:
+ raise DenParam("var mode not recognized")
+
+ return elps[0, :], elps[1, :]
+
+if __name__ == "__main__":
+ import pylab
+
+ #=========================================
+ # Test plotting a simple diag 2d variance:
+ #=========================================
+ va = N.array([5, 3])
+ mu = N.array([2, 3])
+
+ # Generate a multivariate gaussian of mean mu and covariance va
+ X = randn(1e3, 2)
+ Yc = N.dot(N.diag(N.sqrt(va)), X.transpose())
+ Yc = Yc.transpose() + mu
+
+ # Plotting
+ Xe, Ye = gauss_ell(mu, va, npoints = 100)
+ pylab.figure()
+ pylab.plot(Yc[:, 0], Yc[:, 1], '.')
+ pylab.plot(Xe, Ye, 'r')
+
+ #=========================================
+ # Test plotting a simple full 2d variance:
+ #=========================================
+ va = N.array([[0.2, 0.1],[0.1, 0.5]])
+ mu = N.array([0, 3])
+
+ # Generate a multivariate gaussian of mean mu and covariance va
+ X = randn(1e3, 2)
+ Yc = N.dot(lin.cholesky(va), X.transpose())
+ Yc = Yc.transpose() + mu
+
+ # Plotting
+ Xe, Ye = gauss_ell(mu, va, npoints = 100, level=0.95)
+ pylab.figure()
+ pylab.plot(Yc[:, 0], Yc[:, 1], '.')
+ pylab.plot(Xe, Ye, 'r')
+ pylab.show()
Added: trunk/Lib/sandbox/pyem/gauss_mix.py
===================================================================
--- trunk/Lib/sandbox/pyem/gauss_mix.py 2006-10-12 13:48:48 UTC (rev 2284)
+++ trunk/Lib/sandbox/pyem/gauss_mix.py 2006-10-12 13:48:58 UTC (rev 2285)
@@ -0,0 +1,438 @@
+# /usr/bin/python
+# Last Change: Tue Oct 03 06:00 PM 2006 J
+
+# Module to implement GaussianMixture class.
+
+import numpy as N
+from numpy.random import randn, rand
+import numpy.linalg as lin
+import densities
+
+MAX_DEV = 1e-10
+MAX_COND = 1e10
+
+# Right now, two main usages of a Gaussian Model are possible
+# - init a Gaussian Model with meta-parameters, and trains it
+# - set-up a Gaussian Model to sample it, draw ellipsoides
+# of confidences. In this case, we would like to init it with
+# known values of parameters. This can be done with the class method
+# fromval
+#
+# For now, we have to init with meta-parameters, and set
+# the parameters afterward. There should be a better way ?
+
+# TODO:
+# - change bounds methods of GM class instanciations so that it cannot
+# be used as long as w, mu and va are not set
+# - We have to use scipy now for chisquare pdf, so there may be other
+# methods to be used, ie for implementing random index.
+# - there is no check on internal state of the GM, that is does w, mu and va values
+# make sense (eg singular values)
+# - plot1d is still very rhough. There should be a sensible way to
+# modify the result plot (maybe returns a dic with global pdf, component pdf and
+# fill matplotlib handles). Should be coherent with plot
+class GmParamError:
+ """Exception raised for errors in gmm params
+
+ Attributes:
+ expression -- input expression in which the error occurred
+ message -- explanation of the error
+ """
+ def __init__(self, message):
+ self.message = message
+
+ def __str__(self):
+ return self.message
+
+class GM:
+ """Gaussian Mixture class. This is a simple container class
+ to hold Gaussian Mixture parameters (weights, mean, etc...).
+ It can also draw itself (confidence ellipses) and samples itself.
+
+ Is initiated by giving dimension, number of components and
+ covariance mode"""
+
+ # I am not sure it is useful to have a spherical mode...
+ _cov_mod = ['diag', 'full']
+
+ def __init__(self, d, k, mode = 'diag'):
+ """Init a Gaussian model of k components, each component being a
+ d multi-variate Gaussian, with covariance matrix of style mode"""
+ if mode not in self._cov_mod:
+ raise GmParamError("mode %s not recognized" + str(mode))
+
+ self.d = d
+ self.k = k
+ self.mode = mode
+
+ # Init to 0 all parameters, with the right dimensions.
+ # Not sure this is useful in python from an efficiency POV ?
+ self.w = N.zeros(k)
+ self.mu = N.zeros((k, d))
+ if mode == 'diag':
+ self.va = N.zeros((k, d))
+ elif mode == 'full':
+ self.va = N.zeros((k * d, d))
+
+ self.is_valid = False
+
+ def set_param(self, weights, mu, sigma):
+ """Set parameters of the model. Args should
+ be conformant with metparameters d and k given during
+ initialisation"""
+ k, d, mode = check_gmm_param(weights, mu, sigma)
+ if not k == self.k:
+ raise GmParamError("Number of given components is %d, expected %d"
+ % (shape(k), shape(self.k)))
+ if not d == self.d:
+ raise GmParamError("Dimension of the given model is %d, expected %d"
+ % (shape(d), shape(self.d)))
+ if not mode == self.mode and not d == 1:
+ raise GmParamError("Given covariance mode is %s, expected %s"
+ % (mode, self.mode))
+ self.w = weights
+ self.mu = mu
+ self.va = sigma
+
+ self.is_valid = True
+
+ @classmethod
+ def fromvalues(cls, weights, mu, sigma):
+ """This class method can be used to create a GM model
+ directly from its parameters weights, mean and variance
+
+ w, mu, va = GM.gen_param(d, k)
+ gm = GM(d, k)
+ gm.set_param(w, mu, va)
+
+ and
+
+ w, mu, va = GM.gen_param(d, k)
+ gm = GM.fromvalue(w, mu, va)
+
+ Are equivalent """
+ k, d, mode = check_gmm_param(weights, mu, sigma)
+ res = cls(d, k, mode)
+ res.set_param(weights, mu, sigma)
+ return res
+
+ def sample(self, nframes):
+ """ Sample nframes frames from the model """
+ if not self.is_valid:
+ raise GmParamError("""Parameters of the model has not been
+ set yet, please set them using self.set_param()""")
+
+ # State index (ie hidden var)
+ S = gen_rand_index(self.w, nframes)
+ # standard gaussian
+ X = randn(nframes, self.d)
+
+ if self.mode == 'diag':
+ X = self.mu[S, :] + X * N.sqrt(self.va[S,:])
+ elif self.mode == 'full':
+ # Faster:
+ cho = N.zeros((self.k, self.va.shape[1], self.va.shape[1]))
+ for i in range(self.k):
+ # Using cholesky looks more stable than sqrtm; sqrtm is not
+ # available in numpy anyway, only in scipy...
+ cho[i] = lin.cholesky(self.va[i*self.d:i*self.d+self.d,:])
+
+ for s in range(self.k):
+ tmpind = N.where(S == s)[0]
+ X[tmpind] = N.dot(X[tmpind], cho[s].transpose()) + self.mu[s]
+ else:
+ raise GmParamError('cov matrix mode not recognized, this is a bug !')
+
+ return X
+
+ def conf_ellipses(self, *args, **kargs):
+ """Returns a list of confidence ellipsoids describing the Gmm
+ defined by mu and va. Check densities.gauss_ell for details
+
+ Returns:
+ -Xe: a list of x coordinates for the ellipses (Xe[i] is
+ the array containing x coordinates of the ith Gaussian)
+ -Ye: a list of y coordinates for the ellipses
+
+ Example:
+ Suppose we have w, mu and va as parameters for a mixture, then:
+
+ gm = GM(d, k)
+ gm.set_param(w, mu, va)
+ X = gm.sample(1000)
+ Xe, Ye = gm.conf_ellipsoids()
+ pylab.plot(X[:,0], X[:, 1], '.')
+ for k in len(w):
+ pylab.plot(Xe[k], Ye[k], 'r')
+
+ Will plot samples X draw from the mixture model, and
+ plot the ellipses of equi-probability from the mean with
+ fixed level of confidence 0.39. """
+ if not self.is_valid:
+ raise GmParamError("""Parameters of the model has not been
+ set yet, please set them using self.set_param()""")
+
+ Xe = []
+ Ye = []
+ if self.mode == 'diag':
+ for i in range(self.k):
+ xe, ye = densities.gauss_ell(self.mu[i,:], self.va[i,:],
+ *args, **kargs)
+ Xe.append(xe)
+ Ye.append(ye)
+ elif self.mode == 'full':
+ for i in range(self.k):
+ xe, ye = densities.gauss_ell(self.mu[i,:],
+ self.va[i*self.d:i*self.d+self.d,:],
+ *args, **kargs)
+ Xe.append(xe)
+ Ye.append(ye)
+
+ return Xe, Ye
+
+ def check_state(self):
+ """
+ """
+ if not self.is_valid:
+ raise GmParamError("""Parameters of the model has not been
+ set yet, please set them using self.set_param()""")
+
+ if self.mode == 'full':
+ raise NotImplementedError, "not implemented for full mode yet"
+
+ # # How to check w: if one component is negligeable, what shall
+ # # we do ?
+ # M = N.max(self.w)
+ # m = N.min(self.w)
+
+ # maxc = m / M
+
+ # Check condition number for cov matrix
+ cond = N.zeros(self.k)
+ ava = N.absolute(self.va)
+ for c in range(self.k):
+ cond[c] = N.amax(ava[c,:]) / N.amin(ava[c,:])
+
+ print cond
+
+ def gen_param(self, d, nc, varmode = 'diag', spread = 1):
+ """Generate valid parameters for a gaussian mixture model.
+ d is the dimension, nc the number of components, and varmode
+ the mode for cov matrices.
+
+ This is a class method.
+
+ Returns: w, mu, va
+ """
+ w = abs(randn(nc))
+ w = w / sum(w, 0)
+
+ mu = spread * randn(nc, d)
+ if varmode == 'diag':
+ va = abs(randn(nc, d))
+ elif varmode == 'full':
+ va = randn(nc * d, d)
+ for k in range(nc):
+ va[k*d:k*d+d] = N.dot( va[k*d:k*d+d],
+ va[k*d:k*d+d].transpose())
+ else:
+ raise GmParamError('cov matrix mode not recognized')
+
+ return w, mu, va
+
+ gen_param = classmethod(gen_param)
+
+ def plot(self, *args, **kargs):
+ """Plot the ellipsoides directly for the model
+
+ Returns a list of lines, so that their style can be modified. By default,
+ the style is red color, and nolegend for all of them.
+
+ Does not work for 1d"""
+ if not self.is_valid:
+ raise GmParamError("""Parameters of the model has not been
+ set yet, please set them using self.set_param()""")
+
+ k = self.k
+ Xe, Ye = self.conf_ellipses(*args, **kargs)
+ try:
+ import pylab as P
+ return [P.plot(Xe[i], Ye[i], 'r', label='_nolegend_')[0] for i in range(k)]
+ #for i in range(k):
+ # P.plot(Xe[i], Ye[i], 'r')
+ except ImportError:
+ raise GmParamError("matplotlib not found, cannot plot...")
+
+ def plot1d(self, level = 0.5, fill = 0, gpdf = 0):
+ """This function plots the pdfs of each component of the model.
+ If gpdf is 1, also plots the global pdf. If fill is 1, fill confidence
+ areas using level argument as a level value
+
+ Returns a dictionary h of plot handles so that their properties can
+ be modified (eg color, label, etc...):
+ - h['pdf'] is a list of lines, one line per component pdf
+ - h['gpdf'] is the line for the global pdf
+ - h['conf'] is a list of filling area
+ """
+ # This is not optimized at all, may be slow. Should not be
+ # difficult to make much faster, but it is late, and I am lazy
+ if not self.d == 1:
+ raise GmParamError("the model is not one dimensional model")
+ from scipy.stats import norm
+ nrm = norm(0, 1)
+ pval = N.sqrt(self.va[:,0]) * nrm.ppf((1+level)/2)
+
+ # Compute reasonable min/max for the normal pdf
+ mc = 3
+ std = N.sqrt(self.va[:,0])
+ m = N.amin(self.mu[:, 0] - mc * std)
+ M = N.amax(self.mu[:, 0] + mc * std)
+
+ np = 500
+ x = N.linspace(m, M, np)
+ Yf = N.zeros(np)
+ Yt = N.zeros(np)
+
+ # Prepare the dic of plot handles to return
+ ks = ['pdf', 'conf', 'gpdf']
+ hp = dict((i,[]) for i in ks)
+ try:
+ import pylab as P
+ for c in range(self.k):
+ y = self.w[c]/(N.sqrt(2*N.pi) * std[c]) * \
+ N.exp(-(x-self.mu[c][0])**2/(2*std[c]**2))
+ Yt += y
+ h = P.plot(x, y, 'r', label ='_nolegend_')
+ hp['pdf'].extend(h)
+ if fill:
+ #P.axvspan(-pval[c] + self.mu[c][0], pval[c] + self.mu[c][0],
+ # facecolor = 'b', alpha = 0.2)
+ id1 = -pval[c] + self.mu[c]
+ id2 = pval[c] + self.mu[c]
+ xc = x[:, N.where(x>id1)[0]]
+ xc = xc[:, N.where(xc<id2)[0]]
+ Yf = self.w[c]/(N.sqrt(2*N.pi) * std[c]) * \
+ N.exp(-(xc-self.mu[c][0])**2/(2*std[c]**2))
+ xc = N.concatenate(([xc[0]], xc, [xc[-1]]))
+ Yf = N.concatenate(([0], Yf, [0]))
+ h = P.fill(xc, Yf,
+ facecolor = 'b', alpha = 0.1, label='_nolegend_')
+ hp['conf'].extend(h)
+ #P.fill([xc[0], xc[0], xc[-1], xc[-1]],
+ # [0, Yf[0], Yf[-1], 0], facecolor = 'b', alpha = 0.2)
+ if gpdf:
+ h = P.plot(x, Yt, 'r:', label='_nolegend_')
+ hp['gpdf'] = h
+ return hp
+ except ImportError:
+ raise GmParamError("matplotlib not found, cannot plot...")
+
+# Function to generate a random index: this is kept outside any class,
+# as the function can be useful for other
+def gen_rand_index(p, n):
+ """Generate a N samples vector containing random index between 1
+ and length(p), each index i with probability p(i)"""
+ # TODO Check args here
+
+ # TODO: check each value of inverse distribution is
+ # different
+ invcdf = N.cumsum(p)
+ uni = rand(n)
+ index = N.zeros(n, dtype=int)
+
+ # This one should be a bit faster
+ for k in range(len(p)-1, 0, -1):
+ blop = N.where(N.logical_and(invcdf[k-1] <= uni,
+ uni < invcdf[k]))
+ index[blop] = k
+
+ return index
+
+def check_gmm_param(w, mu, va):
+ """Check that w, mu and va are valid parameters for
+ a mixture of gaussian: w should sum to 1, there should
+ be the same number of component in each param, the variances
+ should be positive definite, etc...
+
+ Params:
+ w = vector or list of weigths of the mixture (K elements)
+ mu = matrix: K * d
+ va = list of variances (vector K * d or square matrices Kd * d)
+
+ returns:
+ K = number of components
+ d = dimension
+ mode = 'diag' if diagonal covariance, 'full' of full matrices
+ """
+
+ # Check that w is valid
+ if N.fabs(N.sum(w, 0) - 1) > MAX_DEV:
+ raise GmParamError('weight does not sum to 1')
+
+ if not len(w.shape) == 1:
+ raise GmParamError('weight is not a vector')
+
+ # Check that mean and va have the same number of components
+ K = len(w)
+
+ if N.ndim(mu) < 2:
+ msg = "mu should be a K,d matrix, and a row vector if only 1 comp"
+ raise GmParamError(msg)
+ if N.ndim(va) < 2:
+ msg = """va should be a K,d / K *d, d matrix, and a row vector if
+ only 1 diag comp"""
+ raise GmParamError(msg)
+
+ (Km, d) = mu.shape
+ (Ka, da) = va.shape
+
+ if not K == Km:
+ msg = "not same number of component in mean and weights"
+ raise GmParamError(msg)
+
+ if not d == da:
+ msg = "not same number of dimensions in mean and variances"
+ raise GmParamError(msg)
+
+ if Km == Ka:
+ mode = 'diag'
+ else:
+ mode = 'full'
+ if not Ka == Km*d:
+ msg = "not same number of dimensions in mean and variances"
+ raise GmParamError(msg)
+
+ return K, d, mode
+
+if __name__ == '__main__':
+ # Meta parameters:
+ # - k = number of components
+ # - d = dimension
+ # - mode : mode of covariance matrices
+ d = 5
+ k = 4
+
+ # Now, drawing a model
+ mode = 'full'
+ nframes = 1e3
+
+ # Build a model with random parameters
+ w, mu, va = GM.gen_param(d, k, mode, spread = 3)
+ gm = GM.fromvalues(w, mu, va)
+
+ # Sample nframes frames from the model
+ X = gm.sample(nframes)
+
+ # Plot the data
+ import pylab as P
+ P.plot(X[:, 0], X[:, 1], '.', label = '_nolegend_')
+
+ # Real confidence ellipses with confidence level
+ level = 0.50
+ h = gm.plot(level=level)
+
+ # set the first ellipse label, which will appear in the legend
+ h[0].set_label('confidence ell at level ' + str(level))
+
+ P.legend(loc = 0)
+ P.show()
Added: trunk/Lib/sandbox/pyem/gmm_em.py
===================================================================
--- trunk/Lib/sandbox/pyem/gmm_em.py 2006-10-12 13:48:48 UTC (rev 2284)
+++ trunk/Lib/sandbox/pyem/gmm_em.py 2006-10-12 13:48:58 UTC (rev 2285)
@@ -0,0 +1,394 @@
+# /usr/bin/python
+# Last Change: Fri Oct 06 05:00 PM 2006 J
+
+# TODO:
+# - which methods to avoid va shrinking to 0 ? There are several options,
+# not sure which ones are appropriates
+# - improve EM trainer
+# - online EM
+
+import numpy as N
+import numpy.linalg as lin
+from numpy.random import randn
+#import _c_densities as densities
+import densities
+from kmean import kmean
+from gauss_mix import GM
+
+# Error classes
+class GmmError(Exception):
+ """Base class for exceptions in this module."""
+ pass
+
+class GmmParamError:
+ """Exception raised for errors in gmm params
+
+ Attributes:
+ expression -- input expression in which the error occurred
+ message -- explanation of the error
+ """
+ def __init__(self, message):
+ self.message = message
+
+ def __str__(self):
+ return self.message
+
+# Not sure yet about how to design different mixture models. Most of the code
+# is different # (pdf, update part of EM, etc...) and I am not sure it makes
+# sense to use inheritance for # interface specification in python, since its
+# dynamic type systeme.
+
+# Anyway, a mixture class should encapsulates all details concerning a mixture model:
+# - internal parameters for the pdfs
+# - can compute sufficient statistics for EM
+# - can sample a model
+# - can generate random valid parameters for a new pdf (using class method)
+class MixtureModel:
+ pass
+
+class ExpMixtureModel(MixtureModel):
+ """Class to model mixture of exponential pdf (eg Gaussian, exponential, Laplace,
+ etc..). This is a special case because some parts of EM are common for those
+ models..."""
+ pass
+
+class GMM(ExpMixtureModel):
+ """ A class to model a Gaussian Mixture Model (GMM). An instance of
+ this class is created by giving weights, mean and variances in the ctor.
+ An instanciated object can be sampled, trained by EM.
+
+ The class method gen_model can be used without instanciation."""
+
+ def init_kmean(self, data, niter = 5):
+ """ Init the model with kmean."""
+ k = self.gm.k
+ d = self.gm.d
+ init = data[0:k, :]
+
+ (code, label) = kmean(data, init, niter)
+
+ w = N.ones(k) / k
+ mu = code.copy()
+ if self.gm.mode == 'diag':
+ va = N.zeros((k, d))
+ for i in range(k):
+ for j in range(d):
+ va[i,j] = N.cov(data[N.where(label==i), j], rowvar = 0)
+ elif self.gm.mode == 'full':
+ va = N.zeros((k*d, d))
+ for i in range(k):
+ va[i*d:i*d+d,:] = \
+ N.cov(data[N.where(label==i)], rowvar = 0)
+ else:
+ raise GmmParamError("mode " + str(mode) + " not recognized")
+
+ self.gm.set_param(w, mu, va)
+
+ def init_random(self, data):
+ """ Init the model at random."""
+ k = self.gm.k
+ d = self.gm.d
+ if mode == 'diag':
+ w = N.ones(k) / k
+ mu = randn(k, d)
+ va = N.fabs(randn(k, d))
+ else:
+ raise GmmParamError("""init_random not implemented for
+ mode %s yet""", mode)
+
+ self.gm.set_param(w, mu, va)
+
+ # TODO:
+ # - format of parameters ? For variances, list of variances matrix,
+ # keep the current format, have 3d matrices ?
+ # - To handle the different modes, we could do something "fancy" such as
+ # replacing methods, to avoid checking cases everywhere and unconsistency.
+ def __init__(self, gm, init = 'kmean'):
+ """ Initialize a GMM with weight w, mean mu and variances va, and initialization
+ method for training init (kmean by default)"""
+ self.gm = gm
+
+ # Possible init methods
+ init_methods = {'kmean': self.init_kmean, 'random' : self.init_random}
+
+ if init not in init_methods:
+ raise GmmParamError('init method %s not recognized' + str(init))
+
+ self.init = init_methods[init]
+
+ def sufficient_statistics(self, data):
+ """ Return normalized and non-normalized sufficient statistics
+ from the model.
+
+ Computes the latent variable distribution (a
+ posteriori probability) knowing the explicit data
+ for the Gaussian model (w, mu, var): gamma(t, i) =
+ P[state = i | observation = data(t); w, mu, va]
+
+ This is basically the E step of EM for GMM."""
+ n = data.shape[0]
+
+ # compute the gaussian pdf
+ tgd = multiple_gauss_den(data, self.gm.mu, self.gm.va)
+ # multiply by the weight
+ tgd *= self.gm.w
+ # Normalize to get a pdf
+ gd = tgd / N.sum(tgd, axis=1)[:, N.newaxis]
+
+ return gd, tgd
+
+ def update_em(self, data, gamma):
+ """Computes update of the Gaussian Mixture Model (M step)
+ from the a posteriori pdf, computed by gmm_posterior
+ (E step).
+ """
+ k = self.gm.k
+ d = self.gm.d
+ n = data.shape[0]
+ invn = 1.0/n
+ mGamma = N.sum(gamma, axis = 0)
+
+ if self.gm.mode == 'diag':
+ mu = N.zeros((k, d))
+ va = N.zeros((k, d))
+ gamma = gamma.T
+ for c in range(k):
+ x = N.dot(gamma[c:c+1,:], data)[0,:]
+ xx = N.dot(gamma[c:c+1,:], data ** 2)[0,:]
+
+ mu[c,:] = x / mGamma[c]
+ va[c,:] = xx / mGamma[c] - mu[c,:] ** 2
+ w = invn * mGamma
+
+ elif self.gm.mode == 'full':
+ # In full mode, this is the bottleneck: the triple loop
+ # kills performances. This is pretty straightforward
+ # algebra, so computing it in C should not be too difficult
+ mu = N.zeros((k, d))
+ va = N.zeros((k*d, d))
+
+ gamma = gamma.transpose()
+ for c in range(k):
+ #x = N.sum(N.outer(gamma[:, c],
+ # N.ones((1, d))) * data, axis = 0)
+ x = N.dot(gamma[c:c+1,:], data)[0,:]
+ xx = N.zeros((d, d))
+
+ # This should be much faster than recursing on n...
+ for i in range(d):
+ for j in range(d):
+ xx[i,j] = N.sum(data[:,i] * data[:,j] * gamma[c,:], axis = 0)
+
+ mu[c,:] = x / mGamma[c]
+ va[c*d:c*d+d,:] = xx / mGamma[c] - \
+ N.outer(mu[c,:], mu[c,:])
+ w = invn * mGamma
+ else:
+ raise GmmParamError("varmode not recognized")
+
+ self.gm.set_param(w, mu, va)
+
+class EM:
+ """An EM trainer. An EM trainer
+ trains from data, with a model
+
+ Not really useful yet"""
+ def __init__(self):
+ pass
+
+ def train(self, data, model, maxiter = 10, thresh = 1e-5):
+ """
+ Train a model using data, and stops when the likelihood fails
+ behind a threshold, or when the number of iterations > niter,
+ whichever comes first
+
+ Args:
+ - data: contains the observed features, one row is one frame, ie one
+ observation of dimension d
+ - model: object of class Mixture
+ - maxiter: maximum number of iterations
+
+ The model is trained, and its parameters updated accordingly.
+
+ Returns:
+ likelihood (one value per iteration).
+ """
+
+ # Initialize the data (may do nothing depending on the model)
+ model.init(data)
+
+ # Likelihood is kept
+ like = N.zeros(maxiter)
+
+ # Em computation, with computation of the likelihood
+ g, tgd = model.sufficient_statistics(data)
+ like[0] = N.sum(N.log(N.sum(tgd, 1)), axis = 0)
+ model.update_em(data, g)
+ for i in range(1, maxiter):
+ g, tgd = model.sufficient_statistics(data)
+ like[i] = N.sum(N.log(N.sum(tgd, 1)), axis = 0)
+ model.update_em(data, g)
+ if has_em_converged(like[i], like[i-1], thresh):
+ return like[0:i]
+
+ return like
+
+# Misc functions
+def has_em_converged(like, plike, thresh):
+ """ given likelihood of current iteration like and previous
+ iteration plike, returns true is converged: based on comparison
+ of the slope of the likehood with thresh"""
+ diff = N.abs(like - plike)
+ avg = 0.5 * (N.abs(like) + N.abs(plike))
+ if diff / avg < thresh:
+ return True
+ else:
+ return False
+
+def multiple_gauss_den(data, mu, va):
+ """Helper function to generate several Gaussian
+ pdf (different parameters) from the same data"""
+ mu = N.atleast_2d(mu)
+ va = N.atleast_2d(va)
+
+ K = mu.shape[0]
+ n = data.shape[0]
+ d = data.shape[1]
+
+ y = N.zeros((K, n))
+ if mu.size == va.size:
+ for i in range(K):
+ y[i] = densities.gauss_den(data, mu[i, :], va[i, :])
+ return y.T
+ else:
+ for i in range(K):
+ y[:, i] = densities.gauss_den(data, mu[i, :],
+ va[d*i:d*i+d, :])
+
+ return y
+
+if __name__ == "__main__":
+ import copy
+ #=============================
+ # Simple GMM with 5 components
+ #=============================
+
+ #+++++++++++++++++++++++++++++
+ # Meta parameters of the model
+ # - k: Number of components
+ # - d: dimension of each Gaussian
+ # - mode: Mode of covariance matrix: full or diag
+ # - nframes: number of frames (frame = one data point = one
+ # row of d elements
+ k = 2
+ d = 1
+ mode = 'full'
+ nframes = 1e3
+
+ #+++++++++++++++++++++++++++++++++++++++++++
+ # Create an artificial GMM model, samples it
+ #+++++++++++++++++++++++++++++++++++++++++++
+ print "Generating the mixture"
+ # Generate a model with k components, d dimensions
+ w, mu, va = GM.gen_param(d, k, mode, spread = 3)
+ gm = GM(d, k, mode)
+ gm.set_param(w, mu, va)
+
+ # Sample nframes frames from the model
+ data = gm.sample(nframes)
+
+ #++++++++++++++++++++++++
+ # Learn the model with EM
+ #++++++++++++++++++++++++
+
+ # Init the model
+ print "Init a model for learning, with kmean for initialization"
+ lgm = GM(d, k, mode)
+ gmm = GMM(lgm, 'kmean')
+ gmm.init(data)
+
+ # Keep the initialized model for drawing
+ gm0 = copy.copy(lgm)
+
+ # The actual EM, with likelihood computation
+ niter = 10
+ like = N.zeros(niter)
+
+ print "computing..."
+ for i in range(niter):
+ g, tgd = gmm.sufficient_statistics(data)
+ like[i] = N.sum(N.log(N.sum(tgd, 1)), axis = 0)
+ gmm.update_em(data, g)
+ # # Alternative form, by using EM class: as the EM class
+ # # is quite rudimentary now, it is not very useful, just save
+ # # a few lines
+ # em = EM()
+ # like = em.train(data, gmm, niter)
+
+ #+++++++++++++++
+ # Draw the model
+ #+++++++++++++++
+ print "drawing..."
+ import pylab as P
+ P.subplot(2, 1, 1)
+
+ if not d == 1:
+ # Draw what is happening
+ P.plot(data[:, 0], data[:, 1], '.', label = '_nolegend_')
+
+ # Real confidence ellipses
+ Xre, Yre = gm.conf_ellipses()
+ P.plot(Xre[0], Yre[0], 'g', label = 'true confidence ellipsoides')
+ for i in range(1,k):
+ P.plot(Xre[i], Yre[i], 'g', label = '_nolegend_')
+
+ # Initial confidence ellipses as found by kmean
+ X0e, Y0e = gm0.conf_ellipses()
+ P.plot(X0e[0], Y0e[0], 'k', label = 'initial confidence ellipsoides')
+ for i in range(1,k):
+ P.plot(X0e[i], Y0e[i], 'k', label = '_nolegend_')
+
+ # Values found by EM
+ Xe, Ye = lgm.conf_ellipses()
+ P.plot(Xe[0], Ye[0], 'r', label = 'confidence ellipsoides found by EM')
+ for i in range(1,k):
+ P.plot(Xe[i], Ye[i], 'r', label = '_nolegend_')
+ P.legend(loc = 0)
+ else:
+ # Real confidence ellipses
+ h = gm.plot1d()
+ [i.set_color('g') for i in h['pdf']]
+ h['pdf'][0].set_label('true pdf')
+
+ # Initial confidence ellipses as found by kmean
+ h0 = gm0.plot1d()
+ [i.set_color('k') for i in h0['pdf']]
+ h0['pdf'][0].set_label('initial pdf')
+
+ # Values found by EM
+ hl = lgm.plot1d(fill = 1, level = 0.66)
+ [i.set_color('r') for i in hl['pdf']]
+ hl['pdf'][0].set_label('pdf found by EM')
+
+ P.legend(loc = 0)
+
+ P.subplot(2, 1, 2)
+ P.plot(like)
+ P.title('log likelihood')
+
+ # #++++++++++++++++++
+ # # Export the figure
+ # #++++++++++++++++++
+ # F = P.gcf()
+ # DPI = F.get_dpi()
+ # DefaultSize = F.get_size_inches()
+ # # the default is 100dpi for savefig:
+ # F.savefig("example1.png")
+
+ # # Now make the image twice as big, while keeping the fonts and all the
+ # # same size
+ # F.set_figsize_inches( (DefaultSize[0]*2, DefaultSize[1]*2) )
+ # Size = F.get_size_inches()
+ # print "Size in Inches", Size
+ # F.savefig("example2.png")
+ P.show()
Added: trunk/Lib/sandbox/pyem/info.py
===================================================================
--- trunk/Lib/sandbox/pyem/info.py 2006-10-12 13:48:48 UTC (rev 2284)
+++ trunk/Lib/sandbox/pyem/info.py 2006-10-12 13:48:58 UTC (rev 2285)
@@ -0,0 +1,35 @@
+"""
+Routines for Gaussian Mixture Models
+and learning with Expectation Maximization
+==========================================
+
+This module contains classes and function to compute multivariate Gaussian densities
+(diagonal and full covariance matrices), Gaussian mixtures, Gaussian mixtures models
+and an Em trainer.
+
+More specifically, the module contains the following file:
+
+- densities.py: functions to compute multivariate Gaussian pdf and ellipsoides of
+confidence (gauss_den)
+- gauss_mix.py: defines the GM (Gaussian Mixture) class. A Gaussian Mixture can be
+created from its parameters weights, mean and variances, or from its meta parameters
+d (dimension of the Gaussian) and k (number of components in the mixture). A Gaussian
+Model can then be sampled or plot (if d>1, plot confidence ellipsoids projected on
+2 chosen dimensions, if d == 1, plot the pdf of each component and fill the zone
+of confidence for a given level)
+- gmm_em.py: defines a class GMM (Gaussian Mixture Model). This class is constructed
+from a GM model gm, and can be used to train gm. The GMM can be initiated by
+kmean or at random, and can compute sufficient statistics, and update its parameters
+from the sufficient statistics.
+- kmean.py: implements a kmean algorithm. We cannot use scipy.cluster.vq kmeans, since
+its does not give membership of observations.
+
+Example of use: simply execute gmm_em.py for an example of training a GM and plotting
+the results compared to true values
+
+Copyright: David Cournapeau 2006
+License: BSD-style (see LICENSE.txt in main source directory)
+"""
+
+depends = ['linalg', 'stats']
+ignore = False
Added: trunk/Lib/sandbox/pyem/kmean.py
===================================================================
--- trunk/Lib/sandbox/pyem/kmean.py 2006-10-12 13:48:48 UTC (rev 2284)
+++ trunk/Lib/sandbox/pyem/kmean.py 2006-10-12 13:48:58 UTC (rev 2285)
@@ -0,0 +1,76 @@
+# /usr/bin/python
+# Last Change: Thu Sep 28 01:00 PM 2006 J
+
+#TODO:
+# - a demo for kmeans
+
+import numpy as N
+
+def _py_vq(data, code):
+ """ Please do not use directly. Use kmean instead"""
+ # No attempt to be efficient has been made...
+ (n, d) = data.shape
+ (k, d) = code.shape
+
+ label = N.zeros(n, int)
+ for i in range(n):
+ d = N.sum((data[i, :] - code) ** 2, 1)
+ label[i] = N.argmin(d)
+
+ return label
+
+# Try to import pyrex function for vector quantization. If not available,
+# falls back on pure python implementation.
+#%KMEANIMPORT%
+#try:
+# from scipy.cluster.vq import kmeans as kmean
+#except ImportError:
+# try:
+# from c_gmm import _vq
+# except:
+# print """c_gmm._vq not found, using pure python implementation instead.
+# Kmean will be REALLY slow"""
+# _vq = _py_vq
+try:
+ from scipy.cluster.vq import vq
+ print "using scipy.cluster.vq"
+ def _vq(*args, **kw): return vq(*args, **kw)[0]
+except ImportError:
+ try:
+ from c_gmm import _vq
+ print "using pyrex vq"
+ except ImportError:
+ print """c_gmm._vq not found, using pure python implementation instead.
+ Kmean will be REALLY slow"""
+ _vq = _py_vq
+
+def kmean(data, init, iter = 10):
+ """Simple kmean implementation for EM. Runs iter iterations.
+
+ returns a tuple (code, label), where code are the final
+ centroids, and label are the class label indec for each
+ frame (ie row) of data"""
+
+ data = N.atleast_2d(data)
+ init = N.atleast_2d(init)
+
+ (n, d) = data.shape
+ (k, d1) = init.shape
+
+ if not d == d1:
+ msg = "data and init centers do not have same dimensions..."
+ raise GmmParamError(msg)
+
+ code = N.asarray(init.copy())
+ for i in range(iter):
+ # Compute the nearest neighbour for each obs
+ # using the current code book
+ label = _vq(data, code)
+ # Update the code by computing centroids using the new code book
+ for j in range(k):
+ code[j,:] = N.mean(data[N.where(label==j)], axis=0)
+
+ return code, label
+
+if __name__ == "__main__":
+ pass
Added: trunk/Lib/sandbox/pyem/online_em.py
===================================================================
--- trunk/Lib/sandbox/pyem/online_em.py 2006-10-12 13:48:48 UTC (rev 2284)
+++ trunk/Lib/sandbox/pyem/online_em.py 2006-10-12 13:48:58 UTC (rev 2285)
@@ -0,0 +1,369 @@
+# /usr/bin/python
+# Last Change: Thu Oct 12 09:00 PM 2006 J
+
+#---------------------------------------------
+# This is not meant to be used yet !!!! I am
+# not sure how to integrate this stuff inside
+# the package yet. The cases are:
+# - we have a set of data, and we want to test online EM
+# compared to normal EM
+# - we do not have all the data before putting them in online EM:
+# eg current frame depends on previous frame in some way.
+
+import numpy as N
+from numpy import repmat, mean
+from numpy.testing import assert_array_almost_equal, assert_array_equal
+
+from gmm_em import ExpMixtureModel, GMM, EM, multiple_gauss_den
+from gauss_mix import GM
+from kmean import kmean
+
+print "======================================================"
+import traceback
+f = traceback.extract_stack()
+print f[0][0] + " This is beta code, don't use it ! "
+print "======================================================"
+
+# Error classes
+class OnGmmError(Exception):
+ """Base class for exceptions in this module."""
+ pass
+
+class OnGmmParamError:
+ """Exception raised for errors in gmm params
+
+ Attributes:
+ expression -- input expression in which the error occurred
+ message -- explanation of the error
+ """
+ def __init__(self, message):
+ self.message = message
+
+ def __str__(self):
+ return self.message
+
+class OnGMM(ExpMixtureModel):
+ def init_kmean(self, init_data, niter = 5):
+ """ Init the model at random."""
+ k = self.gm.k
+ d = self.gm.d
+ if mode == 'diag':
+ w = N.ones(k) / k
+
+ # Init the internal state of EM
+ self.cx = N.outer(w, mean(init_data, 0))
+ self.cxx = N.outer(w, mean(init_data ** 2, 0))
+
+ # w, mu and va init is the same that in the standard case
+ (code, label) = kmean(init_data, init_data[0:k, :], niter)
+ mu = code.copy()
+ va = N.zeros((k, d))
+ for i in range(k):
+ for j in range(d):
+ va [i,j] = N.cov(init_data[N.where(label==i), j], rowvar = 0)
+ else:
+ raise OnGmmParamError("""init_online not implemented for
+ mode %s yet""", mode)
+
+ self.gm.set_param(w, mu, va)
+ self.cw = w
+ self.cmu = mu
+ self.cva = va
+
+ self.pw = self.cw.copy()
+ self.pmu = self.cmu.copy()
+ self.pva = self.cva.copy()
+
+ def __init__(self, gm, init_data, init = 'kmean'):
+ self.gm = gm
+
+ # Possible init methods
+ init_methods = {'kmean' : self.init_kmean}
+
+ self.init = init_methods[init]
+
+ def sufficient_statistics(self, frame, nu):
+ """ sufficient_statistics(frame, nu)
+
+ frame has to be rank 2 !"""
+ gamma = multiple_gauss_den(frame, self.pmu, self.pva)[0]
+ gamma *= self.pw
+ gamma /= N.sum(gamma)
+ # <1>(t) = cw(t), each column is one component cw = (cw1, ..., cwK);
+ self.cw = (1 - nu) * self.cw + nu * gamma
+
+ return gamma
+
+ def update_em(self, frame, gamma, nu):
+ cx = self.cx
+ cxx = self.cxx
+ cmu = self.cmu
+ cva = self.cva
+ for k in range(self.gm.k):
+ cx[k] = (1 - nu) * cx[k] + nu * frame * gamma[k]
+ cxx[k] = (1 - nu) * cxx[k] + nu * frame ** 2 * gamma[k]
+
+ cmu[k] = cx[k] / cw[k]
+ cva[k] = cxx[k] / cw[k] - cmu[k] ** 2
+
+if __name__ == "__main__":
+ import copy
+ #=============================
+ # Simple GMM with 2 components
+ #=============================
+
+ #+++++++++++++++++++++++++++++
+ # Meta parameters of the model
+ # - k: Number of components
+ # - d: dimension of each Gaussian
+ # - mode: Mode of covariance matrix: full or diag
+ # - nframes: number of frames (frame = one data point = one
+ # row of d elements
+ k = 2
+ d = 1
+ mode = 'diag'
+ nframes = int(1e3)
+ emiter = 2
+
+ #+++++++++++++++++++++++++++++++++++++++++++
+ # Create an artificial GMM model, samples it
+ #+++++++++++++++++++++++++++++++++++++++++++
+ print "Generating the mixture"
+ # Generate a model with k components, d dimensions
+ w, mu, va = GM.gen_param(d, k, mode, spread = 1.5)
+ gm = GM.fromvalues(w, mu, va)
+
+ # Sample nframes frames from the model
+ data = gm.sample(nframes, )
+
+ #++++++++++++++++++++++++
+ # Learn the model with EM
+ #++++++++++++++++++++++++
+
+ # Init the model
+ print "Init a model for learning, with kmean for initialization"
+ lgm = GM(d, k, mode)
+ gmm = GMM(lgm, 'kmean')
+ gmm.init(data)
+
+ # Keep the initialized model for drawing
+ gm0 = copy.copy(lgm)
+
+ # The actual EM, with likelihood computation
+ like = N.zeros(emiter)
+ print "computing..."
+ for i in range(emiter):
+ g, tgd = gmm.sufficient_statistics(data)
+ like[i] = N.sum(N.log(N.sum(tgd, 1)), axis = 0)
+ gmm.update_em(data, g)
+
+ #+++++++++++++++++++++++++++++++
+ # Learn the model with Online EM
+ #+++++++++++++++++++++++++++++++
+ ogm = GM(d, k, mode)
+ ogmm = OnGMM(ogm, 'kmean')
+ #init_data = data[0:nframes / 20, :]
+ init_data = data
+ ogmm.init(init_data)
+
+ ogm2 = GM(d, k, mode)
+ ogmm2 = OnGMM(ogm2, 'kmean')
+ #init_data = data[0:nframes / 20, :]
+ init_data = data
+ ogmm2.init(init_data)
+
+ ogm0 = copy.copy(ogm)
+ assert_array_equal(ogm0.w, gm0.w)
+ assert_array_equal(ogm0.mu, gm0.mu)
+ assert_array_equal(ogm0.va, gm0.va)
+
+ ogm02 = copy.copy(ogm2)
+ assert_array_equal(ogm02.w, gm0.w)
+ assert_array_equal(ogm02.mu, gm0.mu)
+ assert_array_equal(ogm02.va, gm0.va)
+
+ w0 = gm0.w.copy()
+ mu0 = gm0.mu.copy()
+ va0 = gm0.va.copy()
+
+ cx = ogmm.cx
+ cxx = ogmm.cxx
+
+ cw = w0.copy()
+ cmu = mu0.copy()
+ cva = va0.copy()
+
+ pw = cw.copy()
+ pmu = cmu.copy()
+ pva = cva.copy()
+
+ # Forgetting param
+ ku = 0.005
+ t0 = 200
+ lamb = N.ones((nframes, 1))
+ lamb[0] = 0
+ nu0 = 1.0
+ #lamb = 1 - 1/(N.arange(-1, nframes-1) * ku + t0)
+ #nu0 = 0.2
+ nu = N.zeros((len(lamb), 1))
+ nu[0] = nu0
+ for i in range(1, len(lamb)):
+ nu[i] = 1./(1 + lamb[i] / nu[i-1])
+
+ gamma = N.zeros((nframes, k))
+ agamma = []
+ apw = []
+ apmu = []
+ apva = []
+ print "========== Online Manual ==========="
+ for e in range(emiter):
+ print "online manual: estep %d, printing p* state " % e
+ apw.append(pw.copy())
+ apmu.append(pmu.copy())
+ apva.append(pva.copy())
+ for t in range(nframes):
+ gamma[t] = multiple_gauss_den(data[t:t+1, :], pmu, pva)[0]
+ gamma[t] *= pw
+ gamma[t] /= N.sum(gamma[t])
+ # <1>(t) = cw(t), each column is one component cw = (cw1, ..., cwK);
+ # <x>(t) = cx(t)
+ cw = (1 - nu[t]) * cw + nu[t] * gamma[t]
+ # loop through each component
+ for i in range(k):
+ cx[i] = (1 - nu[t]) * cx[i] + nu[t] * data[t, :] * gamma[t, i]
+ cxx[i] = (1 - nu[t]) * cxx[i] + nu[t] * data[t, :] ** 2 * gamma[t, i]
+
+ cmu[i] = cx[i] / cw[i]
+ cva[i] = cxx[i] / cw[i] - cmu[i] ** 2
+
+ #pw = cw.copy()
+ #pmu = cmu.copy()
+ #pva = cva.copy()
+ print "gamma[end]: " + str(gamma[-1])
+ pw = cw.copy()
+ pmu = cmu.copy()
+ pva = cva.copy()
+ agamma.append(gamma.copy())
+
+ gamma2 = N.zeros((nframes, k))
+ agamma2 = []
+ apw2 = []
+ apmu2 = []
+ apva2 = []
+ print "========== Online Automatic ==========="
+ for e in range(emiter):
+ print "online automatic: estep %d, printing p* state " % e
+ apw2.append(ogmm2.pw.copy())
+ apmu2.append(ogmm2.pmu.copy())
+ apva2.append(ogmm2.pva.copy())
+ for t in range(nframes):
+ gamma2[t] = ogmm2.sufficient_statistics(data[t:t+1, :], nu[t])
+ #gamma2[t] = multiple_gauss_den(data[t:t+1, :], ogmm2.pmu, ogmm2.pva)[0]
+ #gamma2[t] *= ogmm2.pw
+ #gamma2[t] /= N.sum(gamma2[t])
+ #try:
+ # assert_array_equal(agamma, gamma2[t])
+ #except AssertionError:
+ # print "error for estep %d, step %d" % (e, t)
+ # print ogmm2.pw
+ # print ogmm2.pmu
+ # print ogmm2.pva
+ # raise
+ ogmm2.update_em(data[t, :], gamma2[t], nu[t])
+ #ogmm2.cw = (1 - nu[t]) * ogmm2.cw + nu[t] * agamma
+ ## loop through each component
+ #for i in range(k):
+ # ogmm2.cx[i] = (1 - nu[t]) * ogmm2.cx[i] + nu[t] * data[t, :] * agamma[i]
+ # ogmm2.cxx[i] = (1 - nu[t]) * ogmm2.cxx[i] + nu[t] * data[t, :] ** 2 * agamma[i]
+
+ # ogmm2.cmu[i] = ogmm2.cx[i] / ogmm2.cw[i]
+ # ogmm2.cva[i] = ogmm2.cxx[i] / ogmm2.cw[i] - ogmm2.cmu[i] ** 2
+
+ print "gamma[end]: " + str(gamma2[-1])
+ agamma2.append(gamma2.copy())
+ ogmm2.pw = ogmm2.cw.copy()
+ ogmm2.pmu = ogmm2.cmu.copy()
+ ogmm2.pva = ogmm2.cva.copy()
+
+ # #ogm.set_param(pw, pmu, pva)
+ # print "weights off vs on: \n" + str(lgm.w) + "\n " + str(cw)
+ # print "mean off vs on: \n" + str(lgm.mu) + "\n " + str(cmu)
+ # print "variances off vs on: \n" + str(lgm.va) + "\n " + str(cva)
+ # print "weights off vs on2: \n" + str(lgm.w) + "\n " + str(ogmm2.cw)
+ # print "mean off vs on2: \n" + str(lgm.mu) + "\n " + str(ogmm2.cmu)
+ # print "variances off vs on2: \n" + str(lgm.va) + "\n " + str(ogmm2.cva)
+ # assert_array_almost_equal(cw, lgm.w)
+ # assert_array_almost_equal(cmu, lgm.mu)
+ # assert_array_almost_equal(cva, lgm.va)
+ assert_array_equal(ogmm2.pw, pw)
+ assert_array_equal(ogmm2.pmu, pmu)
+ assert_array_equal(ogmm2.pva, pva)
+ assert_array_equal(agamma[0], agamma2[0])
+ #assert_array_almost_equal(ogmm2.cw, lgm.w)
+ #assert_array_almost_equal(ogmm2.cmu, lgm.mu)
+ #assert_array_almost_equal(ogmm2.cva, lgm.va)
+ # #+++++++++++++++
+ # # Draw the model
+ # #+++++++++++++++
+ # print "drawing..."
+ # import pylab as P
+ # P.subplot(2, 1, 1)
+
+ # if d == 1:
+ # # Real confidence ellipses
+ # h = gm.plot1d()
+ # [i.set_color('g') for i in h['pdf']]
+ # h['pdf'][0].set_label('true pdf')
+
+ # # Initial confidence ellipses as found by kmean
+ # h0 = gm0.plot1d()
+ # [i.set_color('k') for i in h0['pdf']]
+ # h0['pdf'][0].set_label('initial pdf')
+
+ # # Values found by EM
+ # hl = lgm.plot1d(fill = 1, level = 0.66)
+ # [i.set_color('r') for i in hl['pdf']]
+ # hl['pdf'][0].set_label('pdf found by EM')
+
+ # # Values found by Online EM
+ # ho = ogm.plot1d(fill = 1, level = 0.66)
+ # [i.set_color('b') for i in ho['pdf']]
+ # ho['pdf'][0].set_label('pdf found by online EM')
+
+ # P.legend(loc = 0)
+ # else:
+ # # Draw what is happening
+ # P.plot(data[:, 0], data[:, 1], '.', label = '_nolegend_')
+
+ # # Real confidence ellipses
+ # Xre, Yre = gm.conf_ellipses()
+ # P.plot(Xre[0], Yre[0], 'g', label = 'true confidence ellipsoides')
+ # for i in range(1,k):
+ # P.plot(Xre[i], Yre[i], 'g', label = '_nolegend_')
+
+ # # Initial confidence ellipses as found by kmean
+ # X0e, Y0e = gm0.conf_ellipses()
+ # P.plot(X0e[0], Y0e[0], 'k', label = 'initial confidence ellipsoides')
+ # for i in range(1,k):
+ # P.plot(X0e[i], Y0e[i], 'k', label = '_nolegend_')
+
+ # # Values found by EM
+ # Xe, Ye = lgm.conf_ellipses()
+ # P.plot(Xe[0], Ye[0], 'r', label = 'confidence ellipsoides found by EM')
+ # for i in range(1,k):
+ # P.plot(Xe[i], Ye[i], 'r', label = '_nolegend_')
+ # P.legend(loc = 0)
+
+ # # Values found by Online EM
+ # Xe, Ye = ogm.conf_ellipses()
+ # P.plot(Xe[0], Ye[0], 'm', label = 'confidence ellipsoides found by Online EM')
+ # for i in range(1,k):
+ # P.plot(Xe[i], Ye[i], 'm', label = '_nolegend_')
+ # P.legend(loc = 0)
+
+
+ # P.subplot(2, 1, 2)
+ # P.plot(like)
+ # P.title('log likelihood')
+
+ # P.show()
Added: trunk/Lib/sandbox/pyem/profile_data/profile_densities.py
===================================================================
--- trunk/Lib/sandbox/pyem/profile_data/profile_densities.py 2006-10-12 13:48:48 UTC (rev 2284)
+++ trunk/Lib/sandbox/pyem/profile_data/profile_densities.py 2006-10-12 13:48:58 UTC (rev 2285)
@@ -0,0 +1,54 @@
+import numpy as N
+from numpy.random import randn
+from pyem import densities as D
+from pyem import _c_densities as DC
+#import tables
+
+def bench(func, mode = 'diag'):
+ #===========================================
+ # Diag Gaussian of dimension 20
+ #===========================================
+ d = 30
+ n = 1e5
+ niter = 10
+
+ print "Compute %d times densities, %d dimension, %d frames" % (niter, d, n)
+ # Generate a model with k components, d dimensions
+ mu = randn(1, d)
+ if mode == 'diag':
+ va = abs(randn(1, d))
+ elif mode == 'full':
+ va = randn(d, d)
+ va = N.dot(va, va.transpose())
+
+ X = randn(n, d)
+ for i in range(niter):
+ Y = func(X, mu, va)
+
+def benchpy():
+ bench(D.gauss_den)
+
+def benchc():
+ bench(DC.gauss_den)
+
+def benchpyfull():
+ bench(D.gauss_den, 'full')
+
+def benchcfull():
+ bench(DC.gauss_den, 'full')
+
+if __name__ == "__main__":
+ import hotshot, hotshot.stats
+ profile_file = 'gdenpy.prof'
+ prof = hotshot.Profile(profile_file, lineevents=1)
+ prof.runcall(benchpy)
+ p = hotshot.stats.load(profile_file)
+ print p.sort_stats('cumulative').print_stats(20)
+ prof.close()
+
+ profile_file = 'gdenc.prof'
+ prof = hotshot.Profile(profile_file, lineevents=1)
+ prof.runcall(benchc)
+ p = hotshot.stats.load(profile_file)
+ print p.sort_stats('cumulative').print_stats(20)
+ prof.close()
Added: trunk/Lib/sandbox/pyem/profile_data/profile_gmm.py
===================================================================
--- trunk/Lib/sandbox/pyem/profile_data/profile_gmm.py 2006-10-12 13:48:48 UTC (rev 2284)
+++ trunk/Lib/sandbox/pyem/profile_data/profile_gmm.py 2006-10-12 13:48:58 UTC (rev 2285)
@@ -0,0 +1,70 @@
+import numpy as N
+from pyem import GM, GMM
+import copy
+
+from pyem._c_densities import gauss_den
+
+def bench1(mode = 'diag'):
+ #===========================================
+ # GMM of 20 comp, 20 dimension, 1e4 frames
+ #===========================================
+ d = 15
+ k = 30
+ nframes = 1e4
+ niter = 10
+ mode = 'diag'
+
+ print "============================================================="
+ print "(%d dim, %d components) GMM with %d iterations, for %d frames" \
+ % (d, k, niter, nframes)
+
+ #+++++++++++++++++++++++++++++++++++++++++++
+ # Create an artificial GMM model, samples it
+ #+++++++++++++++++++++++++++++++++++++++++++
+ print "Generating the mixture"
+ # Generate a model with k components, d dimensions
+ w, mu, va = GM.gen_param(d, k, mode, spread = 3)
+ # gm = GM(d, k, mode)
+ # gm.set_param(w, mu, va)
+ gm = GM.fromvalues(w, mu, va)
+
+ # Sample nframes frames from the model
+ data = gm.sample(nframes)
+
+ #++++++++++++++++++++++++
+ # Learn the model with EM
+ #++++++++++++++++++++++++
+
+ # Init the model
+ print "Init a model for learning, with kmean for initialization"
+ lgm = GM(d, k, mode)
+ gmm = GMM(lgm, 'kmean')
+
+ gmm.init(data)
+ # Keep the initialized model for drawing
+ gm0 = copy.copy(lgm)
+
+ # The actual EM, with likelihood computation
+ like = N.zeros(niter)
+
+ print "computing..."
+ for i in range(niter):
+ print "iteration %d" % i
+ g, tgd = gmm.sufficient_statistics(data)
+ like[i] = N.sum(N.log(N.sum(tgd, 1)))
+ gmm.update_em(data, g)
+
+if __name__ == "__main__":
+ import hotshot, hotshot.stats
+ profile_file = 'gmm.prof'
+ prof = hotshot.Profile(profile_file, lineevents=1)
+ prof.runcall(bench1)
+ p = hotshot.stats.load(profile_file)
+ print p.sort_stats('cumulative').print_stats(20)
+ prof.close()
+ # import profile
+ # profile.run('bench1()', 'gmmprof')
+ # import pstats
+ # p = pstats.Stats('gmmprof')
+ # print p.sort_stats('cumulative').print_stats(20)
+
Deleted: trunk/Lib/sandbox/pyem/pyem/__init__.py
===================================================================
--- trunk/Lib/sandbox/pyem/pyem/__init__.py 2006-10-12 13:48:48 UTC (rev 2284)
+++ trunk/Lib/sandbox/pyem/pyem/__init__.py 2006-10-12 13:48:58 UTC (rev 2285)
@@ -1,12 +0,0 @@
-#! /usr/bin/env python
-# Last Change: Tue Oct 03 06:00 PM 2006 J
-
-from info import __doc__
-
-from numpy.testing import NumpyTest
-test = NumpyTest().test
-
-version = '0.5.3'
-
-from gauss_mix import GmParamError, GM
-from gmm_em import GmmParamError, GMM, EM
Deleted: trunk/Lib/sandbox/pyem/pyem/_c_densities.py
===================================================================
--- trunk/Lib/sandbox/pyem/pyem/_c_densities.py 2006-10-12 13:48:48 UTC (rev 2284)
+++ trunk/Lib/sandbox/pyem/pyem/_c_densities.py 2006-10-12 13:48:58 UTC (rev 2285)
@@ -1,217 +0,0 @@
-#! /usr/bin/python
-#
-# Copyrighted David Cournapeau
-# Last Change: Tue Oct 03 05:00 PM 2006 J
-
-# This module uses a C implementation through ctypes, for diagonal cases
-# TODO:
-# - portable way to find/open the shared library
-# - full cov matrice
-
-import numpy as N
-import numpy.linalg as lin
-from numpy.random import randn
-from scipy.stats import chi2
-import densities as D
-
-import ctypes
-from ctypes import cdll, c_uint, c_int, c_double, POINTER
-from numpy.ctypeslib import ndpointer, load_library
-
-ctypes_major = int(ctypes.__version__.split('.')[0])
-if ctypes_major < 1:
- msg = "version of ctypes is %s, expected at least %s" \
- % (ctypes.__version__, '1.0.0')
- raise Exception(msg)
-
-# Requirements for diag gden
-_gden = load_library('c_gden.so', __file__)
-arg1 = ndpointer(dtype='<f8')
-arg2 = c_uint
-arg3 = c_uint
-arg4 = ndpointer(dtype='<f8')
-arg5 = ndpointer(dtype='<f8')
-arg6 = ndpointer(dtype='<f8')
-_gden.gden_diag.argtypes = [arg1, arg2, arg3, arg4, arg5, arg6]
-_gden.gden_diag.restype = c_int
-
-# Error classes
-class DenError(Exception):
- """Base class for exceptions in this module.
-
- Attributes:
- expression -- input expression in which the error occurred
- message -- explanation of the error"""
- def __init__(self, message):
- self.message = message
-
- def __str__(self):
- return self.message
-
-# The following function do all the fancy stuff to check that parameters
-# are Ok, and call the right implementation if args are OK.
-def gauss_den(x, mu, va, log = False):
- """ Compute multivariate Gaussian density at points x for
- mean mu and variance va.
-
- Vector are row vectors, except va which can be a matrix
- (row vector variance for diagonal variance)
-
- If log is True, than the log density is returned
- (useful for underflow ?)"""
- mu = N.atleast_2d(mu)
- va = N.atleast_2d(va)
- x = N.atleast_2d(x)
-
- #=======================#
- # Checking parameters #
- #=======================#
- if len(N.shape(mu)) != 2:
- raise DenError("mu is not rank 2")
-
- if len(N.shape(va)) != 2:
- raise DenError("va is not rank 2")
-
- if len(N.shape(x)) != 2:
- raise DenError("x is not rank 2")
-
- (n, d) = x.shape
- (dm0, dm1) = mu.shape
- (dv0, dv1) = va.shape
-
- # Check x and mu same dimension
- if dm0 != 1:
- msg = "mean must be a row vector!"
- raise DenError(msg)
- if dm1 != d:
- msg = "x and mu not same dim"
- raise DenError(msg)
- # Check va and mu same size
- if dv1 != d:
- msg = "mu and va not same dim"
- raise DenError(msg)
- if dv0 != 1 and dv0 != d:
- msg = "va not square"
- raise DenError(msg)
-
- #===============#
- # Computation #
- #===============#
- if d == 1:
- # scalar case
- return _scalar_gauss_den(x[:, 0], mu[0, 0], va[0, 0], log)
- elif dv0 == 1:
- # Diagonal matrix case
- return _diag_gauss_den(x, mu, va, log)
- elif dv1 == dv0:
- # full case
- return _full_gauss_den(x, mu, va, log)
- else:
- raise DenError("variance mode not recognized, this is a bug")
-
-# Those 3 functions do almost all the actual computation
-def _scalar_gauss_den(x, mu, va, log):
- """ This function is the actual implementation
- of gaussian pdf in scalar case. It assumes all args
- are conformant, so it should not be used directly
-
- ** Expect centered data (ie with mean removed) **
-
- Call gauss_den instead"""
- d = mu.size
- inva = 1/va
- fac = (2*N.pi) ** (-d/2.0) * N.sqrt(inva)
- y = ((x-mu) ** 2) * -0.5 * inva
- if not log:
- y = fac * N.exp(y)
- else:
- y = y + log(fac)
-
- return y
-
-def _diag_gauss_den(x, mu, va, log):
- """ This function is the actual implementation
- of gaussian pdf in scalar case. It assumes all args
- are conformant, so it should not be used directly
-
- ** Expect centered data (ie with mean removed) **
-
- Call gauss_den instead"""
- # Diagonal matrix case
- d = mu.size
- n = x.shape[0]
- if not log:
- y = N.zeros(n)
- vat = va.copy()
- # _gden.gden_diag(N.require(x, requirements = 'C'), n, d,
- # N.require(mu, requirements = 'C'),
- # N.require(inva, requirements = 'C'),
- # N.require(y, requirements = 'C'))
- x = N.require(x, requirements = 'C')
- mu = N.require(mu, requirements = 'C')
- vat = N.require(vat, requirements = 'C')
- y = N.require(y, requirements = 'C')
- _gden.gden_diag(x, n, d, mu, vat, y)
- return y
- # _gden.gden_diag.restype = c_int
- # _gden.gden_diag.argtypes = [POINTER(c_double), c_uint, c_uint,
- # POINTER(c_double), POINTER(c_double), POINTER(c_double)]
-
- # y = N.zeros(n)
- # inva= 1/va
- # _gden.gden_diag(x.ctypes.data_as(POINTER(c_double)),
- # n, d,
- # mu.ctypes.data_as(POINTER(c_double)),
- # inva.ctypes.data_as(POINTER(c_double)),
- # y.ctypes.data_as(POINTER(c_double)))
- else:
- y = _scalar_gauss_den(x[:,0], mu[0,0], va[0,0], log)
- for i in range(1, d):
- y += _scalar_gauss_den(x[:,i], mu[0,i], va[0,i], log)
- return y
-
-def _full_gauss_den(x, mu, va, log):
- """ This function is the actual implementation
- of gaussian pdf in full matrix case.
-
- It assumes all args are conformant, so it should
- not be used directly Call gauss_den instead
-
- ** Expect centered data (ie with mean removed) **
-
- Does not check if va is definite positive (on inversible
- for that matter), so the inverse computation and/or determinant
- would throw an exception."""
- d = mu.size
- inva = lin.inv(va)
- fac = 1 / N.sqrt( (2*N.pi) ** d * N.fabs(lin.det(va)))
-
- # we are using a trick with sum to "emulate"
- # the matrix multiplication inva * x without any explicit loop
- y = N.dot((x-mu), inva)
- y = -0.5 * N.sum(y * (x-mu), 1)
-
- if not log:
- y = fac * N.exp(y)
- else:
- y = y + N.log(fac)
-
- return y
-
-if __name__ == "__main__":
- #=========================================
- # Test accuracy between pure and C python
- #=========================================
- mu = N.array([2.0, 3])
- va = N.array([5.0, 3])
-
- # Generate a multivariate gaussian of mean mu and covariance va
- nframes = 1e4
- X = randn(nframes, 2)
- Yc = N.dot(N.diag(N.sqrt(va)), X.transpose())
- Yc = Yc.transpose() + mu
-
- Y = D.gauss_den(Yc, mu, va)
- Yt = gauss_den(Yc, mu, va)
-
- print "Diff is " + str(N.sqrt(N.sum((Y-Yt) ** 2))/nframes/2)
Deleted: trunk/Lib/sandbox/pyem/pyem/densities.py
===================================================================
--- trunk/Lib/sandbox/pyem/pyem/densities.py 2006-10-12 13:48:48 UTC (rev 2284)
+++ trunk/Lib/sandbox/pyem/pyem/densities.py 2006-10-12 13:48:58 UTC (rev 2285)
@@ -1,257 +0,0 @@
-#! /usr/bin/python
-#
-# Copyrighted David Cournapeau
-# Last Change: Thu Oct 05 07:00 PM 2006 J
-
-import numpy as N
-import numpy.linalg as lin
-from numpy.random import randn
-from scipy.stats import chi2
-
-# Error classes
-class DenError(Exception):
- """Base class for exceptions in this module.
-
- Attributes:
- expression -- input expression in which the error occurred
- message -- explanation of the error"""
- def __init__(self, message):
- self.message = message
-
- def __str__(self):
- return self.message
-
-# The following function do all the fancy stuff to check that parameters
-# are Ok, and call the right implementation if args are OK.
-def gauss_den(x, mu, va, log = False):
- """ Compute multivariate Gaussian density at points x for
- mean mu and variance va.
-
- Vector are row vectors, except va which can be a matrix
- (row vector variance for diagonal variance)
-
- If log is True, than the log density is returned
- (useful for underflow ?)"""
- mu = N.atleast_2d(mu)
- va = N.atleast_2d(va)
- x = N.atleast_2d(x)
-
- #=======================#
- # Checking parameters #
- #=======================#
- if len(N.shape(mu)) != 2:
- raise DenError("mu is not rank 2")
-
- if len(N.shape(va)) != 2:
- raise DenError("va is not rank 2")
-
- if len(N.shape(x)) != 2:
- raise DenError("x is not rank 2")
-
- (n, d) = x.shape
- (dm0, dm1) = mu.shape
- (dv0, dv1) = va.shape
-
- # Check x and mu same dimension
- if dm0 != 1:
- msg = "mean must be a row vector!"
- raise DenError(msg)
- if dm1 != d:
- msg = "x and mu not same dim"
- raise DenError(msg)
- # Check va and mu same size
- if dv1 != d:
- msg = "mu and va not same dim"
- raise DenError(msg)
- if dv0 != 1 and dv0 != d:
- msg = "va not square"
- raise DenError(msg)
-
- #===============#
- # Computation #
- #===============#
- if d == 1:
- # scalar case
- return _scalar_gauss_den(x[:, 0], mu[0, 0], va[0, 0], log)
- elif dv0 == 1:
- # Diagonal matrix case
- return _diag_gauss_den(x, mu, va, log)
- elif dv1 == dv0:
- # full case
- return _full_gauss_den(x, mu, va, log)
- else:
- raise DenError("variance mode not recognized, this is a bug")
-
-# Those 3 functions do almost all the actual computation
-def _scalar_gauss_den(x, mu, va, log):
- """ This function is the actual implementation
- of gaussian pdf in scalar case. It assumes all args
- are conformant, so it should not be used directly
-
- Call gauss_den instead"""
- d = mu.size
- inva = 1/va
- fac = (2*N.pi) ** (-d/2.0) * N.sqrt(inva)
- y = ((x-mu) ** 2) * -0.5 * inva
- if not log:
- y = fac * N.exp(y)
- else:
- y = y + log(fac)
-
- return y
-
-#from ctypes import cdll, c_uint, c_int, c_double, POINTER
-#_gden = cdll.LoadLibrary('src/libgden.so')
-#_gden.gden_diag.restype = c_int
-#_gden.gden_diag.argtypes = [POINTER(c_double), c_uint, c_uint,
-# POINTER(c_double), POINTER(c_double), POINTER(c_double)]
-
-def _diag_gauss_den(x, mu, va, log):
- """ This function is the actual implementation
- of gaussian pdf in scalar case. It assumes all args
- are conformant, so it should not be used directly
-
- Call gauss_den instead"""
- # Diagonal matrix case
- d = mu.size
- n = x.shape[0]
- if not log:
- inva = 1/va[0,0]
- fac = (2*N.pi) ** (-d/2.0) * N.sqrt(inva)
- y = (x[:,0] - mu[0,0]) ** 2 * inva * -0.5
- for i in range(1, d):
- inva = 1/va[0,i]
- fac *= N.sqrt(inva)
- y += (x[:,i] - mu[0,i]) ** 2 * inva * -0.5
- y = fac * N.exp(y)
- else:
- y = _scalar_gauss_den(x[:,0], mu[0,0], va[0,0], log)
- for i in range(1, d):
- y += _scalar_gauss_den(x[:,i], mu[0,i], va[0,i], log)
- return y
-
-def _full_gauss_den(x, mu, va, log):
- """ This function is the actual implementation
- of gaussian pdf in full matrix case.
-
- It assumes all args are conformant, so it should
- not be used directly Call gauss_den instead
-
- Does not check if va is definite positive (on inversible
- for that matter), so the inverse computation and/or determinant
- would throw an exception."""
- d = mu.size
- inva = lin.inv(va)
- fac = 1 / N.sqrt( (2*N.pi) ** d * N.fabs(lin.det(va)))
-
- # # Slow version
- # n = N.size(x, 0)
- # y = N.zeros(n)
- # for i in range(n):
- # y[i] = N.dot(x[i,:],
- # N.dot(inva, N.transpose(x[i,:])))
- # y *= -0.5
-
- # we are using a trick with sum to "emulate"
- # the matrix multiplication inva * x without any explicit loop
- y = N.dot((x-mu), inva)
- y = -0.5 * N.sum(y * (x-mu), 1)
-
- if not log:
- y = fac * N.exp(y)
- else:
- y = y + N.log(fac)
-
- return y
-
-# To plot a confidence ellipse from multi-variate gaussian pdf
-def gauss_ell(mu, va, dim = [0, 1], npoints = 100, level = 0.39):
- """ Given a mean and covariance for multi-variate
- gaussian, returns npoints points for the ellipse
- of confidence given by level (all points will be inside
- the ellipsoides with a probability equal to level)
-
- Returns the coordinate x and y of the ellipse"""
-
- mu = N.atleast_1d(mu)
- va = N.atleast_1d(va)
- c = N.array(dim)
-
- if mu.size == va.size:
- mode = 'diag'
- else:
- if va.ndim == 2:
- if va.shape[0] == va.shape[1]:
- mode = 'full'
- else:
- raise DenError("variance not square")
- else:
- raise DenError("mean and variance are not dim conformant")
-
- chi22d = chi2(2)
- mahal = N.sqrt(chi22d.ppf(level))
-
- # Generates a circle of npoints
- theta = N.linspace(0, 2 * N.pi, npoints)
- circle = mahal * N.array([N.cos(theta), N.sin(theta)])
-
- # Get the dimension which we are interested in:
- mu = mu[dim]
- if mode == 'diag':
- va = va[dim]
- elps = N.outer(mu, N.ones(npoints))
- elps += N.dot(N.diag(N.sqrt(va)), circle)
- elif mode == 'full':
- va = va[c,:][:,c]
- # Method: compute the cholesky decomp of each cov matrix, that is
- # compute cova such as va = cova * cova'
- # WARN: scipy is different than matlab here, as scipy computes a lower
- # triangular cholesky decomp:
- # - va = cova * cova' (scipy)
- # - va = cova' * cova (matlab)
- # So take care when comparing results with matlab !
- cova = lin.cholesky(va)
- elps = N.outer(mu, N.ones(npoints))
- elps += N.dot(cova, circle)
- else:
- raise DenParam("var mode not recognized")
-
- return elps[0, :], elps[1, :]
-
-if __name__ == "__main__":
- import pylab
-
- #=========================================
- # Test plotting a simple diag 2d variance:
- #=========================================
- va = N.array([5, 3])
- mu = N.array([2, 3])
-
- # Generate a multivariate gaussian of mean mu and covariance va
- X = randn(1e3, 2)
- Yc = N.dot(N.diag(N.sqrt(va)), X.transpose())
- Yc = Yc.transpose() + mu
-
- # Plotting
- Xe, Ye = gauss_ell(mu, va, npoints = 100)
- pylab.figure()
- pylab.plot(Yc[:, 0], Yc[:, 1], '.')
- pylab.plot(Xe, Ye, 'r')
-
- #=========================================
- # Test plotting a simple full 2d variance:
- #=========================================
- va = N.array([[0.2, 0.1],[0.1, 0.5]])
- mu = N.array([0, 3])
-
- # Generate a multivariate gaussian of mean mu and covariance va
- X = randn(1e3, 2)
- Yc = N.dot(lin.cholesky(va), X.transpose())
- Yc = Yc.transpose() + mu
-
- # Plotting
- Xe, Ye = gauss_ell(mu, va, npoints = 100, level=0.95)
- pylab.figure()
- pylab.plot(Yc[:, 0], Yc[:, 1], '.')
- pylab.plot(Xe, Ye, 'r')
- pylab.show()
Deleted: trunk/Lib/sandbox/pyem/pyem/gauss_mix.py
===================================================================
--- trunk/Lib/sandbox/pyem/pyem/gauss_mix.py 2006-10-12 13:48:48 UTC (rev 2284)
+++ trunk/Lib/sandbox/pyem/pyem/gauss_mix.py 2006-10-12 13:48:58 UTC (rev 2285)
@@ -1,438 +0,0 @@
-# /usr/bin/python
-# Last Change: Tue Oct 03 06:00 PM 2006 J
-
-# Module to implement GaussianMixture class.
-
-import numpy as N
-from numpy.random import randn, rand
-import numpy.linalg as lin
-import densities
-
-MAX_DEV = 1e-10
-MAX_COND = 1e10
-
-# Right now, two main usages of a Gaussian Model are possible
-# - init a Gaussian Model with meta-parameters, and trains it
-# - set-up a Gaussian Model to sample it, draw ellipsoides
-# of confidences. In this case, we would like to init it with
-# known values of parameters. This can be done with the class method
-# fromval
-#
-# For now, we have to init with meta-parameters, and set
-# the parameters afterward. There should be a better way ?
-
-# TODO:
-# - change bounds methods of GM class instanciations so that it cannot
-# be used as long as w, mu and va are not set
-# - We have to use scipy now for chisquare pdf, so there may be other
-# methods to be used, ie for implementing random index.
-# - there is no check on internal state of the GM, that is does w, mu and va values
-# make sense (eg singular values)
-# - plot1d is still very rhough. There should be a sensible way to
-# modify the result plot (maybe returns a dic with global pdf, component pdf and
-# fill matplotlib handles). Should be coherent with plot
-class GmParamError:
- """Exception raised for errors in gmm params
-
- Attributes:
- expression -- input expression in which the error occurred
- message -- explanation of the error
- """
- def __init__(self, message):
- self.message = message
-
- def __str__(self):
- return self.message
-
-class GM:
- """Gaussian Mixture class. This is a simple container class
- to hold Gaussian Mixture parameters (weights, mean, etc...).
- It can also draw itself (confidence ellipses) and samples itself.
-
- Is initiated by giving dimension, number of components and
- covariance mode"""
-
- # I am not sure it is useful to have a spherical mode...
- _cov_mod = ['diag', 'full']
-
- def __init__(self, d, k, mode = 'diag'):
- """Init a Gaussian model of k components, each component being a
- d multi-variate Gaussian, with covariance matrix of style mode"""
- if mode not in self._cov_mod:
- raise GmParamError("mode %s not recognized" + str(mode))
-
- self.d = d
- self.k = k
- self.mode = mode
-
- # Init to 0 all parameters, with the right dimensions.
- # Not sure this is useful in python from an efficiency POV ?
- self.w = N.zeros(k)
- self.mu = N.zeros((k, d))
- if mode == 'diag':
- self.va = N.zeros((k, d))
- elif mode == 'full':
- self.va = N.zeros((k * d, d))
-
- self.is_valid = False
-
- def set_param(self, weights, mu, sigma):
- """Set parameters of the model. Args should
- be conformant with metparameters d and k given during
- initialisation"""
- k, d, mode = check_gmm_param(weights, mu, sigma)
- if not k == self.k:
- raise GmParamError("Number of given components is %d, expected %d"
- % (shape(k), shape(self.k)))
- if not d == self.d:
- raise GmParamError("Dimension of the given model is %d, expected %d"
- % (shape(d), shape(self.d)))
- if not mode == self.mode and not d == 1:
- raise GmParamError("Given covariance mode is %s, expected %s"
- % (mode, self.mode))
- self.w = weights
- self.mu = mu
- self.va = sigma
-
- self.is_valid = True
-
- @classmethod
- def fromvalues(cls, weights, mu, sigma):
- """This class method can be used to create a GM model
- directly from its parameters weights, mean and variance
-
- w, mu, va = GM.gen_param(d, k)
- gm = GM(d, k)
- gm.set_param(w, mu, va)
-
- and
-
- w, mu, va = GM.gen_param(d, k)
- gm = GM.fromvalue(w, mu, va)
-
- Are equivalent """
- k, d, mode = check_gmm_param(weights, mu, sigma)
- res = cls(d, k, mode)
- res.set_param(weights, mu, sigma)
- return res
-
- def sample(self, nframes):
- """ Sample nframes frames from the model """
- if not self.is_valid:
- raise GmParamError("""Parameters of the model has not been
- set yet, please set them using self.set_param()""")
-
- # State index (ie hidden var)
- S = gen_rand_index(self.w, nframes)
- # standard gaussian
- X = randn(nframes, self.d)
-
- if self.mode == 'diag':
- X = self.mu[S, :] + X * N.sqrt(self.va[S,:])
- elif self.mode == 'full':
- # Faster:
- cho = N.zeros((self.k, self.va.shape[1], self.va.shape[1]))
- for i in range(self.k):
- # Using cholesky looks more stable than sqrtm; sqrtm is not
- # available in numpy anyway, only in scipy...
- cho[i] = lin.cholesky(self.va[i*self.d:i*self.d+self.d,:])
-
- for s in range(self.k):
- tmpind = N.where(S == s)[0]
- X[tmpind] = N.dot(X[tmpind], cho[s].transpose()) + self.mu[s]
- else:
- raise GmParamError('cov matrix mode not recognized, this is a bug !')
-
- return X
-
- def conf_ellipses(self, *args, **kargs):
- """Returns a list of confidence ellipsoids describing the Gmm
- defined by mu and va. Check densities.gauss_ell for details
-
- Returns:
- -Xe: a list of x coordinates for the ellipses (Xe[i] is
- the array containing x coordinates of the ith Gaussian)
- -Ye: a list of y coordinates for the ellipses
-
- Example:
- Suppose we have w, mu and va as parameters for a mixture, then:
-
- gm = GM(d, k)
- gm.set_param(w, mu, va)
- X = gm.sample(1000)
- Xe, Ye = gm.conf_ellipsoids()
- pylab.plot(X[:,0], X[:, 1], '.')
- for k in len(w):
- pylab.plot(Xe[k], Ye[k], 'r')
-
- Will plot samples X draw from the mixture model, and
- plot the ellipses of equi-probability from the mean with
- fixed level of confidence 0.39. """
- if not self.is_valid:
- raise GmParamError("""Parameters of the model has not been
- set yet, please set them using self.set_param()""")
-
- Xe = []
- Ye = []
- if self.mode == 'diag':
- for i in range(self.k):
- xe, ye = densities.gauss_ell(self.mu[i,:], self.va[i,:],
- *args, **kargs)
- Xe.append(xe)
- Ye.append(ye)
- elif self.mode == 'full':
- for i in range(self.k):
- xe, ye = densities.gauss_ell(self.mu[i,:],
- self.va[i*self.d:i*self.d+self.d,:],
- *args, **kargs)
- Xe.append(xe)
- Ye.append(ye)
-
- return Xe, Ye
-
- def check_state(self):
- """
- """
- if not self.is_valid:
- raise GmParamError("""Parameters of the model has not been
- set yet, please set them using self.set_param()""")
-
- if self.mode == 'full':
- raise NotImplementedError, "not implemented for full mode yet"
-
- # # How to check w: if one component is negligeable, what shall
- # # we do ?
- # M = N.max(self.w)
- # m = N.min(self.w)
-
- # maxc = m / M
-
- # Check condition number for cov matrix
- cond = N.zeros(self.k)
- ava = N.absolute(self.va)
- for c in range(self.k):
- cond[c] = N.amax(ava[c,:]) / N.amin(ava[c,:])
-
- print cond
-
- def gen_param(self, d, nc, varmode = 'diag', spread = 1):
- """Generate valid parameters for a gaussian mixture model.
- d is the dimension, nc the number of components, and varmode
- the mode for cov matrices.
-
- This is a class method.
-
- Returns: w, mu, va
- """
- w = abs(randn(nc))
- w = w / sum(w, 0)
-
- mu = spread * randn(nc, d)
- if varmode == 'diag':
- va = abs(randn(nc, d))
- elif varmode == 'full':
- va = randn(nc * d, d)
- for k in range(nc):
- va[k*d:k*d+d] = N.dot( va[k*d:k*d+d],
- va[k*d:k*d+d].transpose())
- else:
- raise GmParamError('cov matrix mode not recognized')
-
- return w, mu, va
-
- gen_param = classmethod(gen_param)
-
- def plot(self, *args, **kargs):
- """Plot the ellipsoides directly for the model
-
- Returns a list of lines, so that their style can be modified. By default,
- the style is red color, and nolegend for all of them.
-
- Does not work for 1d"""
- if not self.is_valid:
- raise GmParamError("""Parameters of the model has not been
- set yet, please set them using self.set_param()""")
-
- k = self.k
- Xe, Ye = self.conf_ellipses(*args, **kargs)
- try:
- import pylab as P
- return [P.plot(Xe[i], Ye[i], 'r', label='_nolegend_')[0] for i in range(k)]
- #for i in range(k):
- # P.plot(Xe[i], Ye[i], 'r')
- except ImportError:
- raise GmParamError("matplotlib not found, cannot plot...")
-
- def plot1d(self, level = 0.5, fill = 0, gpdf = 0):
- """This function plots the pdfs of each component of the model.
- If gpdf is 1, also plots the global pdf. If fill is 1, fill confidence
- areas using level argument as a level value
-
- Returns a dictionary h of plot handles so that their properties can
- be modified (eg color, label, etc...):
- - h['pdf'] is a list of lines, one line per component pdf
- - h['gpdf'] is the line for the global pdf
- - h['conf'] is a list of filling area
- """
- # This is not optimized at all, may be slow. Should not be
- # difficult to make much faster, but it is late, and I am lazy
- if not self.d == 1:
- raise GmParamError("the model is not one dimensional model")
- from scipy.stats import norm
- nrm = norm(0, 1)
- pval = N.sqrt(self.va[:,0]) * nrm.ppf((1+level)/2)
-
- # Compute reasonable min/max for the normal pdf
- mc = 3
- std = N.sqrt(self.va[:,0])
- m = N.amin(self.mu[:, 0] - mc * std)
- M = N.amax(self.mu[:, 0] + mc * std)
-
- np = 500
- x = N.linspace(m, M, np)
- Yf = N.zeros(np)
- Yt = N.zeros(np)
-
- # Prepare the dic of plot handles to return
- ks = ['pdf', 'conf', 'gpdf']
- hp = dict((i,[]) for i in ks)
- try:
- import pylab as P
- for c in range(self.k):
- y = self.w[c]/(N.sqrt(2*N.pi) * std[c]) * \
- N.exp(-(x-self.mu[c][0])**2/(2*std[c]**2))
- Yt += y
- h = P.plot(x, y, 'r', label ='_nolegend_')
- hp['pdf'].extend(h)
- if fill:
- #P.axvspan(-pval[c] + self.mu[c][0], pval[c] + self.mu[c][0],
- # facecolor = 'b', alpha = 0.2)
- id1 = -pval[c] + self.mu[c]
- id2 = pval[c] + self.mu[c]
- xc = x[:, N.where(x>id1)[0]]
- xc = xc[:, N.where(xc<id2)[0]]
- Yf = self.w[c]/(N.sqrt(2*N.pi) * std[c]) * \
- N.exp(-(xc-self.mu[c][0])**2/(2*std[c]**2))
- xc = N.concatenate(([xc[0]], xc, [xc[-1]]))
- Yf = N.concatenate(([0], Yf, [0]))
- h = P.fill(xc, Yf,
- facecolor = 'b', alpha = 0.1, label='_nolegend_')
- hp['conf'].extend(h)
- #P.fill([xc[0], xc[0], xc[-1], xc[-1]],
- # [0, Yf[0], Yf[-1], 0], facecolor = 'b', alpha = 0.2)
- if gpdf:
- h = P.plot(x, Yt, 'r:', label='_nolegend_')
- hp['gpdf'] = h
- return hp
- except ImportError:
- raise GmParamError("matplotlib not found, cannot plot...")
-
-# Function to generate a random index: this is kept outside any class,
-# as the function can be useful for other
-def gen_rand_index(p, n):
- """Generate a N samples vector containing random index between 1
- and length(p), each index i with probability p(i)"""
- # TODO Check args here
-
- # TODO: check each value of inverse distribution is
- # different
- invcdf = N.cumsum(p)
- uni = rand(n)
- index = N.zeros(n, dtype=int)
-
- # This one should be a bit faster
- for k in range(len(p)-1, 0, -1):
- blop = N.where(N.logical_and(invcdf[k-1] <= uni,
- uni < invcdf[k]))
- index[blop] = k
-
- return index
-
-def check_gmm_param(w, mu, va):
- """Check that w, mu and va are valid parameters for
- a mixture of gaussian: w should sum to 1, there should
- be the same number of component in each param, the variances
- should be positive definite, etc...
-
- Params:
- w = vector or list of weigths of the mixture (K elements)
- mu = matrix: K * d
- va = list of variances (vector K * d or square matrices Kd * d)
-
- returns:
- K = number of components
- d = dimension
- mode = 'diag' if diagonal covariance, 'full' of full matrices
- """
-
- # Check that w is valid
- if N.fabs(N.sum(w, 0) - 1) > MAX_DEV:
- raise GmParamError('weight does not sum to 1')
-
- if not len(w.shape) == 1:
- raise GmParamError('weight is not a vector')
-
- # Check that mean and va have the same number of components
- K = len(w)
-
- if N.ndim(mu) < 2:
- msg = "mu should be a K,d matrix, and a row vector if only 1 comp"
- raise GmParamError(msg)
- if N.ndim(va) < 2:
- msg = """va should be a K,d / K *d, d matrix, and a row vector if
- only 1 diag comp"""
- raise GmParamError(msg)
-
- (Km, d) = mu.shape
- (Ka, da) = va.shape
-
- if not K == Km:
- msg = "not same number of component in mean and weights"
- raise GmParamError(msg)
-
- if not d == da:
- msg = "not same number of dimensions in mean and variances"
- raise GmParamError(msg)
-
- if Km == Ka:
- mode = 'diag'
- else:
- mode = 'full'
- if not Ka == Km*d:
- msg = "not same number of dimensions in mean and variances"
- raise GmParamError(msg)
-
- return K, d, mode
-
-if __name__ == '__main__':
- # Meta parameters:
- # - k = number of components
- # - d = dimension
- # - mode : mode of covariance matrices
- d = 5
- k = 4
-
- # Now, drawing a model
- mode = 'full'
- nframes = 1e3
-
- # Build a model with random parameters
- w, mu, va = GM.gen_param(d, k, mode, spread = 3)
- gm = GM.fromvalues(w, mu, va)
-
- # Sample nframes frames from the model
- X = gm.sample(nframes)
-
- # Plot the data
- import pylab as P
- P.plot(X[:, 0], X[:, 1], '.', label = '_nolegend_')
-
- # Real confidence ellipses with confidence level
- level = 0.50
- h = gm.plot(level=level)
-
- # set the first ellipse label, which will appear in the legend
- h[0].set_label('confidence ell at level ' + str(level))
-
- P.legend(loc = 0)
- P.show()
Deleted: trunk/Lib/sandbox/pyem/pyem/gmm_em.py
===================================================================
--- trunk/Lib/sandbox/pyem/pyem/gmm_em.py 2006-10-12 13:48:48 UTC (rev 2284)
+++ trunk/Lib/sandbox/pyem/pyem/gmm_em.py 2006-10-12 13:48:58 UTC (rev 2285)
@@ -1,394 +0,0 @@
-# /usr/bin/python
-# Last Change: Fri Oct 06 05:00 PM 2006 J
-
-# TODO:
-# - which methods to avoid va shrinking to 0 ? There are several options,
-# not sure which ones are appropriates
-# - improve EM trainer
-# - online EM
-
-import numpy as N
-import numpy.linalg as lin
-from numpy.random import randn
-#import _c_densities as densities
-import densities
-from kmean import kmean
-from gauss_mix import GM
-
-# Error classes
-class GmmError(Exception):
- """Base class for exceptions in this module."""
- pass
-
-class GmmParamError:
- """Exception raised for errors in gmm params
-
- Attributes:
- expression -- input expression in which the error occurred
- message -- explanation of the error
- """
- def __init__(self, message):
- self.message = message
-
- def __str__(self):
- return self.message
-
-# Not sure yet about how to design different mixture models. Most of the code
-# is different # (pdf, update part of EM, etc...) and I am not sure it makes
-# sense to use inheritance for # interface specification in python, since its
-# dynamic type systeme.
-
-# Anyway, a mixture class should encapsulates all details concerning a mixture model:
-# - internal parameters for the pdfs
-# - can compute sufficient statistics for EM
-# - can sample a model
-# - can generate random valid parameters for a new pdf (using class method)
-class MixtureModel:
- pass
-
-class ExpMixtureModel(MixtureModel):
- """Class to model mixture of exponential pdf (eg Gaussian, exponential, Laplace,
- etc..). This is a special case because some parts of EM are common for those
- models..."""
- pass
-
-class GMM(ExpMixtureModel):
- """ A class to model a Gaussian Mixture Model (GMM). An instance of
- this class is created by giving weights, mean and variances in the ctor.
- An instanciated object can be sampled, trained by EM.
-
- The class method gen_model can be used without instanciation."""
-
- def init_kmean(self, data, niter = 5):
- """ Init the model with kmean."""
- k = self.gm.k
- d = self.gm.d
- init = data[0:k, :]
-
- (code, label) = kmean(data, init, niter)
-
- w = N.ones(k) / k
- mu = code.copy()
- if self.gm.mode == 'diag':
- va = N.zeros((k, d))
- for i in range(k):
- for j in range(d):
- va[i,j] = N.cov(data[N.where(label==i), j], rowvar = 0)
- elif self.gm.mode == 'full':
- va = N.zeros((k*d, d))
- for i in range(k):
- va[i*d:i*d+d,:] = \
- N.cov(data[N.where(label==i)], rowvar = 0)
- else:
- raise GmmParamError("mode " + str(mode) + " not recognized")
-
- self.gm.set_param(w, mu, va)
-
- def init_random(self, data):
- """ Init the model at random."""
- k = self.gm.k
- d = self.gm.d
- if mode == 'diag':
- w = N.ones(k) / k
- mu = randn(k, d)
- va = N.fabs(randn(k, d))
- else:
- raise GmmParamError("""init_random not implemented for
- mode %s yet""", mode)
-
- self.gm.set_param(w, mu, va)
-
- # TODO:
- # - format of parameters ? For variances, list of variances matrix,
- # keep the current format, have 3d matrices ?
- # - To handle the different modes, we could do something "fancy" such as
- # replacing methods, to avoid checking cases everywhere and unconsistency.
- def __init__(self, gm, init = 'kmean'):
- """ Initialize a GMM with weight w, mean mu and variances va, and initialization
- method for training init (kmean by default)"""
- self.gm = gm
-
- # Possible init methods
- init_methods = {'kmean': self.init_kmean, 'random' : self.init_random}
-
- if init not in init_methods:
- raise GmmParamError('init method %s not recognized' + str(init))
-
- self.init = init_methods[init]
-
- def sufficient_statistics(self, data):
- """ Return normalized and non-normalized sufficient statistics
- from the model.
-
- Computes the latent variable distribution (a
- posteriori probability) knowing the explicit data
- for the Gaussian model (w, mu, var): gamma(t, i) =
- P[state = i | observation = data(t); w, mu, va]
-
- This is basically the E step of EM for GMM."""
- n = data.shape[0]
-
- # compute the gaussian pdf
- tgd = multiple_gauss_den(data, self.gm.mu, self.gm.va)
- # multiply by the weight
- tgd *= self.gm.w
- # Normalize to get a pdf
- gd = tgd / N.sum(tgd, axis=1)[:, N.newaxis]
-
- return gd, tgd
-
- def update_em(self, data, gamma):
- """Computes update of the Gaussian Mixture Model (M step)
- from the a posteriori pdf, computed by gmm_posterior
- (E step).
- """
- k = self.gm.k
- d = self.gm.d
- n = data.shape[0]
- invn = 1.0/n
- mGamma = N.sum(gamma, axis = 0)
-
- if self.gm.mode == 'diag':
- mu = N.zeros((k, d))
- va = N.zeros((k, d))
- gamma = gamma.T
- for c in range(k):
- x = N.dot(gamma[c:c+1,:], data)[0,:]
- xx = N.dot(gamma[c:c+1,:], data ** 2)[0,:]
-
- mu[c,:] = x / mGamma[c]
- va[c,:] = xx / mGamma[c] - mu[c,:] ** 2
- w = invn * mGamma
-
- elif self.gm.mode == 'full':
- # In full mode, this is the bottleneck: the triple loop
- # kills performances. This is pretty straightforward
- # algebra, so computing it in C should not be too difficult
- mu = N.zeros((k, d))
- va = N.zeros((k*d, d))
-
- gamma = gamma.transpose()
- for c in range(k):
- #x = N.sum(N.outer(gamma[:, c],
- # N.ones((1, d))) * data, axis = 0)
- x = N.dot(gamma[c:c+1,:], data)[0,:]
- xx = N.zeros((d, d))
-
- # This should be much faster than recursing on n...
- for i in range(d):
- for j in range(d):
- xx[i,j] = N.sum(data[:,i] * data[:,j] * gamma[c,:], axis = 0)
-
- mu[c,:] = x / mGamma[c]
- va[c*d:c*d+d,:] = xx / mGamma[c] - \
- N.outer(mu[c,:], mu[c,:])
- w = invn * mGamma
- else:
- raise GmmParamError("varmode not recognized")
-
- self.gm.set_param(w, mu, va)
-
-class EM:
- """An EM trainer. An EM trainer
- trains from data, with a model
-
- Not really useful yet"""
- def __init__(self):
- pass
-
- def train(self, data, model, maxiter = 10, thresh = 1e-5):
- """
- Train a model using data, and stops when the likelihood fails
- behind a threshold, or when the number of iterations > niter,
- whichever comes first
-
- Args:
- - data: contains the observed features, one row is one frame, ie one
- observation of dimension d
- - model: object of class Mixture
- - maxiter: maximum number of iterations
-
- The model is trained, and its parameters updated accordingly.
-
- Returns:
- likelihood (one value per iteration).
- """
-
- # Initialize the data (may do nothing depending on the model)
- model.init(data)
-
- # Likelihood is kept
- like = N.zeros(maxiter)
-
- # Em computation, with computation of the likelihood
- g, tgd = model.sufficient_statistics(data)
- like[0] = N.sum(N.log(N.sum(tgd, 1)), axis = 0)
- model.update_em(data, g)
- for i in range(1, maxiter):
- g, tgd = model.sufficient_statistics(data)
- like[i] = N.sum(N.log(N.sum(tgd, 1)), axis = 0)
- model.update_em(data, g)
- if has_em_converged(like[i], like[i-1], thresh):
- return like[0:i]
-
- return like
-
-# Misc functions
-def has_em_converged(like, plike, thresh):
- """ given likelihood of current iteration like and previous
- iteration plike, returns true is converged: based on comparison
- of the slope of the likehood with thresh"""
- diff = N.abs(like - plike)
- avg = 0.5 * (N.abs(like) + N.abs(plike))
- if diff / avg < thresh:
- return True
- else:
- return False
-
-def multiple_gauss_den(data, mu, va):
- """Helper function to generate several Gaussian
- pdf (different parameters) from the same data"""
- mu = N.atleast_2d(mu)
- va = N.atleast_2d(va)
-
- K = mu.shape[0]
- n = data.shape[0]
- d = data.shape[1]
-
- y = N.zeros((K, n))
- if mu.size == va.size:
- for i in range(K):
- y[i] = densities.gauss_den(data, mu[i, :], va[i, :])
- return y.T
- else:
- for i in range(K):
- y[:, i] = densities.gauss_den(data, mu[i, :],
- va[d*i:d*i+d, :])
-
- return y
-
-if __name__ == "__main__":
- import copy
- #=============================
- # Simple GMM with 5 components
- #=============================
-
- #+++++++++++++++++++++++++++++
- # Meta parameters of the model
- # - k: Number of components
- # - d: dimension of each Gaussian
- # - mode: Mode of covariance matrix: full or diag
- # - nframes: number of frames (frame = one data point = one
- # row of d elements
- k = 2
- d = 1
- mode = 'full'
- nframes = 1e3
-
- #+++++++++++++++++++++++++++++++++++++++++++
- # Create an artificial GMM model, samples it
- #+++++++++++++++++++++++++++++++++++++++++++
- print "Generating the mixture"
- # Generate a model with k components, d dimensions
- w, mu, va = GM.gen_param(d, k, mode, spread = 3)
- gm = GM(d, k, mode)
- gm.set_param(w, mu, va)
-
- # Sample nframes frames from the model
- data = gm.sample(nframes)
-
- #++++++++++++++++++++++++
- # Learn the model with EM
- #++++++++++++++++++++++++
-
- # Init the model
- print "Init a model for learning, with kmean for initialization"
- lgm = GM(d, k, mode)
- gmm = GMM(lgm, 'kmean')
- gmm.init(data)
-
- # Keep the initialized model for drawing
- gm0 = copy.copy(lgm)
-
- # The actual EM, with likelihood computation
- niter = 10
- like = N.zeros(niter)
-
- print "computing..."
- for i in range(niter):
- g, tgd = gmm.sufficient_statistics(data)
- like[i] = N.sum(N.log(N.sum(tgd, 1)), axis = 0)
- gmm.update_em(data, g)
- # # Alternative form, by using EM class: as the EM class
- # # is quite rudimentary now, it is not very useful, just save
- # # a few lines
- # em = EM()
- # like = em.train(data, gmm, niter)
-
- #+++++++++++++++
- # Draw the model
- #+++++++++++++++
- print "drawing..."
- import pylab as P
- P.subplot(2, 1, 1)
-
- if not d == 1:
- # Draw what is happening
- P.plot(data[:, 0], data[:, 1], '.', label = '_nolegend_')
-
- # Real confidence ellipses
- Xre, Yre = gm.conf_ellipses()
- P.plot(Xre[0], Yre[0], 'g', label = 'true confidence ellipsoides')
- for i in range(1,k):
- P.plot(Xre[i], Yre[i], 'g', label = '_nolegend_')
-
- # Initial confidence ellipses as found by kmean
- X0e, Y0e = gm0.conf_ellipses()
- P.plot(X0e[0], Y0e[0], 'k', label = 'initial confidence ellipsoides')
- for i in range(1,k):
- P.plot(X0e[i], Y0e[i], 'k', label = '_nolegend_')
-
- # Values found by EM
- Xe, Ye = lgm.conf_ellipses()
- P.plot(Xe[0], Ye[0], 'r', label = 'confidence ellipsoides found by EM')
- for i in range(1,k):
- P.plot(Xe[i], Ye[i], 'r', label = '_nolegend_')
- P.legend(loc = 0)
- else:
- # Real confidence ellipses
- h = gm.plot1d()
- [i.set_color('g') for i in h['pdf']]
- h['pdf'][0].set_label('true pdf')
-
- # Initial confidence ellipses as found by kmean
- h0 = gm0.plot1d()
- [i.set_color('k') for i in h0['pdf']]
- h0['pdf'][0].set_label('initial pdf')
-
- # Values found by EM
- hl = lgm.plot1d(fill = 1, level = 0.66)
- [i.set_color('r') for i in hl['pdf']]
- hl['pdf'][0].set_label('pdf found by EM')
-
- P.legend(loc = 0)
-
- P.subplot(2, 1, 2)
- P.plot(like)
- P.title('log likelihood')
-
- # #++++++++++++++++++
- # # Export the figure
- # #++++++++++++++++++
- # F = P.gcf()
- # DPI = F.get_dpi()
- # DefaultSize = F.get_size_inches()
- # # the default is 100dpi for savefig:
- # F.savefig("example1.png")
-
- # # Now make the image twice as big, while keeping the fonts and all the
- # # same size
- # F.set_figsize_inches( (DefaultSize[0]*2, DefaultSize[1]*2) )
- # Size = F.get_size_inches()
- # print "Size in Inches", Size
- # F.savefig("example2.png")
- P.show()
Deleted: trunk/Lib/sandbox/pyem/pyem/info.py
===================================================================
--- trunk/Lib/sandbox/pyem/pyem/info.py 2006-10-12 13:48:48 UTC (rev 2284)
+++ trunk/Lib/sandbox/pyem/pyem/info.py 2006-10-12 13:48:58 UTC (rev 2285)
@@ -1,35 +0,0 @@
-"""
-Routines for Gaussian Mixture Models
-and learning with Expectation Maximization
-==========================================
-
-This module contains classes and function to compute multivariate Gaussian densities
-(diagonal and full covariance matrices), Gaussian mixtures, Gaussian mixtures models
-and an Em trainer.
-
-More specifically, the module contains the following file:
-
-- densities.py: functions to compute multivariate Gaussian pdf and ellipsoides of
-confidence (gauss_den)
-- gauss_mix.py: defines the GM (Gaussian Mixture) class. A Gaussian Mixture can be
-created from its parameters weights, mean and variances, or from its meta parameters
-d (dimension of the Gaussian) and k (number of components in the mixture). A Gaussian
-Model can then be sampled or plot (if d>1, plot confidence ellipsoids projected on
-2 chosen dimensions, if d == 1, plot the pdf of each component and fill the zone
-of confidence for a given level)
-- gmm_em.py: defines a class GMM (Gaussian Mixture Model). This class is constructed
-from a GM model gm, and can be used to train gm. The GMM can be initiated by
-kmean or at random, and can compute sufficient statistics, and update its parameters
-from the sufficient statistics.
-- kmean.py: implements a kmean algorithm. We cannot use scipy.cluster.vq kmeans, since
-its does not give membership of observations.
-
-Example of use: simply execute gmm_em.py for an example of training a GM and plotting
-the results compared to true values
-
-Copyright: David Cournapeau 2006
-License: BSD-style (see LICENSE.txt in main source directory)
-"""
-
-depends = ['linalg', 'stats']
-ignore = False
Deleted: trunk/Lib/sandbox/pyem/pyem/kmean.py
===================================================================
--- trunk/Lib/sandbox/pyem/pyem/kmean.py 2006-10-12 13:48:48 UTC (rev 2284)
+++ trunk/Lib/sandbox/pyem/pyem/kmean.py 2006-10-12 13:48:58 UTC (rev 2285)
@@ -1,76 +0,0 @@
-# /usr/bin/python
-# Last Change: Thu Sep 28 01:00 PM 2006 J
-
-#TODO:
-# - a demo for kmeans
-
-import numpy as N
-
-def _py_vq(data, code):
- """ Please do not use directly. Use kmean instead"""
- # No attempt to be efficient has been made...
- (n, d) = data.shape
- (k, d) = code.shape
-
- label = N.zeros(n, int)
- for i in range(n):
- d = N.sum((data[i, :] - code) ** 2, 1)
- label[i] = N.argmin(d)
-
- return label
-
-# Try to import pyrex function for vector quantization. If not available,
-# falls back on pure python implementation.
-#%KMEANIMPORT%
-#try:
-# from scipy.cluster.vq import kmeans as kmean
-#except ImportError:
-# try:
-# from c_gmm import _vq
-# except:
-# print """c_gmm._vq not found, using pure python implementation instead.
-# Kmean will be REALLY slow"""
-# _vq = _py_vq
-try:
- from scipy.cluster.vq import vq
- print "using scipy.cluster.vq"
- def _vq(*args, **kw): return vq(*args, **kw)[0]
-except ImportError:
- try:
- from c_gmm import _vq
- print "using pyrex vq"
- except ImportError:
- print """c_gmm._vq not found, using pure python implementation instead.
- Kmean will be REALLY slow"""
- _vq = _py_vq
-
-def kmean(data, init, iter = 10):
- """Simple kmean implementation for EM. Runs iter iterations.
-
- returns a tuple (code, label), where code are the final
- centroids, and label are the class label indec for each
- frame (ie row) of data"""
-
- data = N.atleast_2d(data)
- init = N.atleast_2d(init)
-
- (n, d) = data.shape
- (k, d1) = init.shape
-
- if not d == d1:
- msg = "data and init centers do not have same dimensions..."
- raise GmmParamError(msg)
-
- code = N.asarray(init.copy())
- for i in range(iter):
- # Compute the nearest neighbour for each obs
- # using the current code book
- label = _vq(data, code)
- # Update the code by computing centroids using the new code book
- for j in range(k):
- code[j,:] = N.mean(data[N.where(label==j)], axis=0)
-
- return code, label
-
-if __name__ == "__main__":
- pass
Deleted: trunk/Lib/sandbox/pyem/pyem/online_em.py
===================================================================
--- trunk/Lib/sandbox/pyem/pyem/online_em.py 2006-10-12 13:48:48 UTC (rev 2284)
+++ trunk/Lib/sandbox/pyem/pyem/online_em.py 2006-10-12 13:48:58 UTC (rev 2285)
@@ -1,369 +0,0 @@
-# /usr/bin/python
-# Last Change: Thu Oct 12 09:00 PM 2006 J
-
-#---------------------------------------------
-# This is not meant to be used yet !!!! I am
-# not sure how to integrate this stuff inside
-# the package yet. The cases are:
-# - we have a set of data, and we want to test online EM
-# compared to normal EM
-# - we do not have all the data before putting them in online EM:
-# eg current frame depends on previous frame in some way.
-
-import numpy as N
-from numpy import repmat, mean
-from numpy.testing import assert_array_almost_equal, assert_array_equal
-
-from gmm_em import ExpMixtureModel, GMM, EM, multiple_gauss_den
-from gauss_mix import GM
-from kmean import kmean
-
-print "======================================================"
-import traceback
-f = traceback.extract_stack()
-print f[0][0] + " This is beta code, don't use it ! "
-print "======================================================"
-
-# Error classes
-class OnGmmError(Exception):
- """Base class for exceptions in this module."""
- pass
-
-class OnGmmParamError:
- """Exception raised for errors in gmm params
-
- Attributes:
- expression -- input expression in which the error occurred
- message -- explanation of the error
- """
- def __init__(self, message):
- self.message = message
-
- def __str__(self):
- return self.message
-
-class OnGMM(ExpMixtureModel):
- def init_kmean(self, init_data, niter = 5):
- """ Init the model at random."""
- k = self.gm.k
- d = self.gm.d
- if mode == 'diag':
- w = N.ones(k) / k
-
- # Init the internal state of EM
- self.cx = N.outer(w, mean(init_data, 0))
- self.cxx = N.outer(w, mean(init_data ** 2, 0))
-
- # w, mu and va init is the same that in the standard case
- (code, label) = kmean(init_data, init_data[0:k, :], niter)
- mu = code.copy()
- va = N.zeros((k, d))
- for i in range(k):
- for j in range(d):
- va [i,j] = N.cov(init_data[N.where(label==i), j], rowvar = 0)
- else:
- raise OnGmmParamError("""init_online not implemented for
- mode %s yet""", mode)
-
- self.gm.set_param(w, mu, va)
- self.cw = w
- self.cmu = mu
- self.cva = va
-
- self.pw = self.cw.copy()
- self.pmu = self.cmu.copy()
- self.pva = self.cva.copy()
-
- def __init__(self, gm, init_data, init = 'kmean'):
- self.gm = gm
-
- # Possible init methods
- init_methods = {'kmean' : self.init_kmean}
-
- self.init = init_methods[init]
-
- def sufficient_statistics(self, frame, nu):
- """ sufficient_statistics(frame, nu)
-
- frame has to be rank 2 !"""
- gamma = multiple_gauss_den(frame, self.pmu, self.pva)[0]
- gamma *= self.pw
- gamma /= N.sum(gamma)
- # <1>(t) = cw(t), each column is one component cw = (cw1, ..., cwK);
- self.cw = (1 - nu) * self.cw + nu * gamma
-
- return gamma
-
- def update_em(self, frame, gamma, nu):
- cx = self.cx
- cxx = self.cxx
- cmu = self.cmu
- cva = self.cva
- for k in range(self.gm.k):
- cx[k] = (1 - nu) * cx[k] + nu * frame * gamma[k]
- cxx[k] = (1 - nu) * cxx[k] + nu * frame ** 2 * gamma[k]
-
- cmu[k] = cx[k] / cw[k]
- cva[k] = cxx[k] / cw[k] - cmu[k] ** 2
-
-if __name__ == "__main__":
- import copy
- #=============================
- # Simple GMM with 2 components
- #=============================
-
- #+++++++++++++++++++++++++++++
- # Meta parameters of the model
- # - k: Number of components
- # - d: dimension of each Gaussian
- # - mode: Mode of covariance matrix: full or diag
- # - nframes: number of frames (frame = one data point = one
- # row of d elements
- k = 2
- d = 1
- mode = 'diag'
- nframes = int(1e3)
- emiter = 2
-
- #+++++++++++++++++++++++++++++++++++++++++++
- # Create an artificial GMM model, samples it
- #+++++++++++++++++++++++++++++++++++++++++++
- print "Generating the mixture"
- # Generate a model with k components, d dimensions
- w, mu, va = GM.gen_param(d, k, mode, spread = 1.5)
- gm = GM.fromvalues(w, mu, va)
-
- # Sample nframes frames from the model
- data = gm.sample(nframes, )
-
- #++++++++++++++++++++++++
- # Learn the model with EM
- #++++++++++++++++++++++++
-
- # Init the model
- print "Init a model for learning, with kmean for initialization"
- lgm = GM(d, k, mode)
- gmm = GMM(lgm, 'kmean')
- gmm.init(data)
-
- # Keep the initialized model for drawing
- gm0 = copy.copy(lgm)
-
- # The actual EM, with likelihood computation
- like = N.zeros(emiter)
- print "computing..."
- for i in range(emiter):
- g, tgd = gmm.sufficient_statistics(data)
- like[i] = N.sum(N.log(N.sum(tgd, 1)), axis = 0)
- gmm.update_em(data, g)
-
- #+++++++++++++++++++++++++++++++
- # Learn the model with Online EM
- #+++++++++++++++++++++++++++++++
- ogm = GM(d, k, mode)
- ogmm = OnGMM(ogm, 'kmean')
- #init_data = data[0:nframes / 20, :]
- init_data = data
- ogmm.init(init_data)
-
- ogm2 = GM(d, k, mode)
- ogmm2 = OnGMM(ogm2, 'kmean')
- #init_data = data[0:nframes / 20, :]
- init_data = data
- ogmm2.init(init_data)
-
- ogm0 = copy.copy(ogm)
- assert_array_equal(ogm0.w, gm0.w)
- assert_array_equal(ogm0.mu, gm0.mu)
- assert_array_equal(ogm0.va, gm0.va)
-
- ogm02 = copy.copy(ogm2)
- assert_array_equal(ogm02.w, gm0.w)
- assert_array_equal(ogm02.mu, gm0.mu)
- assert_array_equal(ogm02.va, gm0.va)
-
- w0 = gm0.w.copy()
- mu0 = gm0.mu.copy()
- va0 = gm0.va.copy()
-
- cx = ogmm.cx
- cxx = ogmm.cxx
-
- cw = w0.copy()
- cmu = mu0.copy()
- cva = va0.copy()
-
- pw = cw.copy()
- pmu = cmu.copy()
- pva = cva.copy()
-
- # Forgetting param
- ku = 0.005
- t0 = 200
- lamb = N.ones((nframes, 1))
- lamb[0] = 0
- nu0 = 1.0
- #lamb = 1 - 1/(N.arange(-1, nframes-1) * ku + t0)
- #nu0 = 0.2
- nu = N.zeros((len(lamb), 1))
- nu[0] = nu0
- for i in range(1, len(lamb)):
- nu[i] = 1./(1 + lamb[i] / nu[i-1])
-
- gamma = N.zeros((nframes, k))
- agamma = []
- apw = []
- apmu = []
- apva = []
- print "========== Online Manual ==========="
- for e in range(emiter):
- print "online manual: estep %d, printing p* state " % e
- apw.append(pw.copy())
- apmu.append(pmu.copy())
- apva.append(pva.copy())
- for t in range(nframes):
- gamma[t] = multiple_gauss_den(data[t:t+1, :], pmu, pva)[0]
- gamma[t] *= pw
- gamma[t] /= N.sum(gamma[t])
- # <1>(t) = cw(t), each column is one component cw = (cw1, ..., cwK);
- # <x>(t) = cx(t)
- cw = (1 - nu[t]) * cw + nu[t] * gamma[t]
- # loop through each component
- for i in range(k):
- cx[i] = (1 - nu[t]) * cx[i] + nu[t] * data[t, :] * gamma[t, i]
- cxx[i] = (1 - nu[t]) * cxx[i] + nu[t] * data[t, :] ** 2 * gamma[t, i]
-
- cmu[i] = cx[i] / cw[i]
- cva[i] = cxx[i] / cw[i] - cmu[i] ** 2
-
- #pw = cw.copy()
- #pmu = cmu.copy()
- #pva = cva.copy()
- print "gamma[end]: " + str(gamma[-1])
- pw = cw.copy()
- pmu = cmu.copy()
- pva = cva.copy()
- agamma.append(gamma.copy())
-
- gamma2 = N.zeros((nframes, k))
- agamma2 = []
- apw2 = []
- apmu2 = []
- apva2 = []
- print "========== Online Automatic ==========="
- for e in range(emiter):
- print "online automatic: estep %d, printing p* state " % e
- apw2.append(ogmm2.pw.copy())
- apmu2.append(ogmm2.pmu.copy())
- apva2.append(ogmm2.pva.copy())
- for t in range(nframes):
- gamma2[t] = ogmm2.sufficient_statistics(data[t:t+1, :], nu[t])
- #gamma2[t] = multiple_gauss_den(data[t:t+1, :], ogmm2.pmu, ogmm2.pva)[0]
- #gamma2[t] *= ogmm2.pw
- #gamma2[t] /= N.sum(gamma2[t])
- #try:
- # assert_array_equal(agamma, gamma2[t])
- #except AssertionError:
- # print "error for estep %d, step %d" % (e, t)
- # print ogmm2.pw
- # print ogmm2.pmu
- # print ogmm2.pva
- # raise
- ogmm2.update_em(data[t, :], gamma2[t], nu[t])
- #ogmm2.cw = (1 - nu[t]) * ogmm2.cw + nu[t] * agamma
- ## loop through each component
- #for i in range(k):
- # ogmm2.cx[i] = (1 - nu[t]) * ogmm2.cx[i] + nu[t] * data[t, :] * agamma[i]
- # ogmm2.cxx[i] = (1 - nu[t]) * ogmm2.cxx[i] + nu[t] * data[t, :] ** 2 * agamma[i]
-
- # ogmm2.cmu[i] = ogmm2.cx[i] / ogmm2.cw[i]
- # ogmm2.cva[i] = ogmm2.cxx[i] / ogmm2.cw[i] - ogmm2.cmu[i] ** 2
-
- print "gamma[end]: " + str(gamma2[-1])
- agamma2.append(gamma2.copy())
- ogmm2.pw = ogmm2.cw.copy()
- ogmm2.pmu = ogmm2.cmu.copy()
- ogmm2.pva = ogmm2.cva.copy()
-
- # #ogm.set_param(pw, pmu, pva)
- # print "weights off vs on: \n" + str(lgm.w) + "\n " + str(cw)
- # print "mean off vs on: \n" + str(lgm.mu) + "\n " + str(cmu)
- # print "variances off vs on: \n" + str(lgm.va) + "\n " + str(cva)
- # print "weights off vs on2: \n" + str(lgm.w) + "\n " + str(ogmm2.cw)
- # print "mean off vs on2: \n" + str(lgm.mu) + "\n " + str(ogmm2.cmu)
- # print "variances off vs on2: \n" + str(lgm.va) + "\n " + str(ogmm2.cva)
- # assert_array_almost_equal(cw, lgm.w)
- # assert_array_almost_equal(cmu, lgm.mu)
- # assert_array_almost_equal(cva, lgm.va)
- assert_array_equal(ogmm2.pw, pw)
- assert_array_equal(ogmm2.pmu, pmu)
- assert_array_equal(ogmm2.pva, pva)
- assert_array_equal(agamma[0], agamma2[0])
- #assert_array_almost_equal(ogmm2.cw, lgm.w)
- #assert_array_almost_equal(ogmm2.cmu, lgm.mu)
- #assert_array_almost_equal(ogmm2.cva, lgm.va)
- # #+++++++++++++++
- # # Draw the model
- # #+++++++++++++++
- # print "drawing..."
- # import pylab as P
- # P.subplot(2, 1, 1)
-
- # if d == 1:
- # # Real confidence ellipses
- # h = gm.plot1d()
- # [i.set_color('g') for i in h['pdf']]
- # h['pdf'][0].set_label('true pdf')
-
- # # Initial confidence ellipses as found by kmean
- # h0 = gm0.plot1d()
- # [i.set_color('k') for i in h0['pdf']]
- # h0['pdf'][0].set_label('initial pdf')
-
- # # Values found by EM
- # hl = lgm.plot1d(fill = 1, level = 0.66)
- # [i.set_color('r') for i in hl['pdf']]
- # hl['pdf'][0].set_label('pdf found by EM')
-
- # # Values found by Online EM
- # ho = ogm.plot1d(fill = 1, level = 0.66)
- # [i.set_color('b') for i in ho['pdf']]
- # ho['pdf'][0].set_label('pdf found by online EM')
-
- # P.legend(loc = 0)
- # else:
- # # Draw what is happening
- # P.plot(data[:, 0], data[:, 1], '.', label = '_nolegend_')
-
- # # Real confidence ellipses
- # Xre, Yre = gm.conf_ellipses()
- # P.plot(Xre[0], Yre[0], 'g', label = 'true confidence ellipsoides')
- # for i in range(1,k):
- # P.plot(Xre[i], Yre[i], 'g', label = '_nolegend_')
-
- # # Initial confidence ellipses as found by kmean
- # X0e, Y0e = gm0.conf_ellipses()
- # P.plot(X0e[0], Y0e[0], 'k', label = 'initial confidence ellipsoides')
- # for i in range(1,k):
- # P.plot(X0e[i], Y0e[i], 'k', label = '_nolegend_')
-
- # # Values found by EM
- # Xe, Ye = lgm.conf_ellipses()
- # P.plot(Xe[0], Ye[0], 'r', label = 'confidence ellipsoides found by EM')
- # for i in range(1,k):
- # P.plot(Xe[i], Ye[i], 'r', label = '_nolegend_')
- # P.legend(loc = 0)
-
- # # Values found by Online EM
- # Xe, Ye = ogm.conf_ellipses()
- # P.plot(Xe[0], Ye[0], 'm', label = 'confidence ellipsoides found by Online EM')
- # for i in range(1,k):
- # P.plot(Xe[i], Ye[i], 'm', label = '_nolegend_')
- # P.legend(loc = 0)
-
-
- # P.subplot(2, 1, 2)
- # P.plot(like)
- # P.title('log likelihood')
-
- # P.show()
Deleted: trunk/Lib/sandbox/pyem/pyem/profile_data/profile_densities.py
===================================================================
--- trunk/Lib/sandbox/pyem/pyem/profile_data/profile_densities.py 2006-10-12 13:48:48 UTC (rev 2284)
+++ trunk/Lib/sandbox/pyem/pyem/profile_data/profile_densities.py 2006-10-12 13:48:58 UTC (rev 2285)
@@ -1,54 +0,0 @@
-import numpy as N
-from numpy.random import randn
-from pyem import densities as D
-from pyem import _c_densities as DC
-#import tables
-
-def bench(func, mode = 'diag'):
- #===========================================
- # Diag Gaussian of dimension 20
- #===========================================
- d = 30
- n = 1e5
- niter = 10
-
- print "Compute %d times densities, %d dimension, %d frames" % (niter, d, n)
- # Generate a model with k components, d dimensions
- mu = randn(1, d)
- if mode == 'diag':
- va = abs(randn(1, d))
- elif mode == 'full':
- va = randn(d, d)
- va = N.dot(va, va.transpose())
-
- X = randn(n, d)
- for i in range(niter):
- Y = func(X, mu, va)
-
-def benchpy():
- bench(D.gauss_den)
-
-def benchc():
- bench(DC.gauss_den)
-
-def benchpyfull():
- bench(D.gauss_den, 'full')
-
-def benchcfull():
- bench(DC.gauss_den, 'full')
-
-if __name__ == "__main__":
- import hotshot, hotshot.stats
- profile_file = 'gdenpy.prof'
- prof = hotshot.Profile(profile_file, lineevents=1)
- prof.runcall(benchpy)
- p = hotshot.stats.load(profile_file)
- print p.sort_stats('cumulative').print_stats(20)
- prof.close()
-
- profile_file = 'gdenc.prof'
- prof = hotshot.Profile(profile_file, lineevents=1)
- prof.runcall(benchc)
- p = hotshot.stats.load(profile_file)
- print p.sort_stats('cumulative').print_stats(20)
- prof.close()
Deleted: trunk/Lib/sandbox/pyem/pyem/profile_data/profile_gmm.py
===================================================================
--- trunk/Lib/sandbox/pyem/pyem/profile_data/profile_gmm.py 2006-10-12 13:48:48 UTC (rev 2284)
+++ trunk/Lib/sandbox/pyem/pyem/profile_data/profile_gmm.py 2006-10-12 13:48:58 UTC (rev 2285)
@@ -1,70 +0,0 @@
-import numpy as N
-from pyem import GM, GMM
-import copy
-
-from pyem._c_densities import gauss_den
-
-def bench1(mode = 'diag'):
- #===========================================
- # GMM of 20 comp, 20 dimension, 1e4 frames
- #===========================================
- d = 15
- k = 30
- nframes = 1e4
- niter = 10
- mode = 'diag'
-
- print "============================================================="
- print "(%d dim, %d components) GMM with %d iterations, for %d frames" \
- % (d, k, niter, nframes)
-
- #+++++++++++++++++++++++++++++++++++++++++++
- # Create an artificial GMM model, samples it
- #+++++++++++++++++++++++++++++++++++++++++++
- print "Generating the mixture"
- # Generate a model with k components, d dimensions
- w, mu, va = GM.gen_param(d, k, mode, spread = 3)
- # gm = GM(d, k, mode)
- # gm.set_param(w, mu, va)
- gm = GM.fromvalues(w, mu, va)
-
- # Sample nframes frames from the model
- data = gm.sample(nframes)
-
- #++++++++++++++++++++++++
- # Learn the model with EM
- #++++++++++++++++++++++++
-
- # Init the model
- print "Init a model for learning, with kmean for initialization"
- lgm = GM(d, k, mode)
- gmm = GMM(lgm, 'kmean')
-
- gmm.init(data)
- # Keep the initialized model for drawing
- gm0 = copy.copy(lgm)
-
- # The actual EM, with likelihood computation
- like = N.zeros(niter)
-
- print "computing..."
- for i in range(niter):
- print "iteration %d" % i
- g, tgd = gmm.sufficient_statistics(data)
- like[i] = N.sum(N.log(N.sum(tgd, 1)))
- gmm.update_em(data, g)
-
-if __name__ == "__main__":
- import hotshot, hotshot.stats
- profile_file = 'gmm.prof'
- prof = hotshot.Profile(profile_file, lineevents=1)
- prof.runcall(bench1)
- p = hotshot.stats.load(profile_file)
- print p.sort_stats('cumulative').print_stats(20)
- prof.close()
- # import profile
- # profile.run('bench1()', 'gmmprof')
- # import pstats
- # p = pstats.Stats('gmmprof')
- # print p.sort_stats('cumulative').print_stats(20)
-
Deleted: trunk/Lib/sandbox/pyem/pyem/setup.py
===================================================================
--- trunk/Lib/sandbox/pyem/pyem/setup.py 2006-10-12 13:48:48 UTC (rev 2284)
+++ trunk/Lib/sandbox/pyem/pyem/setup.py 2006-10-12 13:48:58 UTC (rev 2285)
@@ -1,142 +0,0 @@
-#! /usr/bin/env python
-# Last Change: Fri Oct 06 09:00 PM 2006 J
-# TODO:
-# - check how to handle cmd line build options with distutils and use
-# it in the building process
-
-""" pyem is a small python package to estimate Gaussian Mixtures Models
-from data, using Expectation Maximization"""
-
-from os.path import join
-from info import version as pyem_version
-
-DISTNAME = 'pyem'
-VERSION = pyem_version
-DESCRIPTION ='A python module for Expectation Maximization learning of mixtures pdf',
-AUTHOR ='David Cournapeau',
-AUTHOR_EMAIL='david at ar.media.kyoto-u.ac.jp',
-URL ='http://ar.media.kyoto-u.ac.jp/members/david',
-
-def configuration(parent_package='',top_path=None, package_name='pyem'):
- from numpy.distutils.misc_util import Configuration
- config = Configuration(package_name,parent_package,top_path,
- version = VERSION)
- config.add_data_dir('tests')
- config.add_subpackage('profile_data')
- config.add_extension('c_gden',
- #define_macros=[('LIBSVM_EXPORTS', None),
- # ('LIBSVM_DLL', None)],
- sources=[join('src', 'c_gden.c')])
-
- return config
-
-if __name__ == "__main__":
- from numpy.distutils.core import setup
- #setup(**configuration(top_path='').todict())
- setup(**configuration(top_path='',
- package_name='scipy.sandbox.pyem').todict())
-# from distutils.core import setup, Extension
-# from pyem import version as pyem_version
-#
-# # distutils does not update MANIFEST correctly, removes it
-# import os
-# if os.path.exists('MANIFEST'): os.remove('MANIFEST')
-# from os.path import join
-#
-# import re
-#
-# from numpy.distutils.misc_util import get_numpy_include_dirs
-# NUMPYINC = get_numpy_include_dirs()[0]
-#
-# # General variables:
-# # - DISTNAME: name of the distributed package
-# # - VERSION: the version reference is in pyem/__init__.py file
-# # - other upper cased variables are the same than the corresponding
-# # keywords in setup call
-# DISTNAME = 'pyem'
-# VERSION = pyem_version
-# DESCRIPTION ='A python module for Expectation Maximization learning of mixtures pdf',
-# AUTHOR ='David Cournapeau',
-# AUTHOR_EMAIL='david at ar.media.kyoto-u.ac.jp',
-# URL ='http://ar.media.kyoto-u.ac.jp/members/david',
-#
-# # Source files for extensions
-#
-# # Functions used to substitute values in File.
-# # Mainly use to replace config.h capabilities
-# def do_subst_in_file(sourcefile, targetfile, dict):
-# """Replace all instances of the keys of dict with their values.
-# For example, if dict is {'%VERSION%': '1.2345', '%BASE%': 'MyProg'},
-# then all instances of %VERSION% in the file will be replaced with 1.2345 etc.
-# """
-# try:
-# f = open(sourcefile, 'rb')
-# contents = f.read()
-# f.close()
-# except:
-# raise IOError, "Can't read source file %s"%sourcefile
-#
-# for (k,v) in dict.items():
-# contents = re.sub(k, v, contents)
-# try:
-# f = open(targetfile, 'wb')
-# f.write(contents)
-# f.close()
-# except:
-# raise IOError, "Can't read source file %s"%sourcefile
-# return 0 # success
-#
-# class SetupOption:
-# def __init__(self):
-# self.kmean = 'py'
-# self.ext_modules= [Extension(join('pyem', 'c_gden'),
-# sources=[join('pyem', 'src', 'c_gden.c')]) ]
-# self.cmdclass = {}
-# self.subsdic = {'%KMEANIMPORT%': []}
-#
-# def _config_kmean(self):
-# # Check in this order:
-# # - kmean in scipy.cluster,
-# # - custom vq with pyrex
-# # - custom pure python vq
-# #try:
-# # from scipy.cluster.vq import kmeans
-# # self.kmean = 'scipy'
-# # #self.subsdic['%KMEANIMPORT%'] = scipy_kmean
-# #except ImportError:
-# # try:
-# # from Pyrex.Distutils import build_ext
-# # self.kmean = 'pyrex'
-# # self.ext_modules.append(Extension('pyem/c_gmm',
-# # ['pyem/src/c_gmm.pyx'], include_dirs=[NUMPYINC]))
-# # self.cmdclass['build_ext'] = build_ext
-# # #self.subsdic['%KMEANIMPORT%'] = pyrex_kmean
-# # except ImportError:
-# # self.kmean = 'py'
-# # #self.subsdic['%KMEANIMPORT%'] = pyrex_kmean
-# try:
-# from Pyrex.Distutils import build_ext
-# self.kmean = 'pyrex'
-# self.ext_modules.append(Extension('pyem/c_gmm',
-# ['pyem/src/c_gmm.pyx'], include_dirs=[NUMPYINC]))
-# self.cmdclass['build_ext'] = build_ext
-# #self.subsdic['%KMEANIMPORT%'] = pyrex_kmean
-# except ImportError:
-# self.kmean = 'py'
-# #self.subsdic['%KMEANIMPORT%'] = pyrex_kmean
-# def setup(self):
-# self._config_kmean()
-# #import time
-# #do_subst_in_file('pyem/kmean.py.in', 'pyem/kmean.py', self.subsdic)
-# setup(name = DISTNAME,
-# version = VERSION,
-# description = DESCRIPTION,
-# author = AUTHOR,
-# author_email= AUTHOR_EMAIL,
-# url = URL,
-# packages = ['pyem', 'pyem.tests', 'pyem.profile_data'],
-# ext_modules = self.ext_modules,
-# cmdclass = self.cmdclass)
-#
-# stpobj = SetupOption()
-# stpobj.setup()
Deleted: trunk/Lib/sandbox/pyem/pyem/src/Makefile
===================================================================
--- trunk/Lib/sandbox/pyem/pyem/src/Makefile 2006-10-12 13:48:48 UTC (rev 2284)
+++ trunk/Lib/sandbox/pyem/pyem/src/Makefile 2006-10-12 13:48:58 UTC (rev 2285)
@@ -1,32 +0,0 @@
-CC = colorgcc
-LD = gcc
-
-PYREX = python2.4-pyrexc
-
-PYTHONINC = -I/usr/include/python2.4
-NUMPYINC = -I/usr/lib/python2.4/site-packages/numpy/core/include
-OPTIMS = -O3 -funroll-all-loops -march=pentium4 -msse2
-WARN = -W -Wall
-
-CFLAGS = $(PYTHONINC) $(NUMPYINC) $(OPTIMS) $(WARN)
-
-c_gden.so: c_gden.o
- $(LD) -shared -o $@ $< -Wl,-soname,$@
-
-c_gden.o: c_gden.c
- $(CC) -c $(CFLAGS) -fPIC $<
-
-c_gmm.so: c_gmm.o
- $(LD) -shared -o $@ $< -Wl,-soname,$@
-
-c_gmm.o: c_gmm.c
- $(CC) -c $(CFLAGS) -fPIC $<
-
-c_gmm.c: c_gmm.pyx c_numpy.pxd c_python.pxd
- $(PYREX) $<
-
-clean:
- rm -f c_gmm.c
- rm -f *.o
- rm -f *.so
- rm -f *.pyc
Deleted: trunk/Lib/sandbox/pyem/pyem/src/c_gden.c
===================================================================
--- trunk/Lib/sandbox/pyem/pyem/src/c_gden.c 2006-10-12 13:48:48 UTC (rev 2284)
+++ trunk/Lib/sandbox/pyem/pyem/src/c_gden.c 2006-10-12 13:48:58 UTC (rev 2285)
@@ -1,37 +0,0 @@
-#include <stddef.h>
-#include <stdio.h>
-#include <math.h>
-
-/*
- * use va for caching, so its content is modified
- */
-int gden_diag(const double *in, const size_t n, const size_t d,
- const double* mu, double* va, double* out)
-{
- size_t j, i;
- double fac = 1.0;
- double acc, tmp;
-
- /*
- * Cache some precomputing
- */
- for(j = 0; j < d; ++j) {
- va[j] = 0.5/va[j];
- fac *= sqrt(va[j] / (M_PI) );
- va[j] *= -1.0;
- }
-
- /*
- * actual computing
- */
- for(i = 0; i < n; ++i) {
- acc = 0;
- for(j = 0; j < d; ++j) {
- tmp = (in[i *d + j] - mu[j]);
- acc += tmp * tmp * va[j];
- }
- out[i] = exp(acc) * fac;
- }
-
- return 0;
-}
Deleted: trunk/Lib/sandbox/pyem/pyem/src/c_gmm.pyx
===================================================================
--- trunk/Lib/sandbox/pyem/pyem/src/c_gmm.pyx 2006-10-12 13:48:48 UTC (rev 2284)
+++ trunk/Lib/sandbox/pyem/pyem/src/c_gmm.pyx 2006-10-12 13:48:58 UTC (rev 2285)
@@ -1,66 +0,0 @@
-# Last Change: Thu Jul 13 04:00 PM 2006 J
-
-cimport c_numpy
-cimport c_python
-import numpy
-
-c_numpy.import_array()
-
-# pyrex version of _vq. Much faster in high dimension/high number of K
-# (ie K > 3-4)
-def _vq(data, init):
- (n, d) = data.shape
- label = numpy.zeros(n, int)
- _imp_vq(data, init, label)
-
- return label
-
-def _imp_vq(c_numpy.ndarray data, c_numpy.ndarray init, c_numpy.ndarray label):
- cdef int n
- cdef int d
- cdef int nc
- cdef int i
- cdef int j
- cdef int k
- cdef int *labeld
- cdef double *da, *code
- cdef double dist
- cdef double acc
-
- n = data.dimensions[0]
- d = data.dimensions[1]
- nc = init.dimensions[0]
-
- if not data.dtype == numpy.dtype(numpy.float64):
- print '_vq not (yet) implemented for dtype %s'%dtype.name
- return
- da = (<double*>data.data)
-
- if not init.dtype == numpy.dtype(numpy.float64):
- print '_vq not (yet) implemented for dtype %s'%dtype.name
- return
- code = (<double*>init.data)
-
- if not label.dtype == numpy.dtype(numpy.int32):
- print '_vq not (yet) implemented for dtype %s'%dtype.name
- return
- labeld = (<int*>label.data)
-
- for i from 0<=i<n:
- acc = 0
- lab = 0
- for j from 0<=j<d:
- acc = acc + (da[i * d + j] - code[j]) * \
- (da[i * d + j] - code[j])
- dist = acc
- for k from 1<=k<nc:
- acc = 0
- for j from 0<=j<d:
- acc = acc + (da[i * d + j] - code[k * d + j]) * \
- (da[i * d + j] - code[k * d + j])
- if acc < dist:
- dist = acc
- lab = k
- labeld[i] = lab
-
- return lab
Deleted: trunk/Lib/sandbox/pyem/pyem/src/c_numpy.pxd
===================================================================
--- trunk/Lib/sandbox/pyem/pyem/src/c_numpy.pxd 2006-10-12 13:48:48 UTC (rev 2284)
+++ trunk/Lib/sandbox/pyem/pyem/src/c_numpy.pxd 2006-10-12 13:48:58 UTC (rev 2285)
@@ -1,59 +0,0 @@
-# :Author: Robert Kern
-# :Copyright: 2004, Enthought, Inc.
-# :License: BSD Style
-
-
-cdef extern from "numpy/arrayobject.h":
- ctypedef enum PyArray_TYPES:
- PyArray_BOOL
- PyArray_BYTE
- PyArray_UBYTE
- PyArray_SHORT
- PyArray_USHORT
- PyArray_INT
- PyArray_UINT
- PyArray_LONG
- PyArray_ULONG
- PyArray_LONGLONG
- PyArray_ULONGLONG
- PyArray_FLOAT
- PyArray_DOUBLE
- PyArray_LONGDOUBLE
- PyArray_CFLOAT
- PyArray_CDOUBLE
- PyArray_CLONGDOUBLE
- PyArray_OBJECT
- PyArray_STRING
- PyArray_UNICODE
- PyArray_VOID
- PyArray_NTYPES
- PyArray_NOTYPE
-
- ctypedef int intp
-
- ctypedef extern class numpy.dtype [object PyArray_Descr]:
- cdef int type_num, elsize, alignment
- cdef char type, kind, byteorder, hasobject
- cdef object fields, typeobj
-
- ctypedef extern class numpy.ndarray [object PyArrayObject]:
- cdef char *data
- cdef int nd
- cdef intp *dimensions
- cdef intp *strides
- cdef object base
- cdef dtype descr
- cdef int flags
-
- ndarray PyArray_SimpleNew(int ndims, intp* dims, int item_type)
- int PyArray_Check(object obj)
- ndarray PyArray_ContiguousFromObject(object obj, PyArray_TYPES type,
- int mindim, int maxdim)
- intp PyArray_SIZE(ndarray arr)
- void *PyArray_DATA(ndarray arr)
- ndarray PyArray_FromAny(object obj, dtype newtype, int mindim, int maxdim,
- int requirements, object context)
- ndarray PyArray_NewFromDescr(object subtype, dtype newtype, int nd, intp* dims,
- intp* strides, void* data, int flags, object parent)
-
- void import_array()
Deleted: trunk/Lib/sandbox/pyem/pyem/src/c_python.pxd
===================================================================
--- trunk/Lib/sandbox/pyem/pyem/src/c_python.pxd 2006-10-12 13:48:48 UTC (rev 2284)
+++ trunk/Lib/sandbox/pyem/pyem/src/c_python.pxd 2006-10-12 13:48:58 UTC (rev 2285)
@@ -1,20 +0,0 @@
-# -*- Mode: Python -*- Not really, but close enough
-
-# Expose as much of the Python C API as we need here
-
-cdef extern from "stdlib.h":
- ctypedef int size_t
-
-cdef extern from "Python.h":
- ctypedef int Py_intptr_t
- void* PyMem_Malloc(size_t)
- void* PyMem_Realloc(void *p, size_t n)
- void PyMem_Free(void *p)
- char* PyString_AsString(object string)
- object PyString_FromString(char *v)
- object PyString_InternFromString(char *v)
- int PyErr_CheckSignals()
- object PyFloat_FromDouble(double v)
- void Py_XINCREF(object o)
- void Py_XDECREF(object o)
- void Py_CLEAR(object o) # use instead of decref
Deleted: trunk/Lib/sandbox/pyem/pyem/tests/test_densities.py
===================================================================
--- trunk/Lib/sandbox/pyem/pyem/tests/test_densities.py 2006-10-12 13:48:48 UTC (rev 2284)
+++ trunk/Lib/sandbox/pyem/pyem/tests/test_densities.py 2006-10-12 13:48:58 UTC (rev 2285)
@@ -1,181 +0,0 @@
-#! /usr/bin/env python
-# Last Change: Fri Sep 29 06:00 PM 2006 J
-
-import sys
-from numpy.testing import *
-
-import numpy as N
-
-set_package_path()
-from pyem.densities import gauss_den
-from pyem._c_densities import gauss_den as c_gauss_den
-
-restore_path()
-
-#Optional:
-set_local_path()
-# import modules that are located in the same directory as this file.
-restore_path()
-
-class test_densities(NumpyTestCase):
- def _generate_test_data_1d(self):
- self.va = 2.0
- self.mu = 1.0
- self.X = N.linspace(-2, 2, 10)[:, N.newaxis]
-
- self.Yt = N.array([0.02973257230591, 0.05512079811082, 0.09257745306945,
- 0.14086453882683,
- 0.19418015562214, 0.24250166773127, 0.27436665745048, 0.28122547107069,
- 0.26114678964743, 0.21969564473386])
-
- def _generate_test_data_2d_diag(self):
- #============================
- # Small test in 2d (diagonal)
- #============================
- self.mu = N.atleast_2d([-1.0, 2.0])
- self.va = N.atleast_2d([2.0, 3.0])
-
- self.X = N.zeros((10, 2))
- self.X[:,0] = N.linspace(-2, 2, 10)
- self.X[:,1] = N.linspace(-1, 3, 10)
-
- self.Yt = N.array([0.01129091565384, 0.02025416837152, 0.03081845516786,
- 0.03977576221540, 0.04354490552910, 0.04043592581117,
- 0.03184994053539, 0.02127948225225, 0.01205937178755,
- 0.00579694938623 ])
-
-
- def _generate_test_data_2d_full(self):
- #============================
- # Small test in 2d (full mat)
- #============================
- self.mu = N.array([[0.2, -1.0]])
- self.va = N.array([[1.2, 0.1], [0.1, 0.5]])
- X1 = N.linspace(-2, 2, 10)[:, N.newaxis]
- X2 = N.linspace(-3, 3, 10)[:, N.newaxis]
- self.X = N.concatenate(([X1, X2]), 1)
-
- self.Yt = N.array([0.00096157109751, 0.01368908714856,
- 0.07380823191162, 0.15072050533842,
- 0.11656739937861, 0.03414436965525,
- 0.00378789836599, 0.00015915297541,
- 0.00000253261067, 0.00000001526368])
-
- def _check_py(self, level, decimal = 12):
- Y = gauss_den(self.X, self.mu, self.va)
- assert_array_almost_equal(Y, self.Yt, decimal)
-
- def _check_c(self, level, decimal = 12):
- Y = c_gauss_den(self.X, self.mu, self.va)
- assert_array_almost_equal(Y, self.Yt, decimal)
-
- def check_py_1d(self, level = 1):
- self._generate_test_data_1d()
- self._check_py(level)
-
- def check_py_2d_diag(self, level = 1):
- self._generate_test_data_2d_diag()
- self._check_py(level)
-
- def check_py_2d_full(self, level = 1):
- self._generate_test_data_2d_full()
- self._check_py(level)
-
- def check_c_1d(self, level = 1):
- self._generate_test_data_1d()
- self._check_c(level)
-
- def check_c_2d_diag(self, level = 1):
- self._generate_test_data_2d_diag()
- self._check_c(level)
-
- def check_c_2d_full(self, level = 1):
- self._generate_test_data_2d_full()
- self._check_c(level)
-
-if __name__ == "__main__":
- NumpyTest().run()
-
-# def generate_test_data(n, d, mode = 'diag', file='test.dat'):
-# """Generate a set of data of dimension d, with n frames,
-# that is input data, mean, var and output of gden, so that
-# other implementations can be tested against"""
-# mu = randn(1, d)
-# if mode == 'diag':
-# va = abs(randn(1, d))
-# elif mode == 'full':
-# va = randn(d, d)
-# va = dot(va, va.transpose())
-#
-# input = randn(n, d)
-# output = gauss_den(input, mu, va)
-#
-# import tables
-# h5file = tables.openFile(file, "w")
-#
-# h5file.createArray(h5file.root, 'input', input)
-# h5file.createArray(h5file.root, 'mu', mu)
-# h5file.createArray(h5file.root, 'va', va)
-# h5file.createArray(h5file.root, 'output', output)
-#
-# h5file.close()
-#
-# def test_gauss_den():
-# """"""
-# # import tables
-# # import numpy as N
-# #
-# # filename = 'dendata.h5'
-#
-# # # # Dimension 1
-# # # d = 1
-# # # mu = 1.0
-# # # va = 2.0
-#
-# # # X = randn(1e3, 1)
-#
-# # # Y = gauss_den(X, mu, va)
-#
-# # # h5file = tables.openFile(filename, "w")
-#
-# # # h5file.createArray(h5file.root, 'X', X)
-# # # h5file.createArray(h5file.root, 'mu', mu)
-# # # h5file.createArray(h5file.root, 'va', va)
-# # # h5file.createArray(h5file.root, 'Y', Y)
-#
-# # # h5file.close()
-#
-# # # # Dimension 2, diag
-# # # d = 2
-# # # mu = N.array([1.0, -2.0])
-# # # va = N.array([1.0, 2.0])
-#
-# # # X = randn(1e3, 2)
-#
-# # # Y = gauss_den(X, mu, va)
-#
-# # # h5file = tables.openFile(filename, "w")
-#
-# # # h5file.createArray(h5file.root, 'X', X)
-# # # h5file.createArray(h5file.root, 'mu', mu)
-# # # h5file.createArray(h5file.root, 'va', va)
-# # # h5file.createArray(h5file.root, 'Y', Y)
-#
-# # # Dimension 2, full
-# # d = 2
-# # mu = N.array([[0.2, -1.0]])
-# # va = N.array([[1.2, 0.1], [0.1, 0.5]])
-#
-# # X = randn(1e3, 2)
-#
-# # Y = gauss_den(X, mu, va)
-#
-# # h5file = tables.openFile(filename, "w")
-#
-# # h5file.createArray(h5file.root, 'X', X)
-# # h5file.createArray(h5file.root, 'mu', mu)
-# # h5file.createArray(h5file.root, 'va', va)
-# # h5file.createArray(h5file.root, 'Y', Y)
-#
-# # h5file.close()
-#
Deleted: trunk/Lib/sandbox/pyem/pyem/tests/test_kmean.py
===================================================================
--- trunk/Lib/sandbox/pyem/pyem/tests/test_kmean.py 2006-10-12 13:48:48 UTC (rev 2284)
+++ trunk/Lib/sandbox/pyem/pyem/tests/test_kmean.py 2006-10-12 13:48:58 UTC (rev 2285)
@@ -1,46 +0,0 @@
-#! /usr/bin/env python
-# Last Change: Thu Sep 28 01:00 PM 2006 J
-
-import sys
-from numpy.testing import *
-
-import numpy as N
-
-set_package_path()
-from pyem.kmean import kmean
-restore_path()
-
-#Optional:
-set_local_path()
-# import modules that are located in the same directory as this file.
-restore_path()
-
-# Global data
-X = N.array([[3.0, 3], [4, 3], [4, 2],
- [9, 2], [5, 1], [6, 2], [9, 4],
- [5, 2], [5, 4], [7, 4], [6, 5]])
-
-codet1 = N.array([[3.0000, 3.0000],
- [6.2000, 4.0000],
- [5.8000, 1.8000]])
-
-codet2 = N.array([[11.0/3, 8.0/3],
- [6.7500, 4.2500],
- [6.2500, 1.7500]])
-
-class test_kmean(NumpyTestCase):
- def check_iter1(self, level=1):
- initc = N.concatenate(([[X[0]], [X[1]], [X[2]]]))
- code = initc.copy()
- code1 = kmean(X, code, 1)[0]
-
- assert_array_almost_equal(code1, codet1)
- def check_iter2(self, level=1):
- initc = N.concatenate(([[X[0]], [X[1]], [X[2]]]))
- code = initc.copy()
- code2 = kmean(X, code, 2)[0]
-
- assert_array_almost_equal(code2, codet2)
-
-if __name__ == "__main__":
- NumpyTest().run()
Added: trunk/Lib/sandbox/pyem/setup.py
===================================================================
--- trunk/Lib/sandbox/pyem/setup.py 2006-10-12 13:48:48 UTC (rev 2284)
+++ trunk/Lib/sandbox/pyem/setup.py 2006-10-12 13:48:58 UTC (rev 2285)
@@ -0,0 +1,142 @@
+#! /usr/bin/env python
+# Last Change: Fri Oct 06 09:00 PM 2006 J
+# TODO:
+# - check how to handle cmd line build options with distutils and use
+# it in the building process
+
+""" pyem is a small python package to estimate Gaussian Mixtures Models
+from data, using Expectation Maximization"""
+
+from os.path import join
+from info import version as pyem_version
+
+DISTNAME = 'pyem'
+VERSION = pyem_version
+DESCRIPTION ='A python module for Expectation Maximization learning of mixtures pdf',
+AUTHOR ='David Cournapeau',
+AUTHOR_EMAIL='david at ar.media.kyoto-u.ac.jp',
+URL ='http://ar.media.kyoto-u.ac.jp/members/david',
+
+def configuration(parent_package='',top_path=None, package_name='pyem'):
+ from numpy.distutils.misc_util import Configuration
+ config = Configuration(package_name,parent_package,top_path,
+ version = VERSION)
+ config.add_data_dir('tests')
+ config.add_subpackage('profile_data')
+ config.add_extension('c_gden',
+ #define_macros=[('LIBSVM_EXPORTS', None),
+ # ('LIBSVM_DLL', None)],
+ sources=[join('src', 'c_gden.c')])
+
+ return config
+
+if __name__ == "__main__":
+ from numpy.distutils.core import setup
+ #setup(**configuration(top_path='').todict())
+ setup(**configuration(top_path='',
+ package_name='scipy.sandbox.pyem').todict())
+# from distutils.core import setup, Extension
+# from pyem import version as pyem_version
+#
+# # distutils does not update MANIFEST correctly, removes it
+# import os
+# if os.path.exists('MANIFEST'): os.remove('MANIFEST')
+# from os.path import join
+#
+# import re
+#
+# from numpy.distutils.misc_util import get_numpy_include_dirs
+# NUMPYINC = get_numpy_include_dirs()[0]
+#
+# # General variables:
+# # - DISTNAME: name of the distributed package
+# # - VERSION: the version reference is in pyem/__init__.py file
+# # - other upper cased variables are the same than the corresponding
+# # keywords in setup call
+# DISTNAME = 'pyem'
+# VERSION = pyem_version
+# DESCRIPTION ='A python module for Expectation Maximization learning of mixtures pdf',
+# AUTHOR ='David Cournapeau',
+# AUTHOR_EMAIL='david at ar.media.kyoto-u.ac.jp',
+# URL ='http://ar.media.kyoto-u.ac.jp/members/david',
+#
+# # Source files for extensions
+#
+# # Functions used to substitute values in File.
+# # Mainly use to replace config.h capabilities
+# def do_subst_in_file(sourcefile, targetfile, dict):
+# """Replace all instances of the keys of dict with their values.
+# For example, if dict is {'%VERSION%': '1.2345', '%BASE%': 'MyProg'},
+# then all instances of %VERSION% in the file will be replaced with 1.2345 etc.
+# """
+# try:
+# f = open(sourcefile, 'rb')
+# contents = f.read()
+# f.close()
+# except:
+# raise IOError, "Can't read source file %s"%sourcefile
+#
+# for (k,v) in dict.items():
+# contents = re.sub(k, v, contents)
+# try:
+# f = open(targetfile, 'wb')
+# f.write(contents)
+# f.close()
+# except:
+# raise IOError, "Can't read source file %s"%sourcefile
+# return 0 # success
+#
+# class SetupOption:
+# def __init__(self):
+# self.kmean = 'py'
+# self.ext_modules= [Extension(join('pyem', 'c_gden'),
+# sources=[join('pyem', 'src', 'c_gden.c')]) ]
+# self.cmdclass = {}
+# self.subsdic = {'%KMEANIMPORT%': []}
+#
+# def _config_kmean(self):
+# # Check in this order:
+# # - kmean in scipy.cluster,
+# # - custom vq with pyrex
+# # - custom pure python vq
+# #try:
+# # from scipy.cluster.vq import kmeans
+# # self.kmean = 'scipy'
+# # #self.subsdic['%KMEANIMPORT%'] = scipy_kmean
+# #except ImportError:
+# # try:
+# # from Pyrex.Distutils import build_ext
+# # self.kmean = 'pyrex'
+# # self.ext_modules.append(Extension('pyem/c_gmm',
+# # ['pyem/src/c_gmm.pyx'], include_dirs=[NUMPYINC]))
+# # self.cmdclass['build_ext'] = build_ext
+# # #self.subsdic['%KMEANIMPORT%'] = pyrex_kmean
+# # except ImportError:
+# # self.kmean = 'py'
+# # #self.subsdic['%KMEANIMPORT%'] = pyrex_kmean
+# try:
+# from Pyrex.Distutils import build_ext
+# self.kmean = 'pyrex'
+# self.ext_modules.append(Extension('pyem/c_gmm',
+# ['pyem/src/c_gmm.pyx'], include_dirs=[NUMPYINC]))
+# self.cmdclass['build_ext'] = build_ext
+# #self.subsdic['%KMEANIMPORT%'] = pyrex_kmean
+# except ImportError:
+# self.kmean = 'py'
+# #self.subsdic['%KMEANIMPORT%'] = pyrex_kmean
+# def setup(self):
+# self._config_kmean()
+# #import time
+# #do_subst_in_file('pyem/kmean.py.in', 'pyem/kmean.py', self.subsdic)
+# setup(name = DISTNAME,
+# version = VERSION,
+# description = DESCRIPTION,
+# author = AUTHOR,
+# author_email= AUTHOR_EMAIL,
+# url = URL,
+# packages = ['pyem', 'pyem.tests', 'pyem.profile_data'],
+# ext_modules = self.ext_modules,
+# cmdclass = self.cmdclass)
+#
+# stpobj = SetupOption()
+# stpobj.setup()
Added: trunk/Lib/sandbox/pyem/src/Makefile
===================================================================
--- trunk/Lib/sandbox/pyem/src/Makefile 2006-10-12 13:48:48 UTC (rev 2284)
+++ trunk/Lib/sandbox/pyem/src/Makefile 2006-10-12 13:48:58 UTC (rev 2285)
@@ -0,0 +1,32 @@
+CC = colorgcc
+LD = gcc
+
+PYREX = python2.4-pyrexc
+
+PYTHONINC = -I/usr/include/python2.4
+NUMPYINC = -I/usr/lib/python2.4/site-packages/numpy/core/include
+OPTIMS = -O3 -funroll-all-loops -march=pentium4 -msse2
+WARN = -W -Wall
+
+CFLAGS = $(PYTHONINC) $(NUMPYINC) $(OPTIMS) $(WARN)
+
+c_gden.so: c_gden.o
+ $(LD) -shared -o $@ $< -Wl,-soname,$@
+
+c_gden.o: c_gden.c
+ $(CC) -c $(CFLAGS) -fPIC $<
+
+c_gmm.so: c_gmm.o
+ $(LD) -shared -o $@ $< -Wl,-soname,$@
+
+c_gmm.o: c_gmm.c
+ $(CC) -c $(CFLAGS) -fPIC $<
+
+c_gmm.c: c_gmm.pyx c_numpy.pxd c_python.pxd
+ $(PYREX) $<
+
+clean:
+ rm -f c_gmm.c
+ rm -f *.o
+ rm -f *.so
+ rm -f *.pyc
Added: trunk/Lib/sandbox/pyem/src/c_gden.c
===================================================================
--- trunk/Lib/sandbox/pyem/src/c_gden.c 2006-10-12 13:48:48 UTC (rev 2284)
+++ trunk/Lib/sandbox/pyem/src/c_gden.c 2006-10-12 13:48:58 UTC (rev 2285)
@@ -0,0 +1,37 @@
+#include <stddef.h>
+#include <stdio.h>
+#include <math.h>
+
+/*
+ * use va for caching, so its content is modified
+ */
+int gden_diag(const double *in, const size_t n, const size_t d,
+ const double* mu, double* va, double* out)
+{
+ size_t j, i;
+ double fac = 1.0;
+ double acc, tmp;
+
+ /*
+ * Cache some precomputing
+ */
+ for(j = 0; j < d; ++j) {
+ va[j] = 0.5/va[j];
+ fac *= sqrt(va[j] / (M_PI) );
+ va[j] *= -1.0;
+ }
+
+ /*
+ * actual computing
+ */
+ for(i = 0; i < n; ++i) {
+ acc = 0;
+ for(j = 0; j < d; ++j) {
+ tmp = (in[i *d + j] - mu[j]);
+ acc += tmp * tmp * va[j];
+ }
+ out[i] = exp(acc) * fac;
+ }
+
+ return 0;
+}
Added: trunk/Lib/sandbox/pyem/src/c_gmm.pyx
===================================================================
--- trunk/Lib/sandbox/pyem/src/c_gmm.pyx 2006-10-12 13:48:48 UTC (rev 2284)
+++ trunk/Lib/sandbox/pyem/src/c_gmm.pyx 2006-10-12 13:48:58 UTC (rev 2285)
@@ -0,0 +1,66 @@
+# Last Change: Thu Jul 13 04:00 PM 2006 J
+
+cimport c_numpy
+cimport c_python
+import numpy
+
+c_numpy.import_array()
+
+# pyrex version of _vq. Much faster in high dimension/high number of K
+# (ie K > 3-4)
+def _vq(data, init):
+ (n, d) = data.shape
+ label = numpy.zeros(n, int)
+ _imp_vq(data, init, label)
+
+ return label
+
+def _imp_vq(c_numpy.ndarray data, c_numpy.ndarray init, c_numpy.ndarray label):
+ cdef int n
+ cdef int d
+ cdef int nc
+ cdef int i
+ cdef int j
+ cdef int k
+ cdef int *labeld
+ cdef double *da, *code
+ cdef double dist
+ cdef double acc
+
+ n = data.dimensions[0]
+ d = data.dimensions[1]
+ nc = init.dimensions[0]
+
+ if not data.dtype == numpy.dtype(numpy.float64):
+ print '_vq not (yet) implemented for dtype %s'%dtype.name
+ return
+ da = (<double*>data.data)
+
+ if not init.dtype == numpy.dtype(numpy.float64):
+ print '_vq not (yet) implemented for dtype %s'%dtype.name
+ return
+ code = (<double*>init.data)
+
+ if not label.dtype == numpy.dtype(numpy.int32):
+ print '_vq not (yet) implemented for dtype %s'%dtype.name
+ return
+ labeld = (<int*>label.data)
+
+ for i from 0<=i<n:
+ acc = 0
+ lab = 0
+ for j from 0<=j<d:
+ acc = acc + (da[i * d + j] - code[j]) * \
+ (da[i * d + j] - code[j])
+ dist = acc
+ for k from 1<=k<nc:
+ acc = 0
+ for j from 0<=j<d:
+ acc = acc + (da[i * d + j] - code[k * d + j]) * \
+ (da[i * d + j] - code[k * d + j])
+ if acc < dist:
+ dist = acc
+ lab = k
+ labeld[i] = lab
+
+ return lab
Added: trunk/Lib/sandbox/pyem/src/c_numpy.pxd
===================================================================
--- trunk/Lib/sandbox/pyem/src/c_numpy.pxd 2006-10-12 13:48:48 UTC (rev 2284)
+++ trunk/Lib/sandbox/pyem/src/c_numpy.pxd 2006-10-12 13:48:58 UTC (rev 2285)
@@ -0,0 +1,59 @@
+# :Author: Robert Kern
+# :Copyright: 2004, Enthought, Inc.
+# :License: BSD Style
+
+
+cdef extern from "numpy/arrayobject.h":
+ ctypedef enum PyArray_TYPES:
+ PyArray_BOOL
+ PyArray_BYTE
+ PyArray_UBYTE
+ PyArray_SHORT
+ PyArray_USHORT
+ PyArray_INT
+ PyArray_UINT
+ PyArray_LONG
+ PyArray_ULONG
+ PyArray_LONGLONG
+ PyArray_ULONGLONG
+ PyArray_FLOAT
+ PyArray_DOUBLE
+ PyArray_LONGDOUBLE
+ PyArray_CFLOAT
+ PyArray_CDOUBLE
+ PyArray_CLONGDOUBLE
+ PyArray_OBJECT
+ PyArray_STRING
+ PyArray_UNICODE
+ PyArray_VOID
+ PyArray_NTYPES
+ PyArray_NOTYPE
+
+ ctypedef int intp
+
+ ctypedef extern class numpy.dtype [object PyArray_Descr]:
+ cdef int type_num, elsize, alignment
+ cdef char type, kind, byteorder, hasobject
+ cdef object fields, typeobj
+
+ ctypedef extern class numpy.ndarray [object PyArrayObject]:
+ cdef char *data
+ cdef int nd
+ cdef intp *dimensions
+ cdef intp *strides
+ cdef object base
+ cdef dtype descr
+ cdef int flags
+
+ ndarray PyArray_SimpleNew(int ndims, intp* dims, int item_type)
+ int PyArray_Check(object obj)
+ ndarray PyArray_ContiguousFromObject(object obj, PyArray_TYPES type,
+ int mindim, int maxdim)
+ intp PyArray_SIZE(ndarray arr)
+ void *PyArray_DATA(ndarray arr)
+ ndarray PyArray_FromAny(object obj, dtype newtype, int mindim, int maxdim,
+ int requirements, object context)
+ ndarray PyArray_NewFromDescr(object subtype, dtype newtype, int nd, intp* dims,
+ intp* strides, void* data, int flags, object parent)
+
+ void import_array()
Added: trunk/Lib/sandbox/pyem/src/c_python.pxd
===================================================================
--- trunk/Lib/sandbox/pyem/src/c_python.pxd 2006-10-12 13:48:48 UTC (rev 2284)
+++ trunk/Lib/sandbox/pyem/src/c_python.pxd 2006-10-12 13:48:58 UTC (rev 2285)
@@ -0,0 +1,20 @@
+# -*- Mode: Python -*- Not really, but close enough
+
+# Expose as much of the Python C API as we need here
+
+cdef extern from "stdlib.h":
+ ctypedef int size_t
+
+cdef extern from "Python.h":
+ ctypedef int Py_intptr_t
+ void* PyMem_Malloc(size_t)
+ void* PyMem_Realloc(void *p, size_t n)
+ void PyMem_Free(void *p)
+ char* PyString_AsString(object string)
+ object PyString_FromString(char *v)
+ object PyString_InternFromString(char *v)
+ int PyErr_CheckSignals()
+ object PyFloat_FromDouble(double v)
+ void Py_XINCREF(object o)
+ void Py_XDECREF(object o)
+ void Py_CLEAR(object o) # use instead of decref
Added: trunk/Lib/sandbox/pyem/tests/test_densities.py
===================================================================
--- trunk/Lib/sandbox/pyem/tests/test_densities.py 2006-10-12 13:48:48 UTC (rev 2284)
+++ trunk/Lib/sandbox/pyem/tests/test_densities.py 2006-10-12 13:48:58 UTC (rev 2285)
@@ -0,0 +1,181 @@
+#! /usr/bin/env python
+# Last Change: Fri Sep 29 06:00 PM 2006 J
+
+import sys
+from numpy.testing import *
+
+import numpy as N
+
+set_package_path()
+from pyem.densities import gauss_den
+from pyem._c_densities import gauss_den as c_gauss_den
+
+restore_path()
+
+#Optional:
+set_local_path()
+# import modules that are located in the same directory as this file.
+restore_path()
+
+class test_densities(NumpyTestCase):
+ def _generate_test_data_1d(self):
+ self.va = 2.0
+ self.mu = 1.0
+ self.X = N.linspace(-2, 2, 10)[:, N.newaxis]
+
+ self.Yt = N.array([0.02973257230591, 0.05512079811082, 0.09257745306945,
+ 0.14086453882683,
+ 0.19418015562214, 0.24250166773127, 0.27436665745048, 0.28122547107069,
+ 0.26114678964743, 0.21969564473386])
+
+ def _generate_test_data_2d_diag(self):
+ #============================
+ # Small test in 2d (diagonal)
+ #============================
+ self.mu = N.atleast_2d([-1.0, 2.0])
+ self.va = N.atleast_2d([2.0, 3.0])
+
+ self.X = N.zeros((10, 2))
+ self.X[:,0] = N.linspace(-2, 2, 10)
+ self.X[:,1] = N.linspace(-1, 3, 10)
+
+ self.Yt = N.array([0.01129091565384, 0.02025416837152, 0.03081845516786,
+ 0.03977576221540, 0.04354490552910, 0.04043592581117,
+ 0.03184994053539, 0.02127948225225, 0.01205937178755,
+ 0.00579694938623 ])
+
+
+ def _generate_test_data_2d_full(self):
+ #============================
+ # Small test in 2d (full mat)
+ #============================
+ self.mu = N.array([[0.2, -1.0]])
+ self.va = N.array([[1.2, 0.1], [0.1, 0.5]])
+ X1 = N.linspace(-2, 2, 10)[:, N.newaxis]
+ X2 = N.linspace(-3, 3, 10)[:, N.newaxis]
+ self.X = N.concatenate(([X1, X2]), 1)
+
+ self.Yt = N.array([0.00096157109751, 0.01368908714856,
+ 0.07380823191162, 0.15072050533842,
+ 0.11656739937861, 0.03414436965525,
+ 0.00378789836599, 0.00015915297541,
+ 0.00000253261067, 0.00000001526368])
+
+ def _check_py(self, level, decimal = 12):
+ Y = gauss_den(self.X, self.mu, self.va)
+ assert_array_almost_equal(Y, self.Yt, decimal)
+
+ def _check_c(self, level, decimal = 12):
+ Y = c_gauss_den(self.X, self.mu, self.va)
+ assert_array_almost_equal(Y, self.Yt, decimal)
+
+ def check_py_1d(self, level = 1):
+ self._generate_test_data_1d()
+ self._check_py(level)
+
+ def check_py_2d_diag(self, level = 1):
+ self._generate_test_data_2d_diag()
+ self._check_py(level)
+
+ def check_py_2d_full(self, level = 1):
+ self._generate_test_data_2d_full()
+ self._check_py(level)
+
+ def check_c_1d(self, level = 1):
+ self._generate_test_data_1d()
+ self._check_c(level)
+
+ def check_c_2d_diag(self, level = 1):
+ self._generate_test_data_2d_diag()
+ self._check_c(level)
+
+ def check_c_2d_full(self, level = 1):
+ self._generate_test_data_2d_full()
+ self._check_c(level)
+
+if __name__ == "__main__":
+ NumpyTest().run()
+
+# def generate_test_data(n, d, mode = 'diag', file='test.dat'):
+# """Generate a set of data of dimension d, with n frames,
+# that is input data, mean, var and output of gden, so that
+# other implementations can be tested against"""
+# mu = randn(1, d)
+# if mode == 'diag':
+# va = abs(randn(1, d))
+# elif mode == 'full':
+# va = randn(d, d)
+# va = dot(va, va.transpose())
+#
+# input = randn(n, d)
+# output = gauss_den(input, mu, va)
+#
+# import tables
+# h5file = tables.openFile(file, "w")
+#
+# h5file.createArray(h5file.root, 'input', input)
+# h5file.createArray(h5file.root, 'mu', mu)
+# h5file.createArray(h5file.root, 'va', va)
+# h5file.createArray(h5file.root, 'output', output)
+#
+# h5file.close()
+#
+# def test_gauss_den():
+# """"""
+# # import tables
+# # import numpy as N
+# #
+# # filename = 'dendata.h5'
+#
+# # # # Dimension 1
+# # # d = 1
+# # # mu = 1.0
+# # # va = 2.0
+#
+# # # X = randn(1e3, 1)
+#
+# # # Y = gauss_den(X, mu, va)
+#
+# # # h5file = tables.openFile(filename, "w")
+#
+# # # h5file.createArray(h5file.root, 'X', X)
+# # # h5file.createArray(h5file.root, 'mu', mu)
+# # # h5file.createArray(h5file.root, 'va', va)
+# # # h5file.createArray(h5file.root, 'Y', Y)
+#
+# # # h5file.close()
+#
+# # # # Dimension 2, diag
+# # # d = 2
+# # # mu = N.array([1.0, -2.0])
+# # # va = N.array([1.0, 2.0])
+#
+# # # X = randn(1e3, 2)
+#
+# # # Y = gauss_den(X, mu, va)
+#
+# # # h5file = tables.openFile(filename, "w")
+#
+# # # h5file.createArray(h5file.root, 'X', X)
+# # # h5file.createArray(h5file.root, 'mu', mu)
+# # # h5file.createArray(h5file.root, 'va', va)
+# # # h5file.createArray(h5file.root, 'Y', Y)
+#
+# # # Dimension 2, full
+# # d = 2
+# # mu = N.array([[0.2, -1.0]])
+# # va = N.array([[1.2, 0.1], [0.1, 0.5]])
+#
+# # X = randn(1e3, 2)
+#
+# # Y = gauss_den(X, mu, va)
+#
+# # h5file = tables.openFile(filename, "w")
+#
+# # h5file.createArray(h5file.root, 'X', X)
+# # h5file.createArray(h5file.root, 'mu', mu)
+# # h5file.createArray(h5file.root, 'va', va)
+# # h5file.createArray(h5file.root, 'Y', Y)
+#
+# # h5file.close()
+#
Added: trunk/Lib/sandbox/pyem/tests/test_kmean.py
===================================================================
--- trunk/Lib/sandbox/pyem/tests/test_kmean.py 2006-10-12 13:48:48 UTC (rev 2284)
+++ trunk/Lib/sandbox/pyem/tests/test_kmean.py 2006-10-12 13:48:58 UTC (rev 2285)
@@ -0,0 +1,46 @@
+#! /usr/bin/env python
+# Last Change: Thu Sep 28 01:00 PM 2006 J
+
+import sys
+from numpy.testing import *
+
+import numpy as N
+
+set_package_path()
+from pyem.kmean import kmean
+restore_path()
+
+#Optional:
+set_local_path()
+# import modules that are located in the same directory as this file.
+restore_path()
+
+# Global data
+X = N.array([[3.0, 3], [4, 3], [4, 2],
+ [9, 2], [5, 1], [6, 2], [9, 4],
+ [5, 2], [5, 4], [7, 4], [6, 5]])
+
+codet1 = N.array([[3.0000, 3.0000],
+ [6.2000, 4.0000],
+ [5.8000, 1.8000]])
+
+codet2 = N.array([[11.0/3, 8.0/3],
+ [6.7500, 4.2500],
+ [6.2500, 1.7500]])
+
+class test_kmean(NumpyTestCase):
+ def check_iter1(self, level=1):
+ initc = N.concatenate(([[X[0]], [X[1]], [X[2]]]))
+ code = initc.copy()
+ code1 = kmean(X, code, 1)[0]
+
+ assert_array_almost_equal(code1, codet1)
+ def check_iter2(self, level=1):
+ initc = N.concatenate(([[X[0]], [X[1]], [X[2]]]))
+ code = initc.copy()
+ code2 = kmean(X, code, 2)[0]
+
+ assert_array_almost_equal(code2, codet2)
+
+if __name__ == "__main__":
+ NumpyTest().run()
More information about the Scipy-svn
mailing list