[Scipy-svn] r2266 - trunk/Lib/sandbox/pyem
scipy-svn at scipy.org
scipy-svn at scipy.org
Thu Oct 12 09:45:12 EDT 2006
Author: cdavid
Date: 2006-10-12 08:45:05 -0500 (Thu, 12 Oct 2006)
New Revision: 2266
Added:
trunk/Lib/sandbox/pyem/Makefile
trunk/Lib/sandbox/pyem/c_gmm.pyx
trunk/Lib/sandbox/pyem/c_numpy.pxd
trunk/Lib/sandbox/pyem/c_python.pxd
trunk/Lib/sandbox/pyem/densities.py
trunk/Lib/sandbox/pyem/gmm_em.py
Log:
[pyem @ david at ar.media.kyoto-u.ac.jp-20060608085958-db646b48140fb363]
Initial commit
David Cournapeau <david at ar.media.kyoto-u.ac.jp> | 2006-06-08 17:59:58 +0900 (Thu, 08 Jun 2006)
Added: trunk/Lib/sandbox/pyem/Makefile
===================================================================
--- trunk/Lib/sandbox/pyem/Makefile 2006-10-12 13:42:53 UTC (rev 2265)
+++ trunk/Lib/sandbox/pyem/Makefile 2006-10-12 13:45:05 UTC (rev 2266)
@@ -0,0 +1,26 @@
+CC = colorgcc
+LD = gcc
+
+PYREX = python2.4-pyrexc
+
+PYTHONINC = -I/usr/include/python2.4
+NUMPYINC = -I/usr/lib/python2.4/site-packages/numpy/core/include
+OPTIMS = -O3 -ffast-math -march=pentium4
+WARN = -W -Wall
+
+CFLAGS = $(PYTHONINC) $(NUMPYINC) $(OPTIMS) $(WARN)
+
+c_gmm.so: c_gmm.o
+ $(LD) -shared -o $@ $< -Wl,-soname,$@
+
+c_gmm.o: c_gmm.c
+ $(CC) -c $(CFLAGS) -fPIC $<
+
+c_gmm.c: c_gmm.pyx c_numpy.pxd c_python.pxd
+ $(PYREX) $<
+
+clean:
+ rm -f c_gmm.c
+ rm -f *.o
+ rm -f *.so
+ rm -f *.pyc
Added: trunk/Lib/sandbox/pyem/c_gmm.pyx
===================================================================
--- trunk/Lib/sandbox/pyem/c_gmm.pyx 2006-10-12 13:42:53 UTC (rev 2265)
+++ trunk/Lib/sandbox/pyem/c_gmm.pyx 2006-10-12 13:45:05 UTC (rev 2266)
@@ -0,0 +1,66 @@
+# Last Change: Tue May 16 11:00 AM 2006 J
+
+cimport c_numpy
+cimport c_python
+import numpy
+
+c_numpy.import_array()
+
+# pyrex version of _vq. Much faster in high dimension/high number of K
+# (ie more than 3-4)
+def _vq(data, init):
+ (n, d) = data.shape
+ label = numpy.zeros(n, int)
+ _imp_vq(data, init, label)
+
+ return label
+
+def _imp_vq(c_numpy.ndarray data, c_numpy.ndarray init, c_numpy.ndarray label):
+ cdef int n
+ cdef int d
+ cdef int nc
+ cdef int i
+ cdef int j
+ cdef int k
+ cdef int *labeld
+ cdef double *da, *code
+ cdef double dist
+ cdef double acc
+
+ n = data.dimensions[0]
+ d = data.dimensions[1]
+ nc = init.dimensions[0]
+
+ if not data.dtype == numpy.dtype(numpy.float64):
+ print '_vq not (yet) implemented for dtype %s'%dtype.name
+ return
+ da = (<double*>data.data)
+
+ if not init.dtype == numpy.dtype(numpy.float64):
+ print '_vq not (yet) implemented for dtype %s'%dtype.name
+ return
+ code = (<double*>init.data)
+
+ if not label.dtype == numpy.dtype(numpy.int32):
+ print '_vq not (yet) implemented for dtype %s'%dtype.name
+ return
+ labeld = (<int*>label.data)
+
+ for i from 0<=i<n:
+ acc = 0
+ lab = 0
+ for j from 0<=j<d:
+ acc = acc + (da[i * d + j] - code[j]) * \
+ (da[i * d + j] - code[j])
+ dist = acc
+ for k from 1<=k<nc:
+ acc = 0
+ for j from 0<=j<d:
+ acc = acc + (da[i * d + j] - code[k * d + j]) * \
+ (da[i * d + j] - code[k * d + j])
+ if acc < dist:
+ dist = acc
+ lab = k
+ labeld[i] = lab
+
+ return lab
Added: trunk/Lib/sandbox/pyem/c_numpy.pxd
===================================================================
--- trunk/Lib/sandbox/pyem/c_numpy.pxd 2006-10-12 13:42:53 UTC (rev 2265)
+++ trunk/Lib/sandbox/pyem/c_numpy.pxd 2006-10-12 13:45:05 UTC (rev 2266)
@@ -0,0 +1,59 @@
+# :Author: Robert Kern
+# :Copyright: 2004, Enthought, Inc.
+# :License: BSD Style
+
+
+cdef extern from "numpy/arrayobject.h":
+ ctypedef enum PyArray_TYPES:
+ PyArray_BOOL
+ PyArray_BYTE
+ PyArray_UBYTE
+ PyArray_SHORT
+ PyArray_USHORT
+ PyArray_INT
+ PyArray_UINT
+ PyArray_LONG
+ PyArray_ULONG
+ PyArray_LONGLONG
+ PyArray_ULONGLONG
+ PyArray_FLOAT
+ PyArray_DOUBLE
+ PyArray_LONGDOUBLE
+ PyArray_CFLOAT
+ PyArray_CDOUBLE
+ PyArray_CLONGDOUBLE
+ PyArray_OBJECT
+ PyArray_STRING
+ PyArray_UNICODE
+ PyArray_VOID
+ PyArray_NTYPES
+ PyArray_NOTYPE
+
+ ctypedef int intp
+
+ ctypedef extern class numpy.dtype [object PyArray_Descr]:
+ cdef int type_num, elsize, alignment
+ cdef char type, kind, byteorder, hasobject
+ cdef object fields, typeobj
+
+ ctypedef extern class numpy.ndarray [object PyArrayObject]:
+ cdef char *data
+ cdef int nd
+ cdef intp *dimensions
+ cdef intp *strides
+ cdef object base
+ cdef dtype descr
+ cdef int flags
+
+ ndarray PyArray_SimpleNew(int ndims, intp* dims, int item_type)
+ int PyArray_Check(object obj)
+ ndarray PyArray_ContiguousFromObject(object obj, PyArray_TYPES type,
+ int mindim, int maxdim)
+ intp PyArray_SIZE(ndarray arr)
+ void *PyArray_DATA(ndarray arr)
+ ndarray PyArray_FromAny(object obj, dtype newtype, int mindim, int maxdim,
+ int requirements, object context)
+ ndarray PyArray_NewFromDescr(object subtype, dtype newtype, int nd, intp* dims,
+ intp* strides, void* data, int flags, object parent)
+
+ void import_array()
Added: trunk/Lib/sandbox/pyem/c_python.pxd
===================================================================
--- trunk/Lib/sandbox/pyem/c_python.pxd 2006-10-12 13:42:53 UTC (rev 2265)
+++ trunk/Lib/sandbox/pyem/c_python.pxd 2006-10-12 13:45:05 UTC (rev 2266)
@@ -0,0 +1,20 @@
+# -*- Mode: Python -*- Not really, but close enough
+
+# Expose as much of the Python C API as we need here
+
+cdef extern from "stdlib.h":
+ ctypedef int size_t
+
+cdef extern from "Python.h":
+ ctypedef int Py_intptr_t
+ void* PyMem_Malloc(size_t)
+ void* PyMem_Realloc(void *p, size_t n)
+ void PyMem_Free(void *p)
+ char* PyString_AsString(object string)
+ object PyString_FromString(char *v)
+ object PyString_InternFromString(char *v)
+ int PyErr_CheckSignals()
+ object PyFloat_FromDouble(double v)
+ void Py_XINCREF(object o)
+ void Py_XDECREF(object o)
+ void Py_CLEAR(object o) # use instead of decref
Added: trunk/Lib/sandbox/pyem/densities.py
===================================================================
--- trunk/Lib/sandbox/pyem/densities.py 2006-10-12 13:42:53 UTC (rev 2265)
+++ trunk/Lib/sandbox/pyem/densities.py 2006-10-12 13:45:05 UTC (rev 2266)
@@ -0,0 +1,375 @@
+#! /usr/bin/python
+#
+# Copyrighted David Cournapeau
+# Last Change: Mon May 29 01:00 PM 2006 J
+
+import numpy as N
+import numpy.linalg as lin
+
+# Error classes
+class DenError(Exception):
+ """Base class for exceptions in this module.
+
+ Attributes:
+ expression -- input expression in which the error occurred
+ message -- explanation of the error"""
+ def __init__(self, message):
+ self.message = message
+
+ def __str__(self):
+ return self.message
+
+# The following function do all the fancy stuff to check that parameters
+# are Ok, and call the right implementation if args are OK.
+def gauss_den(x, mu, va, log = False):
+ """ Compute multivariate Gaussian density at points x for
+ mean mu and variance va.
+
+ Vector are row vectors, except va which can be a matrix
+ (row vector variance for diagonal variance)
+
+ If log is True, than the log density is returned
+ (useful for underflow ?)"""
+ mu = N.atleast_2d(mu)
+ va = N.atleast_2d(va)
+ x = N.atleast_2d(x)
+
+ #=======================#
+ # Checking parameters #
+ #=======================#
+ if len(N.shape(mu)) != 2:
+ raise DenError("mu is not rank 2")
+
+ if len(N.shape(va)) != 2:
+ raise DenError("va is not rank 2")
+
+ if len(N.shape(x)) != 2:
+ raise DenError("x is not rank 2")
+
+ (n, d) = x.shape
+ (dm0, dm1) = mu.shape
+ (dv0, dv1) = va.shape
+
+ # Check x and mu same dimension
+ if dm0 != 1:
+ msg = "mean must be a row vector!"
+ raise DenError(msg)
+ if dm1 != d:
+ msg = "x and mu not same dim"
+ raise DenError(msg)
+ # Check va and mu same size
+ if dv1 != d:
+ msg = "mu and va not same dim"
+ raise DenError(msg)
+ if dv0 != 1 and dv0 != d:
+ msg = "va not square"
+ raise DenError(msg)
+
+ #===============#
+ # Computation #
+ #===============#
+ data = x - mu
+
+ if d == 1:
+ # scalar case
+ return _scalar_gauss_den(data[:, 0], mu[0, 0], va[0, 0], log)
+ elif dv0 == 1:
+ # Diagonal matrix case
+ return _diag_gauss_den(data, mu, va, log)
+ elif dv1 == dv0:
+ # full case
+ return _full_gauss_den(data, mu, va, log)
+ else:
+ raise DenError("variance mode not recognized, this is a bug")
+
+# Those 3 functions do almost all the actual computation
+def _scalar_gauss_den(x, mu, va, log):
+ """ This function is the actual implementation
+ of gaussian pdf in scalar case. It assumes all args
+ are conformant, so it should not be used directly
+
+ ** Expect centered data (ie with mean removed) **
+
+ Call gauss_den instead"""
+ d = mu.size
+ inva = 1/va
+ fac = (2*N.pi) ** (-d/2.0) * N.sqrt(inva)
+ y = (x ** 2) * -0.5 * inva
+ if not log:
+ y = fac * N.exp(y)
+ else:
+ y = y + log(fac)
+
+ return y
+
+def _diag_gauss_den(x, mu, va, log):
+ """ This function is the actual implementation
+ of gaussian pdf in scalar case. It assumes all args
+ are conformant, so it should not be used directly
+
+ ** Expect centered data (ie with mean removed) **
+
+ Call gauss_den instead"""
+ # Diagonal matrix case
+ d = mu.size
+ y = _scalar_gauss_den(x[:,0], mu[0,0], va[0,0], log)
+ if not log:
+ for i in range(1, d):
+ y *= _scalar_gauss_den(x[:,i], mu[0,i], va[0,i], log)
+ else:
+ for i in range(1, d):
+ y += _scalar_gauss_den(x[:,i], mu[0,i], va[0,i], log)
+ return y
+
+def _full_gauss_den(x, mu, va, log):
+ """ This function is the actual implementation
+ of gaussian pdf in full matrix case.
+
+ It assumes all args are conformant, so it should
+ not be used directly Call gauss_den instead
+
+ ** Expect centered data (ie with mean removed) **
+
+ Does not check if va is definite positive (on inversible
+ for that matter), so the inverse computation and/or determinant
+ would throw an exception."""
+ d = mu.size
+ inva = lin.inv(va)
+ fac = 1 / N.sqrt( (2*N.pi) ** d * N.fabs(lin.det(va)))
+
+ # # Slow version
+ # n = N.size(x, 0)
+ # y = N.zeros(n, float)
+ # for i in range(n):
+ # y[i] = N.matrixmultiply(x[i,:],
+ # N.matrixmultiply(inva, N.transpose(x[i,:])))
+ # y *= -0.5
+
+ # we are using a trick with sum to "emulate"
+ # the matrix multiplication inva * x without any explicit loop
+ y = N.matrixmultiply(x, inva)
+ y = -0.5 * N.sum(y * x, 1)
+
+ if not log:
+ y = fac * N.exp(y)
+ else:
+ y = y + N.log(fac)
+
+ return y
+
+# To plot a confidence ellipse from multi-variate gaussian pdf
+def gauss_ell(mu, va, dim = [0, 1], npoints = 100):
+ """ Given a mean and covariance for multi-variate
+ gaussian, returns npoints points for the ellipse
+ of confidence 0.39
+
+ Returns the coordinate x and y of the ellipse"""
+
+ # TODO: Get a confidence interval using the Chi2 distribution
+ # of points at a given mahalanobis distance...
+ mu = N.atleast_1d(mu)
+ va = N.atleast_1d(va)
+ c = N.array(dim)
+
+ if mu.size == va.size:
+ mode = 'diag'
+ else:
+ if va.ndim == 2:
+ if va.shape[0] == va.shape[1]:
+ mode = 'full'
+ else:
+ raise DenError("variance not square")
+ else:
+ raise DenError("mean and variance are not dim conformant")
+
+ level = 0.39
+
+ # Generates a circle of npoints
+ theta = N.linspace(0, 2 * N.pi, npoints)
+ circle = N.array([N.cos(theta), N.sin(theta)])
+
+ # Get the dimension which we are interested in:
+ mu = mu[dim]
+ if mode == 'diag':
+ va = va[dim]
+ elps = N.outerproduct(mu, N.ones(npoints, float))
+ elps += N.matrixmultiply(N.diag(N.sqrt(va)), circle)
+ elif mode == 'full':
+ va = va[c,:][:,c]
+ # Method: compute the cholesky decomp of each cov matrix, that is
+ # compute cova such as va = cova * cova'
+ # WARN: scipy is different than matlab here, as scipy computes a lower
+ # triangular cholesky decomp:
+ # - va = cova * cova' (scipy)
+ # - va = cova' * cova (matlab)
+ # So take care when comparing results with matlab !
+ cova = lin.cholesky(va)
+ elps = N.outerproduct(mu, N.ones(npoints, float))
+ elps += N.matrixmultiply(cova, circle)
+ else:
+ raise DenParam("var mode not recognized")
+
+ return elps[0, :], elps[1, :]
+
+def test_gauss_den():
+ """"""
+ # import tables
+ # import numpy as N
+ #
+ # filename = 'dendata.h5'
+
+ # # # Dimension 1
+ # # d = 1
+ # # mu = 1.0
+ # # va = 2.0
+
+ # # X = N.randn(1e3, 1)
+
+ # # Y = gauss_den(X, mu, va)
+
+ # # h5file = tables.openFile(filename, "w")
+
+ # # h5file.createArray(h5file.root, 'X', X)
+ # # h5file.createArray(h5file.root, 'mu', mu)
+ # # h5file.createArray(h5file.root, 'va', va)
+ # # h5file.createArray(h5file.root, 'Y', Y)
+
+ # # h5file.close()
+
+ # # # Dimension 2, diag
+ # # d = 2
+ # # mu = N.array([1.0, -2.0])
+ # # va = N.array([1.0, 2.0])
+
+ # # X = N.randn(1e3, 2)
+
+ # # Y = gauss_den(X, mu, va)
+
+ # # h5file = tables.openFile(filename, "w")
+
+ # # h5file.createArray(h5file.root, 'X', X)
+ # # h5file.createArray(h5file.root, 'mu', mu)
+ # # h5file.createArray(h5file.root, 'va', va)
+ # # h5file.createArray(h5file.root, 'Y', Y)
+
+ # # Dimension 2, full
+ # d = 2
+ # mu = N.array([[0.2, -1.0]])
+ # va = N.array([[1.2, 0.1], [0.1, 0.5]])
+
+ # X = N.randn(1e3, 2)
+
+ # Y = gauss_den(X, mu, va)
+
+ # h5file = tables.openFile(filename, "w")
+
+ # h5file.createArray(h5file.root, 'X', X)
+ # h5file.createArray(h5file.root, 'mu', mu)
+ # h5file.createArray(h5file.root, 'va', va)
+ # h5file.createArray(h5file.root, 'Y', Y)
+
+ # h5file.close()
+
+ import numpy.testing as testing
+ #=================
+ # Small test in 1d
+ #=================
+ va = 2.0
+ mu = 1.0
+ X = N.linspace(-2, 2, 10)[:, N.NewAxis]
+
+ Yt = N.array([0.02973257230591, 0.05512079811082, 0.09257745306945,
+ 0.14086453882683,
+ 0.19418015562214, 0.24250166773127, 0.27436665745048, 0.28122547107069,
+ 0.26114678964743, 0.21969564473386])
+
+ Y = gauss_den(X, mu, va)
+ try:
+ testing.assert_array_almost_equal(Y, Yt)
+ print "1d test succeded"
+ except AssertionError:
+ print "test fails in 1d"
+
+ #============================
+ # Small test in 2d (diagonal)
+ #============================
+ mu = N.atleast_2d([-1.0, 2.0])
+ va = N.atleast_2d([2.0, 3.0])
+ X1 = N.linspace(-2, 2, 10)[:, N.NewAxis]
+ X2 = N.linspace(-1, 3, 10)[:, N.NewAxis]
+ X = N.concatenate(([X1, X2]), 1)
+
+ Yt = N.array([0.01129091565384, 0.02025416837152, 0.03081845516786,
+ 0.03977576221540, 0.04354490552910, 0.04043592581117,
+ 0.03184994053539, 0.02127948225225, 0.01205937178755,
+ 0.00579694938623 ])
+
+ Y = gauss_den(X, mu, va)
+ try:
+ testing.assert_array_almost_equal(Y, Yt)
+ print "2d diag test succeded"
+ except AssertionError:
+ print "test fails in 2d diag"
+
+ #============================
+ # Small test in 2d (full mat)
+ #============================
+ mu = N.array([[0.2, -1.0]])
+ va = N.array([[1.2, 0.1], [0.1, 0.5]])
+ X1 = N.linspace(-2, 2, 10)[:, N.NewAxis]
+ X2 = N.linspace(-3, 3, 10)[:, N.NewAxis]
+ X = N.concatenate(([X1, X2]), 1)
+
+ Yt = N.array([0.00096157109751, 0.01368908714856,
+ 0.07380823191162, 0.15072050533842,
+ 0.11656739937861, 0.03414436965525,
+ 0.00378789836599, 0.00015915297541,
+ 0.00000253261067, 0.00000001526368])
+
+ Y = gauss_den(X, mu, va)
+ try:
+ testing.assert_array_almost_equal(Y, Yt)
+ print "2d full test succeded"
+ except AssertionError:
+ print "test fails in 2d full"
+
+
+if __name__ == "__main__":
+ import pylab
+
+ #=========================================
+ # Test plotting a simple diag 2d variance:
+ #=========================================
+ va = N.array([5, 3])
+ mu = N.array([2, 3])
+
+ # Generate a multivariate gaussian of mean mu and covariance va
+ X = N.randn(1e3, 2)
+ Yc = N.matrixmultiply(N.diag(N.sqrt(va)), X.transpose())
+ Yc = Yc.transpose() + mu
+
+ # Plotting
+ Xe, Ye = gauss_ell(mu, va, npoints = 100)
+ pylab.figure()
+ pylab.plot(Yc[:, 0], Yc[:, 1], '.')
+ pylab.plot(Xe, Ye, 'r')
+
+ #=========================================
+ # Test plotting a simple full 2d variance:
+ #=========================================
+ va = N.array([[0.2, 0.1],[0.1, 0.5]])
+ mu = N.array([0, 3])
+
+ # Generate a multivariate gaussian of mean mu and covariance va
+ X = N.randn(1e3, 2)
+ Yc = N.matrixmultiply(lin.cholesky(va), X.transpose())
+ Yc = Yc.transpose() + mu
+
+ # Plotting
+ Xe, Ye = gauss_ell(mu, va, npoints = 100)
+ pylab.figure()
+ pylab.plot(Yc[:, 0], Yc[:, 1], '.')
+ pylab.plot(Xe, Ye, 'r')
+ pylab.show()
+
+ savefig('example.png')
Added: trunk/Lib/sandbox/pyem/gmm_em.py
===================================================================
--- trunk/Lib/sandbox/pyem/gmm_em.py 2006-10-12 13:42:53 UTC (rev 2265)
+++ trunk/Lib/sandbox/pyem/gmm_em.py 2006-10-12 13:45:05 UTC (rev 2266)
@@ -0,0 +1,734 @@
+# /usr/bin/python
+# Last Change: Thu May 25 03:00 PM 2006 J
+
+# TODO:
+# - interface with libem
+# - implements E and M in batch mode for mixtures, with special case for gmm
+# ( general EM should be outside this module. This one should be GMM specific, maybe)
+# - implements E and M in online mode
+# - python has list. Does it make sense to keep the h2m format for mean, va, when
+# we can use list instead ?
+# - a Gaussian Mixture Model should be a class, really !
+
+import numpy as N
+import numpy.linalg as lin
+import densities
+
+MAX_DEV = 1e-10
+
+# Error classes
+class GmmError(Exception):
+ """Base class for exceptions in this module."""
+ pass
+
+class GmmParamError(GmmError):
+ """Exception raised for errors in gmm params
+
+ Attributes:
+ expression -- input expression in which the error occurred
+ message -- explanation of the error
+ """
+ def __init__(self, message):
+ self.message = message
+
+ def __str__(self):
+ return self.message
+
+
+# Function to generate a GMM, or valid parameters for GMM
+def gen_rand_index(p, n):
+ """Generate a N samples vector containing random index between 1
+ and length(p), each index i with probability p(i)"""
+ # TODO Check args here
+
+ # TODO: check each value of inverse distribution is
+ # different
+ invcdf = N.cumsum(p)
+ uni = N.rand(n)
+ index = N.zeros(n)
+
+ # This one should be a bit faster
+ for k in range(len(p)-1, 0, -1):
+ blop = N.where(N.logical_and(invcdf[k-1] <= uni,
+ uni < invcdf[k]))
+ index[blop] = k
+
+ return index
+
+def gen_gmm(w, mu, va, n):
+ """Generate a gaussiam mixture model with weights w,
+ mean mu and variances va. Each column of the parameters
+ are one component parameter.
+ """
+ # Check args
+ K, d, mode = check_gmm_param(w, mu, va)
+
+ # Generate the mixture
+ S = gen_rand_index(w, n) # State index (ie hidden var)
+ X = N.randn(n, d) # standard gaussian
+
+ if mode == 'diag':
+ X = mu[S, :] + X * N.sqrt(va[S,:])
+ elif mode == 'full':
+ # Naive implementation
+ # cho = N.zeros(va.shape, float)
+ # for k in range(K):
+ # # Using cholesky is more stable than sqrtm; sqrtm is not
+ # # available in numpy anyway, only in scipy...
+ # cho[k*d:k*d+d,:] = lin.cholesky(va[k*d:k*d+d,:])
+ # for i in range(n):
+ # X[i,:] = mu[S[i],:] + N.matrixmultiply(cho[S[i]*d:S[i]*d+d,:], X[i,:])
+
+ # Faster:
+ cho = N.zeros((K, va.shape[1], va.shape[1]), float)
+ for k in range(K):
+ # Using cholesky is more stable than sqrtm; sqrtm is not
+ # available in numpy anyway, only in scipy...
+ cho[k] = lin.cholesky(va[k*d:k*d+d,:])
+
+ for s in range(K):
+ tmpind = N.where(S == s)[0]
+ X[tmpind] = N.matrixmultiply(X[tmpind], cho[s].transpose()) + mu[s]
+ else:
+ raise GmmParamError('cov matrix mode not recognized')
+
+ return X
+
+def gen_gmm_param(d, K, varmode = 'diag', spread = 1):
+ """Generate valid parameters for a gaussian mixture model.
+ d is the dimension, K the number of components, and varmode
+ the mode for cov matrices.
+
+ Returns:
+ - w
+ - mu
+ - va
+ """
+ w = abs(N.randn(K))
+ w = w / sum(w)
+
+ mu = spread * N.randn(K, d)
+ if varmode == 'diag':
+ va = abs(N.randn(K, d))
+ elif varmode == 'full':
+ va = N.randn(K * d, d)
+ for k in range(K):
+ va[k*d:k*d+d] = N.matrixmultiply( va[k*d:k*d+d],
+ va[k*d:k*d+d].transpose())
+ else:
+ raise GmmParamError('cov matrix mode not recognized')
+
+ return w, mu, va
+
+def check_gmm_param(w, mu, va):
+ """Check that w, mu and va are valid parameters for
+ a mixture of gaussian: w should sum to 1, there should
+ be the same number of component in each param, the variances
+ should be positive definite, etc...
+
+ Params:
+ w = vector or list of weigths of the mixture (K elements)
+ mu = matrix: K * d
+ va = list of variances (vector K * d or square matrices Kd * d)
+
+ returns:
+ K = number of components
+ d = dimension
+ mode = 'diag' if diagonal covariance, 'full' of full matrices
+ """
+
+ # Check that w is valid
+ if N.fabs(N.sum(w) - 1) > MAX_DEV:
+ raise GmmParamError('weight does not sum to 1')
+
+ if not len(w.shape) == 1:
+ raise GmmParamError('weight is not a vector')
+
+ # Check that mean and va have the same number of components
+ K = len(w)
+
+ if N.ndim(mu) < 2:
+ msg = "mu should be a K,d matrix, and a row vector if only 1 comp"
+ raise GmmParamError(msg)
+ if N.ndim(va) < 2:
+ msg = """va should be a K,d / K *d, d matrix, and a row vector if
+ only 1 diag comp"""
+ raise GmmParamError(msg)
+
+ (Km, d) = mu.shape
+ (Ka, da) = va.shape
+
+ if not K == Km:
+ msg = "not same number of component in mean and weights"
+ raise GmmParamError(msg)
+
+ if not d == da:
+ msg = "not same number of dimensions in mean and variances"
+ raise GmmParamError(msg)
+
+ if Km == Ka:
+ mode = 'diag'
+ else:
+ mode = 'full'
+ if not Ka == Km*d:
+ msg = "not same number of dimensions in mean and variances"
+ raise GmmParamError(msg)
+
+ return K, d, mode
+
+# For EM on GMM
+def multiple_gauss_den(data, mu, va):
+ """Helper function to generate several Gaussian
+ pdf (different parameters) from the same data"""
+ mu = N.atleast_2d(mu)
+ va = N.atleast_2d(va)
+
+ K = mu.shape[0]
+ n = data.shape[0]
+ d = data.shape[1]
+
+ y = N.zeros((n, K), float)
+ if mu.size == va.size:
+ for i in range(K):
+ y[:, i] = densities.gauss_den(data, mu[i, :], va[i, :])
+ else:
+ for i in range(K):
+ y[:, i] = densities.gauss_den(data, mu[i, :],
+ va[d*i:d*i+d, :])
+
+ return y
+
+def gmm_init_kmean(data, k, mode, init = [], niter = 10):
+ """gmm_init_kmean(data, k, mode, init = [], niter = 10)
+
+ Init EM for GMM with kmean from data, for k components.
+
+ Args:
+ - data: Each row of data is one frame of dimension d.
+ - k: Number of components to look for
+ - mode: Diagonal or Full covariance matrices
+ - init: The initial centroids. The value given for k is
+ ignored, and the number of row in initc is used instead.
+ If initc is not given, then the centroids are initialized
+ with the k first values of data.
+ - niter: Number of iterations in kmean.
+
+ Returns:
+ (w, mu, va), initial parameters for a GMM.
+
+ Method:
+ Each weight is equiprobable, each mean is one centroid returned by kmean, and
+ covariances for component i is initialized with covariance of
+ data corresponding with label i. Other strategies are possible, this one
+ is an easy one"""
+ if len(init) == 0:
+ init = data[0:k, :]
+ else:
+ k = initc.shape[0]
+
+ if data.ndim == 1:
+ d = 1
+ else:
+ d = N.shape(data)[1]
+
+ (code, label) = kmean(data, init, niter)
+
+ w = N.ones(k, float) / k
+ mu = code.copy()
+ if mode == 'diag':
+ va = N.zeros((k, d), float)
+ for i in range(k):
+ for j in range(d):
+ va[i,j] = N.cov(data[N.where(label==i), j], rowvar = 0)
+ elif mode == 'full':
+ va = N.zeros((k*d, d), float)
+ for i in range(k):
+ va[i*d:i*d+d,:] = N.cov(data[N.where(label==i)], rowvar = 0)
+ else:
+ raise GmmParamError("mode " + str(mode) + " not recognized")
+
+ return w, mu, va
+
+def gmm_posterior(data, w, mu, va):
+ """ Computes the latent variable distribution (a
+ posteriori probability) knowing the explicit data
+ for the Gaussian model (w, mu, var): gamma(t, i) =
+ P[state = i | observation = data(t); w, mu, va]
+
+ This is basically the E step of EM for GMM.
+
+ the second returned value is the non normalized version
+ of gamma, and may be needed for some computation,
+ like eg likelihood"""
+ n = data.shape[0]
+ K = len(w)
+
+ # compute the gaussian pdf
+ tgd = multiple_gauss_den(data, mu, va)
+ # multiply by the weight
+ tgd *= w
+ # Normalize to get a pdf
+ gd = tgd / N.sum(tgd, axis=1)[:, N.NewAxis]
+
+ return gd, tgd
+
+# This function is just calling gmm_update and gmm_posterior, with
+# initialization. This is ugly, and we should have a class to model a GMM
+# instead of all this code to try to guess correct values and parameters...
+def gmm_em(data, niter = 10, k = 2, mode = 'diag', w = [], mu = [], va = []):
+ """
+ gmm_em(data, niter = 10, k = 2, mode = 'diag', w = [], mu = [], va = []):
+
+ Compute the parameters of a Gaussian Mixture Model using EM algorithm,
+ with initial values w, mu and va (overwritten by the function).
+
+ Args:
+ - data: contains the observed features, one row is one frame, ie one
+ observation of dimension d
+ - niter: number of iterations
+ - mode: 'diag' or 'full', depending on the wanted model for cov
+ matrices.
+ - K: number of components
+ - w, mu, va initial parameters for the GMM. All or none must be given.
+ If no initial values are given, initialized by gmm_init_kmean; if they
+ are given, mode and k are ignored, and guessed from the given parameters
+ instead.
+
+ Returns:
+ w, mu, va, like as found by EM, where like is the likelihood for each
+ iteration.
+ """
+ if len(w) == 0:
+ w, mu, va = gmm_init_kmean(data, k, mode, 5)
+ else:
+ k, d, mode = check_gmm_parm(w, mu, va)
+
+ like = N.zeros(niter, float)
+ for i in range(niter):
+ g, tgd = gmm_posterior(data, w, mu, va)
+ like[i] = N.sum(N.log(N.sum(tgd, 1)))
+ w, mu, va = gmm_update(X, g, d, k, mode)
+
+ return w, mu, va, like
+
+def gmm_update(data, gamma, d, K, varmode):
+ """Computes update of the Gaussian Mixture Model (M step)
+ from the a posteriori pdf, computed by gmm_posterior
+ (E step).
+ """
+ n = data.shape[0]
+ invn = 1.0/n
+ mGamma = N.sum(gamma)
+
+ if varmode == 'diag':
+ mu = N.zeros((K, d), float)
+ va = N.zeros((K, d), float)
+ for k in range(K):
+ x = N.sum(N.outerproduct(gamma[:, k],
+ N.ones((1, d))) * data)
+ xx = N.sum(N.outerproduct(gamma[:, k],
+ N.ones((1, d))) * (data ** 2))
+
+ mu[k,:] = x / mGamma[k]
+ va[k,:] = xx / mGamma[k] - mu[k,:] ** 2
+ w = invn * mGamma
+
+ elif varmode == 'full':
+ mu = N.zeros((K, d), float)
+ va = N.zeros((K*d, d), float)
+
+ # This is really slow because of the loop on each frame
+ # ways to improve that: either use 3d matrices and see
+ # the memory usage increasing in n*d*d,
+ # using C (or pyrex, or ...) or using the trick commented
+ # which loops on the dimensions instead. If the number of dimensions
+ # is high, this won't help, though...
+ for k in range(K):
+ x = N.sum(N.outerproduct(gamma[:, k],
+ N.ones((1, d), float)) * data)
+ xx = N.zeros((d, d), float)
+ # for i in range(n):
+ # xx += gamma[i, k] * N.outerproduct(data[i,:],
+ # data[i,:])
+
+ # This should be much faster
+ for i in range(d):
+ for j in range(d):
+ xx[i,j] = N.sum(data[:,i] * data[:,j] * gamma[:,k])
+
+ mu[k,:] = x / mGamma[k]
+ va[k*d:k*d+d,:] = xx / mGamma[k] - \
+ N.outerproduct(mu[k,:], mu[k,:])
+ w = invn * mGamma
+ else:
+ raise GmmParamError("varmode not recognized")
+
+ return w, mu, va
+
+# Misc functions
+def gmm_ellipses(mu, va, c = [0, 1], npoints = 100):
+ """Returns a list of ellipses describing the Gmm
+ defined by mu and va. c is the dimension we are projecting
+ the variances on a 2d space.
+
+ Returns:
+ -Xe: a list of x coordinates for the ellipses
+ -Ye: a list of y coordinates for the ellipses
+
+ Example:
+ Suppose we have w, mu and va as parameters for a mixture, then:
+
+ X = gen_gmm(w, mu, va, 1000)
+ Xe, Ye = gmm_ellipses(mu, va)
+ pylab.plot(X[:,0], X[:, 1], '.')
+ for k in len(w):
+ pylab.plot(Xe[k], Ye[k], 'r')
+
+ Will plot samples X draw from the mixture model, and
+ plot the ellipses of equi-probability from the mean with
+ fixed level of confidence 0.39.
+
+ TODO: be able to modify the confidence interval to arbitrary
+ value (to do in gauss_ell)"""
+ K = mu.shape[0]
+ w = N.ones(K, float) / K
+
+ K, d, mode = check_gmm_param(w, mu, va)
+
+ # TODO: adjustable level (to do in gauss_ell).
+ # For now, a level of 0.39 means that we draw
+ # ellipses for 1 standard deviation.
+ Xe = []
+ Ye = []
+ if mode == 'diag':
+ for i in range(K):
+ xe, ye = densities.gauss_ell(mu[i,:], va[i,:], dim = c,
+ npoints = npoints)
+ Xe.append(xe)
+ Ye.append(ye)
+ elif mode == 'full':
+ for i in range(K):
+ xe, ye = densities.gauss_ell(mu[i,:], va[i*d:i*d+d,:], dim = c,
+ npoints = npoints)
+ Xe.append(xe)
+ Ye.append(ye)
+
+ return Xe, Ye
+
+import c_gmm
+def kmean(data, init, iter = 10):
+ """Simple kmean implementation for EM
+
+ returns a tuple (code, label), where code are the final
+ centroids, and label are the class label indec for each
+ frame (ie row) of data"""
+
+ data = N.atleast_2d(data)
+ init = N.atleast_2d(init)
+
+ (n, d) = data.shape
+ (k, d1) = init.shape
+
+ if not d == d1:
+ msg = "data and init centers do not have same dimensions..."
+ raise GmmParamError(msg)
+
+ code = N.asarray(init.copy(), float)
+ for i in range(iter):
+ # Compute the nearest neighbour for each obs
+ # using the current code book
+ label = c_gmm._vq(data, code)
+ # Update the code by computing centroids using the new code book
+ for j in range(k):
+ code[j,:] = N.mean(data[N.where(label==j)])
+
+ return code, label
+
+def _vq(data, code):
+ """ Please do not use directly. Use kmean instead"""
+ # No attempt to be efficient has been made...
+ (n, d) = data.shape
+ (k, d) = code.shape
+
+ label = N.zeros(n, int)
+ for i in range(n):
+ d = N.sum((data[i, :] - code) ** 2, 1)
+ label[i] = N.argmin(d)
+
+ return label
+
+# Test functions usable for now
+def test_kmean():
+ X = N.array([[3.0, 3], [4, 3], [4, 2],
+ [9, 2], [5, 1], [6, 2], [9, 4],
+ [5, 2], [5, 4], [7, 4], [6, 5]])
+
+ initc = N.concatenate(([[X[0]], [X[1]], [X[2]]]))
+
+ codet1 = N.array([[3.0000, 3.0000],
+ [6.2000, 4.0000],
+ [5.8000, 1.8000]])
+
+ codet2 = N.array([[11.0/3, 8.0/3],
+ [6.7500, 4.2500],
+ [6.2500, 1.7500]])
+
+ code = initc.copy()
+
+ code1 = kmean(X, code, 1)[0]
+ code2 = kmean(X, code, 2)[0]
+
+ import numpy.testing as testing
+ try:
+ testing.assert_array_almost_equal(code1, codet1)
+ print "kmean test 1 succeded"
+ except AssertionError:
+ print "kmean test 1 failed"
+
+ try:
+ testing.assert_array_almost_equal(code2, codet2)
+ print "kmean test 2 succeded"
+ except AssertionError:
+ print "kmean test 2 failed"
+
+# Test functions (unusable by other people for now....)
+def test_gmm_den():
+ """"""
+ import tables
+
+ filename = 'gmm_test.h5'
+
+ k = 3
+ d = 2
+ mode = 'full'
+ w, mu, va = gen_gmm_param(d, k, mode, 2)
+ X = gen_gmm(w, mu, va, 1e3)
+
+ h5file = tables.openFile(filename, "w")
+
+ h5file.createArray(h5file.root, 'mu', mu)
+ h5file.createArray(h5file.root, 'va', va)
+ h5file.createArray(h5file.root, 'X', X)
+
+ Y1 = densities.gauss_den(X, mu[0,:], va[0:2,:])
+ Y2 = densities.gauss_den(X, mu[1,:], va[2:4,:])
+ Y3 = densities.gauss_den(X, mu[2,:], va[4:6,:])
+
+ Y = multiple_gauss_den(X, mu, va)
+
+ h5file.createArray(h5file.root, 'Y', Y)
+ h5file.createArray(h5file.root, 'Y2', Y2)
+ h5file.createArray(h5file.root, 'Y1', Y1)
+ h5file.createArray(h5file.root, 'Y3', Y3)
+
+ h5file.close()
+
+def test_gmm_em():
+ """"""
+ import numpy as N
+ import tables
+ import pylab as P
+ filename = 'data.h5'
+
+ k = 3
+ d = 2
+ mode = 'full'
+ wr, mur, var = gen_gmm_param(d, k, mode, 2)
+ X = gen_gmm(wr, mur, var, 1e3)
+
+ h5file = tables.openFile(filename, "w")
+
+ h5file.createArray(h5file.root, 'wr', wr)
+ h5file.createArray(h5file.root, 'mur', mur)
+ h5file.createArray(h5file.root, 'var', var)
+ h5file.createArray(h5file.root, 'X', X)
+
+ P.plot(X[:, 0], X[:, 1], '.')
+ Xe, Ye = gmm_ellipses(mur, var, npoints = 100)
+ for i in range(k):
+ P.plot(Xe[i], Ye[i], 'g')
+
+ # init EM: simply use kmean
+ initc = X[0:k, :]
+ (code, label) = kmean(X, initc, 10)
+
+ h5file.createArray(h5file.root, 'initc', initc)
+
+ w0 = N.ones(k, float) / k
+ mu0 = code.copy()
+ if mode == 'diag':
+ va0 = N.zeros((k, d), float)
+ for i in range(k):
+ for j in range(d):
+ va0[i,j] = N.cov(X[N.where(label==i), j], rowvar = 0)
+ elif mode == 'full':
+ va0 = N.zeros((k*d, d), float)
+ for i in range(k):
+ va0[i*d:i*d+d,:] = N.cov(X[N.where(label==i)], rowvar = 0)
+
+ h5file.createArray(h5file.root, 'w0', w0)
+ h5file.createArray(h5file.root, 'mu0', mu0)
+ h5file.createArray(h5file.root, 'va0', va0)
+
+
+ # # Use random values
+ # w0 = N.ones(k, float) / k
+ # mu0 = N.randn(k, d)
+ # va0 = N.fabs(N.randn(k, d))
+
+ w = w0.copy()
+ mu = mu0.copy()
+ va = va0.copy()
+
+ X0e, Y0e = gmm_ellipses(mu, va, npoints = 100)
+ for i in range(k):
+ P.plot(X0e[i], Y0e[i], 'k')
+
+ niter = 10
+ like = N.zeros(niter, float)
+
+ for i in range(niter):
+ g, tgd = gmm_posterior(X, w, mu, va)
+ #h5file.createArray(h5file.root, 'g', g)
+ #h5file.createArray(h5file.root, 'tgd', tgd)
+ like[i] = N.sum(N.log(N.sum(tgd, 1)))
+ print like[i]
+ w, mu, va = gmm_update(X, g, d, k, mode)
+
+ h5file.createArray(h5file.root, 'w', w)
+ h5file.createArray(h5file.root, 'mu', mu)
+ h5file.createArray(h5file.root, 'va', va)
+
+ X0e, Y0e = gmm_ellipses(mu, va, npoints = 100)
+ for i in range(k):
+ P.plot(X0e[i], Y0e[i], 'r')
+ P.figure()
+ P.plot(like)
+ h5file.close()
+ P.show()
+
+def _bench1():
+ #===========================================
+ # Diag GMM with 5 components of dimension 20
+ #===========================================
+ k = 5
+ d = 20
+ mode = 'diag'
+
+ print "Generating the mixture"
+ # Generate a model with k components, d dimensions
+ wr, mur, var = gen_gmm_param(d, k, mode, 3)
+ X = gen_gmm(wr, mur, var, 1e4)
+
+ print "Init the mixture"
+ # Init the mixture with kmean
+ w0, mu0, va0 = gmm_init_kmean(X, k, mode, niter = 5)
+
+ # # Use random values instead of kmean
+ # w0 = N.ones(k, float) / k
+ # mu0 = N.randn(k, d)
+ # va0 = N.fabs(N.randn(k, d))
+
+ print "EM computing..."
+ # Copy the initial values because we want to draw them later...
+ w = w0.copy()
+ mu = mu0.copy()
+ va = va0.copy()
+
+ # The actual EM, with likelihood computation
+ niter = 10
+ like = N.zeros(niter, float)
+
+ for i in range(niter):
+ print "post"
+ g, tgd = gmm_posterior(X, w, mu, va)
+ like[i] = N.sum(N.log(N.sum(tgd, 1)))
+ print "update"
+ w, mu, va = gmm_update(X, g, d, k, mode)
+
+
+def benchmark():
+ import profile
+ profile.run('_bench1()', 'bench1prof')
+
+if __name__ == "__main__":
+ #=============================
+ # Simple GMM with 3 components
+ #=============================
+ import pylab as P
+ k = 4
+ d = 2
+ mode = 'diag'
+
+ print "Generating the mixture"
+ # Generate a model with k components, d dimensions
+ wr, mur, var = gen_gmm_param(d, k, mode, 3)
+ X = gen_gmm(wr, mur, var, 1e3)
+
+ print "Init the mixture"
+ # Init the mixture with kmean
+ w0, mu0, va0 = gmm_init_kmean(X, k, mode, niter = 5)
+
+ # # Use random values instead of kmean
+ # w0 = N.ones(k, float) / k
+ # mu0 = N.randn(k, d)
+ # va0 = N.fabs(N.randn(k, d))
+
+ # Copy the initial values because we want to draw them later...
+ w = w0.copy()
+ mu = mu0.copy()
+ va = va0.copy()
+
+ # The actual EM, with likelihood computation
+ niter = 10
+ like = N.zeros(niter, float)
+
+ print "computing..."
+ for i in range(niter):
+ print "post"
+ g, tgd = gmm_posterior(X, w, mu, va)
+ like[i] = N.sum(N.log(N.sum(tgd, 1)))
+ print "update"
+ w, mu, va = gmm_update(X, g, d, k, mode)
+
+ print "drawing..."
+ # Draw what is happening
+ P.subplot(2, 1, 1)
+ P.plot(X[:, 0], X[:, 1], '.', label = '_nolegend_')
+
+ # Real confidence ellipses
+ Xre, Yre = gmm_ellipses(mur, var)
+ P.plot(Xre[0], Yre[0], 'g', label = 'true confidence ellipsoides')
+ for i in range(1,k):
+ P.plot(Xre[i], Yre[i], 'g', label = '_nolegend_')
+
+ # Initial confidence ellipses as found by kmean
+ X0e, Y0e = gmm_ellipses(mu0, va0)
+ P.plot(X0e[0], Y0e[0], 'k', label = 'initial confidence ellipsoides')
+ for i in range(1,k):
+ P.plot(X0e[i], Y0e[i], 'k', label = '_nolegend_')
+
+ # Values found by EM
+ Xe, Ye = gmm_ellipses(mu, va)
+ P.plot(Xe[0], Ye[0], 'r', label = 'confidence ellipsoides found by EM')
+ for i in range(1,k):
+ P.plot(Xe[i], Ye[i], 'r', label = '_nolegend_')
+ P.legend(loc = 0)
+ P.subplot(2, 1, 2)
+ P.plot(like)
+ P.title('log likelihood')
+
+ # # Export the figure
+ # F = P.gcf()
+ # DPI = F.get_dpi()
+ # DefaultSize = F.get_size_inches()
+ # # the default is 100dpi for savefig:
+ # F.savefig("example1.png")
+
+ # # Now make the image twice as big, while keeping the fonts and all the
+ # # same size
+ # F.set_figsize_inches( (DefaultSize[0]*2, DefaultSize[1]*2) )
+ # Size = F.get_size_inches()
+ # print "Size in Inches", Size
+ # F.savefig("example2.png")
+ P.show()
More information about the Scipy-svn
mailing list