[Scipy-svn] r2357 - in trunk/Lib/sandbox/pyem: . src tests

scipy-svn at scipy.org scipy-svn at scipy.org
Wed Dec 6 07:34:37 EST 2006


Author: cdavid
Date: 2006-12-06 06:27:51 -0600 (Wed, 06 Dec 2006)
New Revision: 2357

Added:
   trunk/Lib/sandbox/pyem/densities2.py
   trunk/Lib/sandbox/pyem/src/pure_den.c
Modified:
   trunk/Lib/sandbox/pyem/
   trunk/Lib/sandbox/pyem/online_em.py
   trunk/Lib/sandbox/pyem/setup.py
   trunk/Lib/sandbox/pyem/src/Makefile
   trunk/Lib/sandbox/pyem/tests/test_online_em.py
Log:
Add densities2.py for preliminary axis support and add specialized class for online EM in 1d for a 10-100x speed incread


Property changes on: trunk/Lib/sandbox/pyem
___________________________________________________________________
Name: svn:ignore
   - *.pyc
*.swp
*.pyd
*.so

   + *.pyc
*.swp
*.pyd
*.so
*.prof


Added: trunk/Lib/sandbox/pyem/densities2.py
===================================================================
--- trunk/Lib/sandbox/pyem/densities2.py	2006-12-06 09:22:34 UTC (rev 2356)
+++ trunk/Lib/sandbox/pyem/densities2.py	2006-12-06 12:27:51 UTC (rev 2357)
@@ -0,0 +1,267 @@
+#! /usr/bin/python
+#
+# Copyrighted David Cournapeau
+# Last Change: Wed Dec 06 09:00 PM 2006 J
+
+# New version, with default numpy ordering.
+
+import numpy as N
+import numpy.linalg as lin
+from numpy.random import randn
+from scipy.stats import chi2
+
+# Error classes
+class DenError(Exception):
+    """Base class for exceptions in this module.
+    
+    Attributes:
+        expression -- input expression in which the error occurred
+        message -- explanation of the error"""
+    def __init__(self, message):
+        self.message    = message
+    
+    def __str__(self):
+        return self.message
+
+#============
+# Public API
+#============
+# The following function do all the fancy stuff to check that parameters
+# are Ok, and call the right implementation if args are OK.
+def gauss_den(x, mu, va, log = False, axis = -1):
+    """ Compute multivariate Gaussian density at points x for 
+    mean mu and variance va along specified axis:
+        
+    requirements:
+        * mean must be rank 0 (1d) or rank 1 (multi variate gaussian)
+        * va must be rank 0 (1d), rank 1(multi variate, diag covariance) or rank 2 
+        (multivariate, full covariance).
+        * in 1 dimension case, any rank for mean and va is ok, as long as their size
+        is 1 (eg they contain only 1 element)
+    
+    Caution: if x is rank 1, it is assumed you have a 1d problem. You cannot compute
+    the gaussian densities of only one sample of dimension d; for this, you have
+    to use a rank 2 !
+
+    If log is True, than the log density is returned 
+    (useful for underflow ?)"""
+
+    # If data is rank 1, then we have 1 dimension problem.
+    if x.ndim == 1:
+        d   = 1
+        n   = x.size
+        if not N.size(mu) == 1:
+            raise DenError("for 1 dimension problem, mean must have only one element")
+            
+        if not N.size(va) == 1:
+            raise DenError("for 1 dimension problem, mean must have only one element")
+        
+        return _scalar_gauss_den(x, mu, va, log)
+    # If data is rank 2, then we may have 1 dimension or multi-variate problem
+    elif x.ndim == 2:
+        oaxis   = (axis + 1) % 2
+        n       = x.shape[axis]
+        d       = x.shape[oaxis]
+
+        # Get away with 1d case now
+        if d == 1:
+            return _scalar_gauss_den(x, mu, va, log)
+
+        # Now, d > 1 (numpy attributes should be valid on mean and va now)
+        if not N.size(mu) == d or not mu.ndim == 1:
+            raise DenError("data is %d dimension, but mean's shape is %s" \
+                    % (d, N.shape(mu)) + " (should be (%d,))" % d)
+            
+        isfull  = (va.ndim == 2)
+        if not (N.size(va) == d or (isfull and va.shape[0] == va.shape[1] == d)):
+            raise DenError("va has an invalid shape or number of elements")
+
+        if isfull:
+            # Compute along rows
+            if oaxis == 0:
+                return  _full_gauss_den(x, mu[:, N.newaxis], va, log, axis)
+            else:
+                return  _full_gauss_den(x, mu, va, log, axis)
+        else:
+            return _diag_gauss_den(x, mu, va, log, axis)
+    else:
+        raise RuntimeError("Sorry, only rank up to 2 supported")
+        
+# To plot a confidence ellipse from multi-variate gaussian pdf
+def gauss_ell(mu, va, dim = [0, 1], npoints = 100, level = 0.39):
+    """ Given a mean and covariance for multi-variate
+    gaussian, returns npoints points for the ellipse
+    of confidence given by level (all points will be inside
+    the ellipsoides with a probability equal to level)
+    
+    Returns the coordinate x and y of the ellipse"""
+    
+    c       = N.array(dim)
+
+    if mu.size < 2:
+        raise RuntimeError("this function only make sense for dimension 2 and more")
+
+    if mu.size == va.size:
+        mode    = 'diag'
+    else:
+        if va.ndim == 2:
+            if va.shape[0] == va.shape[1]:
+                mode    = 'full'
+            else:
+                raise DenError("variance not square")
+        else:
+            raise DenError("mean and variance are not dim conformant")
+
+    # If X ~ N(mu, va), then [X` * va^(-1/2) * X] ~ Chi2
+    chi22d  = chi2(2)
+    mahal   = N.sqrt(chi22d.ppf(level))
+    
+    # Generates a circle of npoints
+    theta   = N.linspace(0, 2 * N.pi, npoints)
+    circle  = mahal * N.array([N.cos(theta), N.sin(theta)])
+
+    # Get the dimension which we are interested in:
+    mu  = mu[dim]
+    if mode == 'diag':
+        va      = va[dim]
+        elps    = N.outer(mu, N.ones(npoints))
+        elps    += N.dot(N.diag(N.sqrt(va)), circle)
+    elif mode == 'full':
+        va  = va[c,:][:,c]
+        # Method: compute the cholesky decomp of each cov matrix, that is
+        # compute cova such as va = cova * cova' 
+        # WARN: scipy is different than matlab here, as scipy computes a lower
+        # triangular cholesky decomp: 
+        #   - va = cova * cova' (scipy)
+        #   - va = cova' * cova (matlab)
+        # So take care when comparing results with matlab !
+        cova    = lin.cholesky(va)
+        elps    = N.outer(mu, N.ones(npoints))
+        elps    += N.dot(cova, circle)
+    else:
+        raise DenParam("var mode not recognized")
+
+    return elps[0, :], elps[1, :]
+
+#=============
+# Private Api
+#=============
+# Those 3 functions do almost all the actual computation
+def _scalar_gauss_den(x, mu, va, log):
+    """ This function is the actual implementation
+    of gaussian pdf in scalar case. It assumes all args
+    are conformant, so it should not be used directly
+    
+    Call gauss_den instead"""
+    inva    = 1/va
+    fac     = (2*N.pi) ** (-1/2.0) * N.sqrt(inva)
+    y       = ((x-mu) ** 2) * -0.5 * inva
+    if not log:
+        y   = fac * N.exp(y.ravel())
+    else:
+        y   = y + log(fac)
+
+    return y
+    
+def _diag_gauss_den(x, mu, va, log, axis):
+    """ This function is the actual implementation
+    of gaussian pdf in scalar case. It assumes all args
+    are conformant, so it should not be used directly
+    
+    Call gauss_den instead"""
+    # Diagonal matrix case
+    d   = mu.size
+    if axis % 2 == 0:
+        x  = N.swapaxes(x, 0, 1)
+
+    if not log:
+        inva    = 1/va[0]
+        fac     = (2*N.pi) ** (-d/2.0) * N.sqrt(inva)
+        y       =  (x[0] - mu[0]) ** 2 * inva * -0.5
+        for i in range(1, d):
+            inva    = 1/va[i]
+            fac     *= N.sqrt(inva)
+            y       += (x[i] - mu[i]) ** 2 * inva * -0.5
+        y   = fac * N.exp(y)
+    else:
+        y   = _scalar_gauss_den(x[0], mu[0], va[0], log)
+        for i in range(1, d):
+            y    +=  _scalar_gauss_den(x[i], mu[i], va[i], log)
+
+    return y
+
+def _full_gauss_den(x, mu, va, log, axis):
+    """ This function is the actual implementation
+    of gaussian pdf in full matrix case. 
+    
+    It assumes all args are conformant, so it should 
+    not be used directly Call gauss_den instead
+    
+    Does not check if va is definite positive (on inversible 
+    for that matter), so the inverse computation and/or determinant
+    would throw an exception."""
+    d       = mu.size
+    inva    = lin.inv(va)
+    fac     = 1 / N.sqrt( (2*N.pi) ** d * N.fabs(lin.det(va)))
+
+    # # Slow version (does not work since version 0.6)
+    # n       = N.size(x, 0)
+    # y       = N.zeros(n)
+    # for i in range(n):
+    #     y[i] = N.dot(x[i,:],
+    #              N.dot(inva, N.transpose(x[i,:])))
+    # y *= -0.5
+
+    # we are using a trick with sum to "emulate" 
+    # the matrix multiplication inva * x without any explicit loop
+    if axis % 2 == 1:
+        y   = N.dot(inva, (x-mu))
+        y   = -0.5 * N.sum(y * (x-mu), 0)
+    else:
+        y   = N.dot((x-mu), inva)
+        y   = -0.5 * N.sum(y * (x-mu), 1)
+
+    if not log:
+        y   = fac * N.exp(y)
+    else:
+        y   = y + N.log(fac)
+ 
+    return y
+
+if __name__ == "__main__":
+    import pylab
+
+    #=========================================
+    # Test plotting a simple diag 2d variance:
+    #=========================================
+    va  = N.array([5, 3])
+    mu  = N.array([2, 3])
+
+    # Generate a multivariate gaussian of mean mu and covariance va
+    X       = randn(2, 1e3)
+    Yc      = N.dot(N.diag(N.sqrt(va)), X)
+    Yc      = Yc.transpose() + mu
+
+    # Plotting
+    Xe, Ye  = gauss_ell(mu, va, npoints = 100)
+    pylab.figure()
+    pylab.plot(Yc[:, 0], Yc[:, 1], '.')
+    pylab.plot(Xe, Ye, 'r')
+
+    #=========================================
+    # Test plotting a simple full 2d variance:
+    #=========================================
+    va  = N.array([[0.2, 0.1],[0.1, 0.5]])
+    mu  = N.array([0, 3])
+
+    # Generate a multivariate gaussian of mean mu and covariance va
+    X       = randn(1e3, 2)
+    Yc      = N.dot(lin.cholesky(va), X.transpose())
+    Yc      = Yc.transpose() + mu
+
+    # Plotting
+    Xe, Ye  = gauss_ell(mu, va, npoints = 100, level=0.95)
+    pylab.figure()
+    pylab.plot(Yc[:, 0], Yc[:, 1], '.')
+    pylab.plot(Xe, Ye, 'r')
+    pylab.show()

Modified: trunk/Lib/sandbox/pyem/online_em.py
===================================================================
--- trunk/Lib/sandbox/pyem/online_em.py	2006-12-06 09:22:34 UTC (rev 2356)
+++ trunk/Lib/sandbox/pyem/online_em.py	2006-12-06 12:27:51 UTC (rev 2357)
@@ -1,5 +1,5 @@
 # /usr/bin/python
-# Last Change: Mon Oct 23 07:00 PM 2006 J
+# Last Change: Wed Dec 06 09:00 PM 2006 J
 
 #---------------------------------------------
 # This is not meant to be used yet !!!! I am 
@@ -21,9 +21,10 @@
 from numpy import mean
 from numpy.testing import assert_array_almost_equal, assert_array_equal
 
-from gmm_em import ExpMixtureModel, GMM, EM, multiple_gauss_den
+from gmm_em import ExpMixtureModel, GMM, EM
 from gauss_mix import GM
 from kmean import kmean
+import densities2 as D
 
 import copy
 from numpy.random import seed
@@ -50,7 +51,7 @@
         return self.message
 
 class OnGMM(ExpMixtureModel):
-    """A Class for 'online' (ie recursive) EM. Insteand
+    """A Class for 'online' (ie recursive) EM. Instead
     of running the E step on the whole data, the sufficient statistics
     are updated for each new frame of data, and used in the (unchanged)
     M step"""
@@ -135,14 +136,14 @@
 
         self.init   = init_methods[init]
 
-    def compute_sufficient_statistics(self, frame, nu):
-        """ sufficient_statistics(frame, nu)
+    def compute_sufficient_statistics_frame(self, frame, nu):
+        """ sufficient_statistics(frame, nu) for one frame of data
         
-        frame has to be rank 2 !"""
-        gamma   = multiple_gauss_den(frame, self.pmu, self.pva)[0]
+        frame has to be rank 1 !"""
+        gamma   = multiple_gauss_den_frame(frame, self.pmu, self.pva)
         gamma   *= self.pw
         gamma   /= N.sum(gamma)
-        # <1>(t) = cw(t), each column is one component cw = (cw1, ..., cwK);
+        # <1>(t) = cw(t), self.cw = cw(t), each element is one component running weight
         #self.cw	= (1 - nu) * self.cw + nu * gamma
         self.cw	*= (1 - nu)
         self.cw += nu * gamma
@@ -151,16 +152,136 @@
             self.cx[k]   = (1 - nu) * self.cx[k] + nu * frame * gamma[k]
             self.cxx[k]  = (1 - nu) * self.cxx[k] + nu * frame ** 2 * gamma[k]
 
-    def update_em(self):
+    def update_em_frame(self):
         for k in range(self.gm.k):
             self.cmu[k]  = self.cx[k] / self.cw[k]
             self.cva[k]  = self.cxx[k] / self.cw[k] - self.cmu[k] ** 2
     
+import _rawden
+
+class OnGMM1d(ExpMixtureModel):
+    """Special purpose case optimized for 1d dimensional cases.
+    
+    Require each frame to be a float, which means the API is a bit
+    different than OnGMM. You are trading elegance for speed here !"""
+    def init_kmean(self, init_data, niter = 5):
+        """ Init the model using kmean."""
+        assert init_data.ndim == 1
+        k   = self.gm.k
+        w   = N.ones(k) / k
+
+        # Init the internal state of EM
+        self.cx     = w * mean(init_data)
+        self.cxx    = w * mean(init_data ** 2)
+
+        # w, mu and va init is the same that in the standard case
+        (code, label)   = kmean(init_data[:, N.newaxis], \
+                init_data[0:k, N.newaxis], niter)
+        mu          = code.copy()
+        va          = N.zeros((k, 1))
+        for i in range(k):
+            va[i] = N.cov(init_data[N.where(label==i)], rowvar = 0)
+
+        self.gm.set_param(w, mu, va)
+        # c* are the parameters which are computed at every step (ie
+        # when a new frame is taken into account
+        self.cw     = self.gm.w
+        self.cmu    = self.gm.mu[:, 0]
+        self.cva    = self.gm.va[:, 0]
+
+        # p* are the parameters used when computing gaussian densities
+        # they are the same than c* in the online case
+        # self.pw     = self.cw.copy()
+        # self.pmu    = self.cmu.copy()
+        # self.pva    = self.cva.copy()
+        self.pw     = self.cw
+        self.pmu    = self.cmu
+        self.pva    = self.cva
+
+    def __init__(self, gm, init_data, init = 'kmean'):
+        self.gm = gm
+        if self.gm.d is not 1:
+            raise RuntimeError("expects 1d gm only !")
+
+        # Possible init methods
+        init_methods    = {'kmean' : self.init_kmean}
+        self.init       = init_methods[init]
+
+    def compute_sufficient_statistics_frame(self, frame, nu):
+        """expects frame and nu to be float. Returns
+        cw, cxx and cxx, eg the sufficient statistics."""
+        _rawden.compute_ss_frame_1d(frame, self.cw, self.cmu, self.cva, 
+                self.cx, self.cxx, nu)
+        return self.cw, self.cx, self.cxx
+
+    def update_em_frame(self, cw, cx, cxx):
+        """Update EM state using SS as returned by 
+        compute_sufficient_statistics_frame. """
+        self.cmu    = cx / cw
+        self.cva    = cxx / cw - self.cmu ** 2
+
+    def compute_em_frame(self, frame, nu):
+        """Run a whole em step for one frame. frame and nu should be float;
+        if you don't need to split E and M steps, this is faster than calling 
+        compute_sufficient_statistics_frame and update_em_frame"""
+        _rawden.compute_em_frame_1d(frame, self.cw, self.cmu, self.cva, \
+                self.cx, self.cxx, nu)
+#class OnlineEM:
+#    def __init__(self, ogm):
+#        """Init Online Em algorithm with ogm, an instance of OnGMM."""
+#        if not isinstance(ogm, OnGMM):
+#            raise TypeError("expect a OnGMM instance for the model")
+#
+#    def init_em(self):
+#        pass
+#
+#    def train(self, data, nu):
+#        pass
+#
+#    def train_frame(self, frame, nu):
+#        pass
+
+def multiple_gauss_den_frame(data, mu, va):
+    """Helper function to generate several Gaussian
+    pdf (different parameters) from one frame of data.
+    
+    Semantics depending on data's rank
+        - rank 0: mu and va expected to have rank 0 or 1
+        - rank 1: mu and va expected to have rank 2."""
+    if N.ndim(data) == 0:
+        # scalar case
+        k   = mu.size
+        inva    = 1/va
+        fac     = (2*N.pi) ** (-1/2.0) * N.sqrt(inva)
+        y       = ((data-mu) ** 2) * -0.5 * inva
+        return   fac * N.exp(y.ravel())
+    elif N.ndim(data) == 1:
+        # multi variate case (general case)
+        k   = mu.shape[0]
+        y   = N.zeros(k, data.dtype)
+        if mu.size == va.size:
+            # diag case
+            for i in range(k):
+                #y[i] = D.gauss_den(data, mu[i], va[i])
+                # This is a bit hackish: _diag_gauss_den implementation's
+                # changes can break this, but I don't see how to easily fix this
+                y[i] = D._diag_gauss_den(data, mu[i], va[i], False, -1)
+            return y
+        else:
+            raise RuntimeError("full not implemented yet")
+            #for i in range(K):
+            #    y[i] = D.gauss_den(data, mu[i, :], 
+            #                va[d*i:d*i+d, :])
+            #return y.T
+    else:
+        raise RuntimeError("frame should be rank 0 or 1 only")
+        
+
 if __name__ == '__main__':
     d       = 1
     k       = 2
     mode    = 'diag'
-    nframes = int(1e3)
+    nframes = int(5e3)
     emiter  = 4
     seed(5)
 
@@ -206,13 +327,29 @@
     for i in range(1, len(lamb)):
         nu[i]	= 1./(1 + lamb[i] / nu[i-1])
 
+    print "meth1"
     # object version of online EM
     for t in range(nframes):
-        ogmm.compute_sufficient_statistics(data[t:t+1, :], nu[t])
-        ogmm.update_em()
+        ogmm.compute_sufficient_statistics_frame(data[t], nu[t])
+        ogmm.update_em_frame()
 
     ogmm.gm.set_param(ogmm.cw, ogmm.cmu, ogmm.cva)
 
+    # 1d optimized version
+    ogm2        = GM(d, k, mode)
+    ogmm2       = OnGMM1d(ogm2, 'kmean')
+    ogmm2.init(init_data[:, 0])
+
+    print "meth2"
+    # object version of online EM
+    for t in range(nframes):
+        ogmm2.compute_sufficient_statistics_frame(data[t, 0], nu[t])
+        ogmm2.update_em_frame()
+
+    #ogmm2.gm.set_param(ogmm2.cw, ogmm2.cmu, ogmm2.cva)
+
+    print ogmm.cw
+    print ogmm2.cw
     #+++++++++++++++
     # Draw the model
     #+++++++++++++++

Modified: trunk/Lib/sandbox/pyem/setup.py
===================================================================
--- trunk/Lib/sandbox/pyem/setup.py	2006-12-06 09:22:34 UTC (rev 2356)
+++ trunk/Lib/sandbox/pyem/setup.py	2006-12-06 12:27:51 UTC (rev 2357)
@@ -1,12 +1,19 @@
 #! /usr/bin/env python
-# Last Change: Thu Nov 09 06:00 PM 2006 J
+# Last Change: Wed Dec 06 08:00 PM 2006 J
 # TODO:
 #   - check how to handle cmd line build options with distutils and use
 #   it in the building process
 
 """ pyem is a small python package to estimate Gaussian Mixtures Models
-from data, using Expectation Maximization"""
+from data, using Expectation Maximization.
 
+Maximum likelihood EM for mixture of Gaussian is implemented, with BIC computation
+for number of cluster assessment.
+
+There is also an experimental online EM version (the EM is updated for each new
+sample), and I plan to add Variational Bayes and/or MCMC support for Bayesian approach
+for estimating meta parameters of mixtures. """
+
 from os.path import join
 # This import from __init__ looks strange, should check whether there is no other way
 from info import version as pyem_version
@@ -28,6 +35,10 @@
                          #define_macros=[('LIBSVM_EXPORTS', None),
                          #               ('LIBSVM_DLL', None)],
                          sources=[join('src', 'c_gden.c')])
+    config.add_extension('_rawden',
+                         #define_macros=[('LIBSVM_EXPORTS', None),
+                         #               ('LIBSVM_DLL', None)],
+                         sources=[join('src', 'pure_den.c')])
 
     return config
 

Modified: trunk/Lib/sandbox/pyem/src/Makefile
===================================================================
--- trunk/Lib/sandbox/pyem/src/Makefile	2006-12-06 09:22:34 UTC (rev 2356)
+++ trunk/Lib/sandbox/pyem/src/Makefile	2006-12-06 12:27:51 UTC (rev 2357)
@@ -4,26 +4,36 @@
 PYREX	= python2.4-pyrexc
 
 PYTHONINC	= -I/usr/include/python2.4
-NUMPYINC	= -I/usr/lib/python2.4/site-packages/numpy/core/include
+NUMPYINC	= -I/home/david/local/lib/python2.4/site-packages/numpy/core/include
 OPTIMS		= -O3 -funroll-all-loops -march=pentium4 -msse2
-WARN		= -W -Wall
+WARN		= -Wall -W -Winline -Wstrict-prototypes -Wmissing-prototypes \
+		-Waggregate-return -Wcast-align -Wcast-qual -Wnested-externs \
+		-Wshadow -Wbad-function-cast -Wwrite-strings
 
 CFLAGS	= $(PYTHONINC) $(NUMPYINC) $(OPTIMS) $(WARN)
 
+targets: c_gden.so _rawden.so
+
 c_gden.so: c_gden.o
 	$(LD) -shared -o $@ $< -Wl,-soname,$@
 
+_rawden.so: pure_den.o
+	$(LD) -shared -o $@ $< -Wl,-soname,$@
+
 c_gden.o: c_gden.c
 	$(CC) -c $(CFLAGS) -fPIC $<
 
-c_gmm.so: c_gmm.o
-	$(LD) -shared -o $@ $< -Wl,-soname,$@
-
-c_gmm.o: c_gmm.c
+pure_den.o: pure_den.c
 	$(CC) -c $(CFLAGS) -fPIC $<
 
-c_gmm.c: c_gmm.pyx c_numpy.pxd c_python.pxd
-	$(PYREX) $<
+#c_gmm.so: c_gmm.o
+#	$(LD) -shared -o $@ $< -Wl,-soname,$@
+#
+#c_gmm.o: c_gmm.c
+#	$(CC) -c $(CFLAGS) -fPIC $<
+#
+#c_gmm.c: c_gmm.pyx c_numpy.pxd c_python.pxd
+#	$(PYREX) $<
 
 clean:
 	rm -f c_gmm.c

Added: trunk/Lib/sandbox/pyem/src/pure_den.c
===================================================================
--- trunk/Lib/sandbox/pyem/src/pure_den.c	2006-12-06 09:22:34 UTC (rev 2356)
+++ trunk/Lib/sandbox/pyem/src/pure_den.c	2006-12-06 12:27:51 UTC (rev 2357)
@@ -0,0 +1,458 @@
+/*
+ * Last Change: Wed Dec 06 08:00 PM 2006 J
+ */
+#include <Python.h>
+#include <numpy/arrayobject.h>
+
+#include <math.h>
+
+PyObject* compute_ss_frame_1d_py(PyObject* dum, PyObject *arg);
+PyObject* compute_em_frame_1d_py(PyObject* dum, PyObject *arg);
+
+/*
+ * Pure C methods
+ */
+static int compute_ss_frame_1d(double sample, int nc, double* w, double* mu, double *va,
+        double *cx, double *cxx, double nu);
+static int update_em_frame_1d(const double* w, const double* cx, const double *cxx,
+        int nc, double *cmu, double *cva);
+
+static PyMethodDef mymethods[] = {
+    {"compute_ss_frame_1d", compute_ss_frame_1d_py, METH_VARARGS, ""},
+    {"compute_em_frame_1d", compute_em_frame_1d_py, METH_VARARGS, ""},
+    {NULL, NULL, 0, NULL} /* Sentinel */
+};
+
+/*
+ * function table
+ */
+PyMODINIT_FUNC init_rawden(void)
+{
+    (void)Py_InitModule("_rawden", mymethods);
+    import_array();
+}
+
+PyObject* compute_ss_frame_1d_py(PyObject* self, PyObject *args)
+{
+    PyObject    *w, *mu, *va, *cx, *cxx;
+    PyObject    *w_a, *mu_a, *va_a, *cx_a, *cxx_a;
+    double      nu, sample;
+    npy_intp    ndim, *dims, len;
+    double      *mu_ca, *va_ca, *w_ca, *cx_ca, *cxx_ca;
+
+    const npy_intp  mrank   = 1;
+
+    int         st;
+
+    /*
+     * Init python object holder to NULL so that we can use Py_XDECREF on all 
+     * the objets when something is woring
+     */
+    w   = NULL;
+    mu  = NULL;
+    va  = NULL;
+    cx  = NULL;
+    cxx = NULL;
+
+    w_a     = NULL;
+    mu_a    = NULL;
+    va_a    = NULL;
+    cx_a    = NULL;
+    cxx_a   = NULL;
+    /*
+     * Parsing of args: w, cx and cxx are inout
+     */
+    if (!PyArg_ParseTuple(args, "dOOOOOd",  &sample, &w, &mu, &va, &cx, &cxx, &nu)){
+        return NULL;
+    }
+    if ( (nu > 1) | (nu <= 0) ) {
+        PyErr_SetString(PyExc_TypeError, "nu should be between 0 and 1");
+        return NULL;
+    }
+
+    /* inout entries */
+    w_a     = PyArray_FROM_OTF(w, NPY_DOUBLE, NPY_INOUT_ARRAY);
+    if (w_a == NULL) {
+        PyErr_SetString(PyExc_TypeError, "w array not convertible");
+        return NULL;
+    }
+
+    cx_a    = PyArray_FROM_OTF(cx, NPY_DOUBLE, NPY_INOUT_ARRAY);
+    if (cx_a == NULL) {
+        PyErr_SetString(PyExc_TypeError, "cx array not convertible");
+        goto fail;
+    }
+
+    cxx_a   = PyArray_FROM_OTF(cxx, NPY_DOUBLE, NPY_INOUT_ARRAY);
+    if (cxx_a == NULL) {
+        PyErr_SetString(PyExc_TypeError, "cxx array not convertible");
+        goto fail;
+    }
+
+    /* in only entries */
+    mu_a    = PyArray_FROM_OTF(mu, NPY_DOUBLE, NPY_IN_ARRAY);
+    if (mu_a == NULL) {
+        PyErr_SetString(PyExc_TypeError, "mu array not convertible");
+        goto fail;
+    }
+
+    va_a    = PyArray_FROM_OTF(va, NPY_DOUBLE, NPY_IN_ARRAY);
+    if (va_a == NULL) {
+        PyErr_SetString(PyExc_TypeError, "va array not convertible");
+        goto fail;
+    }
+
+    /*
+     * Check that in and out have same size and same rank
+     */
+    ndim    = PyArray_NDIM(w_a);
+    if(ndim != mrank) {
+        PyErr_SetString(PyExc_TypeError, "w rank should be 1");
+        goto fail;
+    }
+    ndim    = PyArray_NDIM(cx_a);
+    if(ndim != mrank) {
+        PyErr_SetString(PyExc_TypeError, "cx rank should be 1");
+        goto fail;
+    }
+    ndim    = PyArray_NDIM(cxx_a);
+    if(ndim != mrank) {
+        PyErr_SetString(PyExc_TypeError, "cxx rank should be 1");
+        goto fail;
+    }
+    ndim    = PyArray_NDIM(mu_a);
+    if(ndim != mrank) {
+        PyErr_SetString(PyExc_TypeError, "mu rank should be 1");
+        goto fail;
+    }
+    ndim    = PyArray_NDIM(va_a);
+    if(ndim != mrank) {
+        PyErr_SetString(PyExc_TypeError, "va rank should be 1");
+        goto fail;
+    }
+
+    dims    = PyArray_DIMS(w_a);
+    len     = dims[0];
+    //fprintf(stderr, "%s:%s, len is %d\n", __FILE__, __func__, len);
+    dims    = PyArray_DIMS(cx_a);
+    if(dims[0] != len) {
+        PyErr_SetString(PyExc_TypeError, "cx shape should match !");
+        goto fail;
+    }
+    dims    = PyArray_DIMS(cxx_a);
+    if(dims[0] != len) {
+        PyErr_SetString(PyExc_TypeError, "cxx shape should match !");
+        goto fail;
+    }
+    dims    = PyArray_DIMS(mu_a);
+    if(dims[0] != len) {
+        PyErr_SetString(PyExc_TypeError, "mu_a shape should match !");
+        goto fail;
+    }
+    dims    = PyArray_DIMS(va_a);
+    if(dims[0] != len) {
+        PyErr_SetString(PyExc_TypeError, "va_a shape should match !");
+        goto fail;
+    }
+
+    /* 
+     * Get pointer to the data
+     */
+    w_ca    = PyArray_DATA(w_a);
+    if (w_ca == NULL) {
+        PyErr_SetString(PyExc_TypeError, "Unknown Error for w_ca");
+        goto fail;
+    }
+    cx_ca   = PyArray_DATA(cx_a);
+    if (cx_ca == NULL) {
+        PyErr_SetString(PyExc_TypeError, "Unknown Error for cx_ca");
+        goto fail;
+    }
+    cxx_ca  = PyArray_DATA(cxx_a);
+    if (w_ca == NULL) {
+        PyErr_SetString(PyExc_TypeError, "Unknown Error for cxx_ca");
+        goto fail;
+    }
+    mu_ca   = PyArray_DATA(mu_a);
+    if (mu_ca == NULL) {
+        PyErr_SetString(PyExc_TypeError, "Unknown Error for mu_ca");
+        goto fail;
+    }
+    va_ca   = PyArray_DATA(va_a);
+    if (va_ca == NULL) {
+        PyErr_SetString(PyExc_TypeError, "Unknown Error for va_ca");
+        goto fail;
+    }
+    /*
+     * Call actual implementation
+     */
+    st  = compute_ss_frame_1d(sample, len, w_ca, mu_ca, va_ca, cx_ca, cxx_ca, nu);
+    if (st) {
+        PyErr_SetString(PyExc_TypeError, "Error while calling multi_gauss....");
+        goto fail;
+    }
+
+    Py_DECREF(w_a);
+    Py_DECREF(cx_a);
+    Py_DECREF(cxx_a);
+    Py_DECREF(mu_a);
+    Py_DECREF(va_a);
+
+    Py_INCREF(Py_None);
+    return Py_None;
+
+fail:
+    Py_XDECREF(w_a);
+    Py_XDECREF(cx_a);
+    Py_XDECREF(cxx_a);
+    Py_XDECREF(mu_a);
+    Py_XDECREF(va_a);
+    return NULL;
+}
+
+PyObject* compute_em_frame_1d_py(PyObject* self, PyObject *args)
+{
+    PyObject    *w, *mu, *va, *cx, *cxx;
+    PyObject    *w_a, *mu_a, *va_a, *cx_a, *cxx_a;
+    double      nu, sample;
+    npy_intp    ndim, *dims, len;
+    double      *mu_ca, *va_ca, *w_ca, *cx_ca, *cxx_ca;
+
+    const npy_intp  mrank   = 1;
+
+    int         st;
+
+    /*
+     * Init python object holder to NULL so that we can use Py_XDECREF on all 
+     * the objets when something is woring
+     */
+    w   = NULL;
+    mu  = NULL;
+    va  = NULL;
+    cx  = NULL;
+    cxx = NULL;
+
+    w_a     = NULL;
+    mu_a    = NULL;
+    va_a    = NULL;
+    cx_a    = NULL;
+    cxx_a   = NULL;
+    /*
+     * Parsing of args: w, cx and cxx are inout
+     */
+    if (!PyArg_ParseTuple(args, "dOOOOOd",  &sample, &w, &mu, &va, &cx, &cxx, &nu)){
+        return NULL;
+    }
+    if ( (nu > 1) | (nu <= 0) ) {
+        PyErr_SetString(PyExc_TypeError, "nu should be between 0 and 1");
+        return NULL;
+    }
+
+    /* inout entries */
+    w_a     = PyArray_FROM_OTF(w, NPY_DOUBLE, NPY_INOUT_ARRAY);
+    if (w_a == NULL) {
+        PyErr_SetString(PyExc_TypeError, "w array not convertible");
+        return NULL;
+    }
+
+    cx_a    = PyArray_FROM_OTF(cx, NPY_DOUBLE, NPY_INOUT_ARRAY);
+    if (cx_a == NULL) {
+        PyErr_SetString(PyExc_TypeError, "cx array not convertible");
+        goto fail;
+    }
+
+    cxx_a   = PyArray_FROM_OTF(cxx, NPY_DOUBLE, NPY_INOUT_ARRAY);
+    if (cxx_a == NULL) {
+        PyErr_SetString(PyExc_TypeError, "cxx array not convertible");
+        goto fail;
+    }
+
+    /* in only entries */
+    mu_a    = PyArray_FROM_OTF(mu, NPY_DOUBLE, NPY_IN_ARRAY);
+    if (mu_a == NULL) {
+        PyErr_SetString(PyExc_TypeError, "mu array not convertible");
+        goto fail;
+    }
+
+    va_a    = PyArray_FROM_OTF(va, NPY_DOUBLE, NPY_IN_ARRAY);
+    if (va_a == NULL) {
+        PyErr_SetString(PyExc_TypeError, "va array not convertible");
+        goto fail;
+    }
+
+    /*
+     * Check that in and out have same size and same rank
+     */
+    ndim    = PyArray_NDIM(w_a);
+    if(ndim != mrank) {
+        PyErr_SetString(PyExc_TypeError, "w rank should be 1");
+        goto fail;
+    }
+    ndim    = PyArray_NDIM(cx_a);
+    if(ndim != mrank) {
+        PyErr_SetString(PyExc_TypeError, "cx rank should be 1");
+        goto fail;
+    }
+    ndim    = PyArray_NDIM(cxx_a);
+    if(ndim != mrank) {
+        PyErr_SetString(PyExc_TypeError, "cxx rank should be 1");
+        goto fail;
+    }
+    ndim    = PyArray_NDIM(mu_a);
+    if(ndim != mrank) {
+        PyErr_SetString(PyExc_TypeError, "mu rank should be 1");
+        goto fail;
+    }
+    ndim    = PyArray_NDIM(va_a);
+    if(ndim != mrank) {
+        PyErr_SetString(PyExc_TypeError, "va rank should be 1");
+        goto fail;
+    }
+
+    dims    = PyArray_DIMS(w_a);
+    len     = dims[0];
+    //fprintf(stderr, "%s:%s, len is %d\n", __FILE__, __func__, len);
+    dims    = PyArray_DIMS(cx_a);
+    if(dims[0] != len) {
+        PyErr_SetString(PyExc_TypeError, "cx shape should match !");
+        goto fail;
+    }
+    dims    = PyArray_DIMS(cxx_a);
+    if(dims[0] != len) {
+        PyErr_SetString(PyExc_TypeError, "cxx shape should match !");
+        goto fail;
+    }
+    dims    = PyArray_DIMS(mu_a);
+    if(dims[0] != len) {
+        PyErr_SetString(PyExc_TypeError, "mu_a shape should match !");
+        goto fail;
+    }
+    dims    = PyArray_DIMS(va_a);
+    if(dims[0] != len) {
+        PyErr_SetString(PyExc_TypeError, "va_a shape should match !");
+        goto fail;
+    }
+
+    /* 
+     * Get pointer to the data
+     */
+    w_ca    = PyArray_DATA(w_a);
+    if (w_ca == NULL) {
+        PyErr_SetString(PyExc_TypeError, "Unknown Error for w_ca");
+        goto fail;
+    }
+    cx_ca   = PyArray_DATA(cx_a);
+    if (cx_ca == NULL) {
+        PyErr_SetString(PyExc_TypeError, "Unknown Error for cx_ca");
+        goto fail;
+    }
+    cxx_ca  = PyArray_DATA(cxx_a);
+    if (w_ca == NULL) {
+        PyErr_SetString(PyExc_TypeError, "Unknown Error for cxx_ca");
+        goto fail;
+    }
+    mu_ca   = PyArray_DATA(mu_a);
+    if (mu_ca == NULL) {
+        PyErr_SetString(PyExc_TypeError, "Unknown Error for mu_ca");
+        goto fail;
+    }
+    va_ca   = PyArray_DATA(va_a);
+    if (va_ca == NULL) {
+        PyErr_SetString(PyExc_TypeError, "Unknown Error for va_ca");
+        goto fail;
+    }
+    /*
+     * Call actual implementation
+     */
+    st  = compute_ss_frame_1d(sample, len, w_ca, mu_ca, va_ca, cx_ca, cxx_ca, nu);
+    if (st) {
+        PyErr_SetString(PyExc_TypeError, "Error while calling multi_gauss....");
+        goto fail;
+    }
+    st  = update_em_frame_1d(w_ca, cx_ca, cxx_ca, len, mu_ca, va_ca);
+    if (st) {
+        PyErr_SetString(PyExc_TypeError, "Error while calling update_em_frame_1d....");
+        goto fail;
+    }
+
+    Py_DECREF(w_a);
+    Py_DECREF(cx_a);
+    Py_DECREF(cxx_a);
+    Py_DECREF(mu_a);
+    Py_DECREF(va_a);
+
+    Py_INCREF(Py_None);
+    return Py_None;
+
+fail:
+    Py_XDECREF(w_a);
+    Py_XDECREF(cx_a);
+    Py_XDECREF(cxx_a);
+    Py_XDECREF(mu_a);
+    Py_XDECREF(va_a);
+    return NULL;
+}
+
+int compute_ss_frame_1d(double sample, int nc, double* w, double* mu, double *va,
+        double *cx, double *cxx, double nu)
+{
+    /*
+     * TODO: check va division
+     */
+    int     i;
+    double  inva, fac, *gam, acc;
+
+    gam = malloc(sizeof(*gam) * nc);
+    if (gam == NULL) {
+        return -1;
+    }
+
+    /* Compute gamma */
+    acc = 0;
+    for (i = 0; i < nc; ++i) {
+        inva        = 1/va[i];
+        fac         = 1 / sqrt(2 * M_PI * va[i]);
+        gam[i]      = fac * exp( -0.5 * inva * (sample - mu[i]) * (sample - mu[i]));
+        gam[i]      *= w[i];
+        acc         += gam[i];
+    }
+    /* Normalize gamma */
+    for (i = 0; i < nc; ++i) {
+        gam[i]  /= acc;
+    }
+
+    /* Compute new SS from EM (cx and cxx) */
+    for (i = 0; i < nc; ++i) {
+        w[i]  *= (1 - nu);
+        w[i]  += nu * gam[i];
+        cx[i]   = (1 - nu) * cx[i] + nu * sample * gam[i]; 
+        cxx[i]  = (1 - nu) * cxx[i] + nu * sample * sample * gam[i]; 
+    }
+
+    free(gam);
+
+    return 0;
+}
+
+/*
+ * update mu and va from SS w, cx and cxx. Only mu and va are modified
+ * all arrays have same length nc
+ */
+int update_em_frame_1d(const double* cw, const double* cx, const double *cxx,
+        int nc, double *cmu, double *cva)
+{
+    /*
+     * TODO: check va division
+     */
+    int     i;
+    double  invw;
+
+    /* Compute new SS from EM (cx and cxx) */
+    for (i = 0; i < nc; ++i) {
+        invw    = 1/cw[i];
+        cmu[i]  = cx[i] * invw;
+        cva[i]  = cxx[i] * invw - cmu[i] * cmu[i]; 
+    }
+
+    return 0;
+}

Modified: trunk/Lib/sandbox/pyem/tests/test_online_em.py
===================================================================
--- trunk/Lib/sandbox/pyem/tests/test_online_em.py	2006-12-06 09:22:34 UTC (rev 2356)
+++ trunk/Lib/sandbox/pyem/tests/test_online_em.py	2006-12-06 12:27:51 UTC (rev 2357)
@@ -1,5 +1,5 @@
 #! /usr/bin/env python
-# Last Change: Thu Nov 16 09:00 PM 2006 J
+# Last Change: Wed Dec 06 09:00 PM 2006 J
 
 import copy
 
@@ -11,7 +11,7 @@
 
 set_package_path()
 from pyem import GM, GMM
-from pyem.online_em import OnGMM
+from pyem.online_em import OnGMM, OnGMM1d
 restore_path()
 
 # #Optional:
@@ -21,6 +21,7 @@
 
 # Error precision allowed (nb of decimals)
 AR_AS_PREC  = 12
+KM_ITER     = 5
 
 class OnlineEmTest(NumpyTestCase):
     def _create_model(self, d, k, mode, nframes, emiter):
@@ -38,7 +39,7 @@
         # Init the model
         lgm = GM(d, k, mode)
         gmm = GMM(lgm, 'kmean')
-        gmm.init(data)
+        gmm.init(data, niter = KM_ITER)
 
         self.gm0    = copy.copy(gmm.gm)
         # The actual EM, with likelihood computation
@@ -91,7 +92,7 @@
         ogm         = GM(d, k, mode)
         ogmm        = OnGMM(ogm, 'kmean')
         init_data   = self.data
-        ogmm.init(init_data)
+        ogmm.init(init_data, niter = KM_ITER)
 
         # Check that online kmean init is the same than kmean offline init
         ogm0    = copy.copy(ogm)
@@ -116,8 +117,8 @@
         ogmm.pva   = ogmm.cva.copy()
         for e in range(emiter):
             for t in range(nframes):
-                ogmm.compute_sufficient_statistics(self.data[t:t+1, :], nu[t])
-                ogmm.update_em()
+                ogmm.compute_sufficient_statistics_frame(self.data[t], nu[t])
+                ogmm.update_em_frame()
 
             # Change pw args only a each epoch 
             ogmm.pw  = ogmm.cw.copy()
@@ -147,12 +148,56 @@
         d       = 1
         k       = 2
         mode    = 'diag'
-        nframes = int(1e3)
+        nframes = int(5e2)
         emiter  = 4
 
         self._create_model(d, k, mode, nframes, emiter)
         self._run_pure_online(d, k, mode, nframes)
     
+    def check_1d_imp(self):
+        d       = 1
+        k       = 2
+        mode    = 'diag'
+        nframes = int(5e2)
+        emiter  = 4
+
+        self._create_model(d, k, mode, nframes, emiter)
+        gmref   = self._run_pure_online(d, k, mode, nframes)
+        gmtest  = self._run_pure_online_1d(d, k, mode, nframes)
+    
+        assert_array_almost_equal(gmref.w, gmtest.w, AR_AS_PREC)
+        assert_array_almost_equal(gmref.mu, gmtest.mu, AR_AS_PREC)
+        assert_array_almost_equal(gmref.va, gmtest.va, AR_AS_PREC)
+
+    def _run_pure_online_1d(self, d, k, mode, nframes):
+        #++++++++++++++++++++++++++++++++++++++++
+        # Approximate the models with online EM
+        #++++++++++++++++++++++++++++++++++++++++
+        ogm     = GM(d, k, mode)
+        ogmm    = OnGMM1d(ogm, 'kmean')
+        init_data   = self.data[0:nframes / 20, :]
+        ogmm.init(init_data[:, 0])
+
+        # Forgetting param
+        ku		= 0.005
+        t0		= 200
+        lamb	= 1 - 1/(N.arange(-1, nframes-1) * ku + t0)
+        nu0		= 0.2
+        nu		= N.zeros((len(lamb), 1))
+        nu[0]	= nu0
+        for i in range(1, len(lamb)):
+            nu[i]	= 1./(1 + lamb[i] / nu[i-1])
+
+        # object version of online EM
+        for t in range(nframes):
+            # the assert are here to check we do not create copies
+            # unvoluntary for parameters
+            a, b, c = ogmm.compute_sufficient_statistics_frame(self.data[t, 0], nu[t])
+            ogmm.update_em_frame(a, b, c)
+
+        ogmm.gm.set_param(ogmm.cw, ogmm.cmu[:, N.newaxis], ogmm.cva[:, N.newaxis])
+
+        return ogmm.gm
     def _run_pure_online(self, d, k, mode, nframes):
         #++++++++++++++++++++++++++++++++++++++++
         # Approximate the models with online EM
@@ -179,10 +224,11 @@
             assert ogmm.pw is ogmm.cw
             assert ogmm.pmu is ogmm.cmu
             assert ogmm.pva is ogmm.cva
-            ogmm.compute_sufficient_statistics(self.data[t:t+1, :], nu[t])
-            ogmm.update_em()
+            ogmm.compute_sufficient_statistics_frame(self.data[t], nu[t])
+            ogmm.update_em_frame()
 
         ogmm.gm.set_param(ogmm.cw, ogmm.cmu, ogmm.cva)
 
+        return ogmm.gm
 if __name__ == "__main__":
     NumpyTest().run()




More information about the Scipy-svn mailing list