[Scipy-svn] r3098 - in trunk/Lib/sandbox/pyem: . tests

scipy-svn at scipy.org
Tue Jun 12 00:04:27 EDT 2007


Author: cdavid
Date: 2007-06-11 23:04:14 -0500 (Mon, 11 Jun 2007)
New Revision: 3098

Modified:
   trunk/Lib/sandbox/pyem/densities.py
   trunk/Lib/sandbox/pyem/gmm_em.py
   trunk/Lib/sandbox/pyem/tests/test_densities.py
   trunk/Lib/sandbox/pyem/tests/test_gmm_em.py
Log:
Add logsumexp function + tests. Not used in the code yet, though

Modified: trunk/Lib/sandbox/pyem/densities.py
===================================================================
--- trunk/Lib/sandbox/pyem/densities.py	2007-06-11 10:34:20 UTC (rev 3097)
+++ trunk/Lib/sandbox/pyem/densities.py	2007-06-12 04:04:14 UTC (rev 3098)
@@ -1,7 +1,7 @@
 #! /usr/bin/python
 #
 # Copyrighted David Cournapeau
-# Last Change: Mon Jun 11 07:00 PM 2007 J
+# Last Change: Tue Jun 12 12:00 PM 2007 J
 """This module implements various basic functions related to multivariate
 gaussian, such as pdf estimation, confidence interval/ellipsoids, etc..."""
 
@@ -268,7 +268,13 @@
 
     return elps[0, :], elps[1, :]
 
-def multiple_gauss_den(data, mu, va):
+def logsumexp(x):
+    """Compute log(sum(exp(a), 1)) while avoiding underflow."""
+    axis = 1
+    mc = N.max(x, axis)
+    return mc + N.log(N.sum(N.exp(x-mc[:, N.newaxis]), axis))
+
+def multiple_gauss_den(data, mu, va, log = False):
     """Helper function to generate several Gaussian
     pdf (different parameters) at the same points
 
@@ -283,6 +289,8 @@
             variance of the pdf. One row per different component for diagonal
             covariance (k, d), or d rows per component for full matrix pdf
             (k*d,d).
+        log : boolean
+            if True, returns the log-pdf instead of the pdf.
 
     :Returns:
         Returns a (n, k) array, each column i being the pdf of the ith mean and
@@ -297,11 +305,11 @@
     y = N.zeros((K, n))
     if N.size(mu) == N.size(va):
         for i in range(K):
-            y[i] = gauss_den(data, mu[i, :], va[i, :])
+            y[i] = gauss_den(data, mu[i, :], va[i, :], log)
         return y.T
     else:
         for i in range(K):
-            y[i] = gauss_den(data, mu[i, :], va[d*i:d*i+d, :])
+            y[i] = gauss_den(data, mu[i, :], va[d*i:d*i+d, :], log)
         return y.T
 
 if __name__ == "__main__":
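
For reference, the trick behind the new logsumexp is to shift each row by
its maximum before exponentiating: the largest term becomes exp(0) = 1,
which cannot underflow, and the shift is added back outside the log. A
minimal standalone sketch of the same idea (logsumexp_rows is a
hypothetical name, not part of the module):

    import numpy as N

    def logsumexp_rows(x):
        # Row-wise log(sum(exp(x), 1)), computed stably: subtracting the
        # row maximum keeps the largest exponent at 0, so the sum inside
        # the log is always >= 1 and never underflows to zero.
        mc = N.max(x, 1)
        return mc + N.log(N.sum(N.exp(x - mc[:, N.newaxis]), 1))

    # Three entries of -1000: the naive log(sum(exp(x), 1)) underflows to
    # log(0), while the stable form gives -1000 + log(3) ~= -998.9014.
    print logsumexp_rows(N.array([[-1000., -1000., -1000.]]))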

Modified: trunk/Lib/sandbox/pyem/gmm_em.py
===================================================================
--- trunk/Lib/sandbox/pyem/gmm_em.py	2007-06-11 10:34:20 UTC (rev 3097)
+++ trunk/Lib/sandbox/pyem/gmm_em.py	2007-06-12 04:04:14 UTC (rev 3098)
@@ -1,5 +1,5 @@
 # /usr/bin/python
-# Last Change: Mon Jun 11 04:00 PM 2007 J
+# Last Change: Tue Jun 12 11:00 AM 2007 J
 
 """Module implementing GMM, a class to estimate Gaussian mixture models using
 EM, and EM, a class which use GMM instances to estimate models parameters using
@@ -101,8 +101,6 @@
             for i in range(k):
                 va[i*d:i*d+d]   = N.dot( va[i*d:i*d+d], 
                     va[i*d:i*d+d].T)
-            #raise GmmParamError("init_random not implemented for "\
-            #        "mode %s yet" % self.gm.mode)
 
         self.gm.set_param(w, mu, va)
         
@@ -150,11 +148,10 @@
         self.isinit = False
         self.initst = init
 
-    def sufficient_statistics(self, data):
+    def compute_responsabilities(self, data):
         """Compute responsabilities.
         
-        Return normalized and non-normalized sufficient statistics from the
-        model.
+        Return normalized and non-normalized responsibilities for the model.
         
         Note
         ----
@@ -325,11 +322,11 @@
         like    = N.zeros(maxiter)
 
         # Em computation, with computation of the likelihood
-        g, tgd      = model.sufficient_statistics(data)
+        g, tgd      = model.compute_responsabilities(data)
         like[0]     = N.sum(N.log(N.sum(tgd, 1)), axis = 0)
         model.update_em(data, g)
         for i in range(1, maxiter):
-            g, tgd      = model.sufficient_statistics(data)
+            g, tgd      = model.compute_responsabilities(data)
             like[i]     = N.sum(N.log(N.sum(tgd, 1)), axis = 0)
             model.update_em(data, g)
             if has_em_converged(like[i], like[i-1], thresh):
@@ -337,21 +334,6 @@
 
         return like
     
-#def regularize_diag(variance, alpha = _DEF_ALPHA):
-#    delta   = N.sum(variance) / variance.size
-#    if delta > _MIN_DBL_DELTA:
-#        return variance + alpha * delta
-#    else:
-#        return variance + alpha * _MIN_DBL_DELTA
-#
-#def regularize_full(variance):
-#    # Trace of a positive definite matrix is always > 0
-#    delta   = N.trace(variance) / variance.shape[0]
-#    if delta > _MIN_DBL_DELTA:
-#        return variance + alpha * delta
-#    else:
-#        return variance + alpha * _MIN_DBL_DELTA
-
 # Misc functions
 def bic(lk, deg, n):
     """ Expects lk to be log likelihood """
@@ -370,115 +352,6 @@
 
 if __name__ == "__main__":
     pass
-    ## import copy
-    ## #=============================
-    ## # Simple GMM with 5 components
-    ## #=============================
-
-    ## #+++++++++++++++++++++++++++++
-    ## # Meta parameters of the model
-    ## #   - k: Number of components
-    ## #   - d: dimension of each Gaussian
-    ## #   - mode: Mode of covariance matrix: full or diag
-    ## #   - nframes: number of frames (frame = one data point = one
-    ## #   row of d elements
-    ## k       = 2 
-    ## d       = 1
-    ## mode    = 'full'
-    ## nframes = 1e3
-
-    ## #+++++++++++++++++++++++++++++++++++++++++++
-    ## # Create an artificial GMM model, samples it
-    ## #+++++++++++++++++++++++++++++++++++++++++++
-    ## print "Generating the mixture"
-    ## # Generate a model with k components, d dimensions
-    ## w, mu, va   = GM.gen_param(d, k, mode, spread = 3)
-    ## gm          = GM(d, k, mode)
-    ## gm.set_param(w, mu, va)
-
-    ## # Sample nframes frames  from the model
-    ## data    = gm.sample(nframes)
-
-    ## #++++++++++++++++++++++++
-    ## # Learn the model with EM
-    ## #++++++++++++++++++++++++
-
-    ## # Init the model
-    ## print "Init a model for learning, with kmean for initialization"
-    ## lgm = GM(d, k, mode)
-    ## gmm = GMM(lgm, 'kmean')
-    ## gmm.init(data)
-
-    ## # Keep the initialized model for drawing
-    ## gm0 = copy.copy(lgm)
-
-    ## # The actual EM, with likelihood computation
-    ## niter   = 10
-    ## like    = N.zeros(niter)
-
-    ## print "computing..."
-    ## for i in range(niter):
-    ##     g, tgd  = gmm.sufficient_statistics(data)
-    ##     like[i] = N.sum(N.log(N.sum(tgd, 1)), axis = 0)
-    ##     gmm.update_em(data, g)
-    ## # # Alternative form, by using EM class: as the EM class
-    ## # # is quite rudimentary now, it is not very useful, just save
-    ## # # a few lines
-    ## # em      = EM()
-    ## # like    = em.train(data, gmm, niter)
-
-    ## #+++++++++++++++
-    ## # Draw the model
-    ## #+++++++++++++++
-    ## print "drawing..."
-    ## import pylab as P
-    ## P.subplot(2, 1, 1)
-
-    ## if not d == 1:
-    ##     # Draw what is happening
-    ##     P.plot(data[:, 0], data[:, 1], '.', label = '_nolegend_')
-
-    ##     # Real confidence ellipses
-    ##     Xre, Yre  = gm.conf_ellipses()
-    ##     P.plot(Xre[0], Yre[0], 'g', label = 'true confidence ellipsoides')
-    ##     for i in range(1,k):
-    ##         P.plot(Xre[i], Yre[i], 'g', label = '_nolegend_')
-
-    ##     # Initial confidence ellipses as found by kmean
-    ##     X0e, Y0e  = gm0.conf_ellipses()
-    ##     P.plot(X0e[0], Y0e[0], 'k', label = 'initial confidence ellipsoides')
-    ##     for i in range(1,k):
-    ##         P.plot(X0e[i], Y0e[i], 'k', label = '_nolegend_')
-
-    ##     # Values found by EM
-    ##     Xe, Ye  = lgm.conf_ellipses()
-    ##     P.plot(Xe[0], Ye[0], 'r', label = "confidence ellipsoides found by"
-    ##      "EM")
-    ##     for i in range(1,k):
-    ##         P.plot(Xe[i], Ye[i], 'r', label = '_nolegend_')
-    ##     P.legend(loc = 0)
-    ## else:
-    ##     # Real confidence ellipses
-    ##     h   = gm.plot1d()
-    ##     [i.set_color('g') for i in h['pdf']]
-    ##     h['pdf'][0].set_label('true pdf')
-
-    ##     # Initial confidence ellipses as found by kmean
-    ##     h0  = gm0.plot1d()
-    ##     [i.set_color('k') for i in h0['pdf']]
-    ##     h0['pdf'][0].set_label('initial pdf')
-
-    ##     # Values found by EM
-    ##     hl  = lgm.plot1d(fill = 1, level = 0.66)
-    ##     [i.set_color('r') for i in hl['pdf']]
-    ##     hl['pdf'][0].set_label('pdf found by EM')
-
-    ##     P.legend(loc = 0)
-
-    ## P.subplot(2, 1, 2)
-    ## P.plot(like)
-    ## P.title('log likelihood')
-
     ## # #++++++++++++++++++
     ## # # Export the figure
     ## # #++++++++++++++++++
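
As the log message says, logsumexp is not used in the EM loop yet: the
likelihood above is still computed as N.log(N.sum(tgd, 1)) on plain
densities, which can underflow when all component densities at a frame are
tiny. A hedged sketch of what the swap might look like once log-densities
are used (the toy parameters and the import path are assumptions, not
committed code):

    import numpy as N
    import pyem.densities as D

    # Toy 1d mixture with two components (values chosen arbitrarily).
    w  = N.array([0.4, 0.6])
    mu = N.array([[-2.], [3.]])
    va = N.array([[1.], [0.5]])
    data = N.random.randn(100)[:, N.newaxis]

    # (n, k) array of log(w_k) + log-pdf of component k at each frame.
    log_tgd = N.log(w) + D.multiple_gauss_den(data, mu, va, log = True)
    # Total log-likelihood: sum over frames of log(sum_k w_k p_k(x)),
    # computed stably with the new function.
    loglike = N.sum(D.logsumexp(log_tgd), axis = 0)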

Modified: trunk/Lib/sandbox/pyem/tests/test_densities.py
===================================================================
--- trunk/Lib/sandbox/pyem/tests/test_densities.py	2007-06-11 10:34:20 UTC (rev 3097)
+++ trunk/Lib/sandbox/pyem/tests/test_densities.py	2007-06-12 04:04:14 UTC (rev 3098)
@@ -1,5 +1,5 @@
 #! /usr/bin/env python
-# Last Change: Mon Jun 11 07:00 PM 2007 J
+# Last Change: Tue Jun 12 12:00 PM 2007 J
 
 # TODO:
 #   - having "fake tests" to check that all mode (scalar, diag and full) are
@@ -100,6 +100,58 @@
         self._generate_test_data_1d()
         self._test_log(level)
 
+class test_py_logsumexp(TestDensities):
+    """Class to compare logsumexp vs naive implementation."""
+    def test_underflow(self):
+        """Check that logsumexp does not underflow where a naive
+        implementation does."""
+        # First check that the naive implementation would underflow, to be
+        # sure we are actually testing something here.
+        N.seterr(under='raise')
+        try:
+            a = N.array([[-1000]])
+            self.naive_logsumexp(a)
+            raise AssertionError("expected the naive implementation to underflow")
+        except FloatingPointError, e:
+            print "Catching underflow, as expected"
+        assert pyem.densities.logsumexp(a) == -1000.
+        try:
+            a = N.array([[-1000, -1000, -1000]])
+            self.naive_logsumexp(a)
+            raise AssertionError("expected the naive implementation to underflow")
+        except FloatingPointError, e:
+            print "Catching underflow, as expected"
+        assert_array_almost_equal(pyem.densities.logsumexp(a), -998.90138771)
+
+    def naive_logsumexp(self, data):
+        return N.log(N.sum(N.exp(data), 1)) 
+
+    def test_1d(self):
+        data = N.random.randn(10)[:, N.newaxis]
+        mu = N.array([[-5], [-6]])
+        va = N.array([[0.1], [0.1]])
+        y = pyem.densities.multiple_gauss_den(data, mu, va, log = True)
+        a1 = pyem.densities.logsumexp(y)
+        a2 = self.naive_logsumexp(y)
+        assert_array_almost_equal(a1, a2, DEF_DEC)
+
+    def test_2d_full(self):
+        data = N.random.randn(10, 2)
+        mu = N.array([[-3, -1], [3, 3]])
+        va = N.array([[1.1, 0.4], [0.6, 0.8], [0.4, 0.2], [0.3, 0.9]])
+        y = pyem.densities.multiple_gauss_den(data, mu, va, log = True)
+        a1 = pyem.densities.logsumexp(y)
+        a2 = self.naive_logsumexp(y)
+        assert_array_almost_equal(a1, a2, DEF_DEC)
+
+    def test_2d_diag(self):
+        data = N.random.randn(10, 2)
+        mu = N.array([[-3, -1], [3, 3]])
+        va = N.array([[1.1, 0.4], [0.6, 0.8]])
+        y = pyem.densities.multiple_gauss_den(data, mu, va, log = True)
+        a1 = pyem.densities.logsumexp(y)
+        a2 = self.naive_logsumexp(y)
+        assert_array_almost_equal(a1, a2, DEF_DEC)
+
 class test_c_implementation(TestDensities):
     def _test(self, level, decimal = DEF_DEC):
         try:
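
The constant -998.90138771 expected by the underflow test above can be
checked by hand: for k equal entries of value v, log(k * exp(v)) = v +
log(k), so three entries of -1000 give -1000 + log(3). A quick check
(assuming numpy imported as N, as in the tests):

    import numpy as N

    # -1000 + log(3) is what logsumexp([[-1000, -1000, -1000]]) returns;
    # the naive form underflows because exp(-1000) rounds to zero.
    print -1000. + N.log(3.)   # -998.901387711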

Modified: trunk/Lib/sandbox/pyem/tests/test_gmm_em.py
===================================================================
--- trunk/Lib/sandbox/pyem/tests/test_gmm_em.py	2007-06-11 10:34:20 UTC (rev 3097)
+++ trunk/Lib/sandbox/pyem/tests/test_gmm_em.py	2007-06-12 04:04:14 UTC (rev 3098)
@@ -1,5 +1,5 @@
 #! /usr/bin/env python
-# Last Change: Mon Jun 11 06:00 PM 2007 J
+# Last Change: Tue Jun 12 11:00 AM 2007 J
 
 # For now, just test that all mode/dim execute correctly
 



