[Scipy-svn] r3099 - in trunk/Lib/sandbox/pyem: . tests

Tue Jun 12 08:21:14 EDT 2007

Author: cdavid
Date: 2007-06-12 07:21:04 -0500 (Tue, 12 Jun 2007)
New Revision: 3099

Modified:
   trunk/Lib/sandbox/pyem/densities.py
   trunk/Lib/sandbox/pyem/gauss_mix.py
   trunk/Lib/sandbox/pyem/gmm_em.py
   trunk/Lib/sandbox/pyem/tests/test_densities.py
   trunk/Lib/sandbox/pyem/tests/test_gauss_mix.py
   trunk/Lib/sandbox/pyem/tests/test_gmm_em.py
Log:
Add function to compute log responsabilities with logsumexp.

Modified: trunk/Lib/sandbox/pyem/densities.py
===================================================================

--- trunk/Lib/sandbox/pyem/densities.py	2007-06-12 04:04:14 UTC (rev 3098)
+++ trunk/Lib/sandbox/pyem/densities.py	2007-06-12 12:21:04 UTC (rev 3099)
@@ -1,7 +1,7 @@
 #! /usr/bin/python
 #
 # Copyrighted David Cournapeau
-# Last Change: Tue Jun 12 12:00 PM 2007 J
+# Last Change: Tue Jun 12 03:00 PM 2007 J
 """This module implements various basic functions related to multivariate
 gaussian, such as pdf estimation, confidence interval/ellipsoids, etc..."""
 
@@ -167,14 +167,6 @@
     inva    = lin.inv(va)
     fac     = 1 / N.sqrt( (2*N.pi) ** d * N.fabs(lin.det(va)))
 
-    # # Slow version
-    # n       = N.size(x, 0)
-    # y       = N.zeros(n)
-    # for i in range(n):
-    #     y[i] = N.dot(x[i,:],
-    #              N.dot(inva, N.transpose(x[i,:])))
-    # y *= -0.5
-
     # we are using a trick with sum to "emulate" 
     # the matrix multiplication inva * x without any explicit loop
     y   = N.dot((x-mu), inva)

Modified: trunk/Lib/sandbox/pyem/gauss_mix.py
===================================================================
--- trunk/Lib/sandbox/pyem/gauss_mix.py	2007-06-12 04:04:14 UTC (rev 3098)
+++ trunk/Lib/sandbox/pyem/gauss_mix.py	2007-06-12 12:21:04 UTC (rev 3099)
@@ -1,5 +1,5 @@
 # /usr/bin/python
-# Last Change: Mon Jun 11 06:00 PM 2007 J
+# Last Change: Tue Jun 12 03:00 PM 2007 J
 
 """Module implementing GM, a class which represents Gaussian mixtures.
 
@@ -12,7 +12,7 @@
 import numpy as N
 from numpy.random import randn, rand
 import numpy.linalg as lin
-import densities
+import densities as D
 import misc
 
 # Right now, two main usages of a Gaussian Model are possible
@@ -276,13 +276,13 @@
         Ye  = []   
         if self.mode == 'diag':
             for i in range(self.k):
-                xe, ye  = densities.gauss_ell(self.mu[i, :], self.va[i, :], 
+                xe, ye  = D.gauss_ell(self.mu[i, :], self.va[i, :], 
                         dim, npoints, level)
                 Xe.append(xe)
                 Ye.append(ye)
         elif self.mode == 'full':
             for i in range(self.k):
-                xe, ye  = densities.gauss_ell(self.mu[i, :], 
+                xe, ye  = D.gauss_ell(self.mu[i, :], 
                         self.va[i*self.d:i*self.d+self.d, :], 
                         dim, npoints, level)
                 Xe.append(xe)
@@ -317,6 +317,7 @@
             raise GmParamError("Unknown mode")
 
         return True
+
     @classmethod
     def gen_param(cls, d, nc, varmode = 'diag', spread = 1):
         """Generate random, valid parameters for a gaussian mixture model.
@@ -366,6 +367,27 @@
     # def _regularize(self):
     #     raise NotImplemented("No regularization")
 
+    def pdf(self, x, log = False):
+        """Computes the pdf of the model at given points.
+
+        :Parameters:
+            x : ndarray
+                points where to estimate the pdf. One row for one
+                multi-dimensional sample (eg to estimate the pdf at 100
+                different points in 10 dimension, data's shape should be (100,
+                20)).
+            log : bool
+                If true, returns the log pdf instead of the pdf.
+
+        :Returns:
+            y : ndarray
+                the pdf at points x."""
+        if log:
+            return D.logsumexp(N.sum(
+                    D.multiple_gauss_den(x, self.mu, self.va, log = True) * self.w, 1))
+        else:
+            return N.sum(D.multiple_gauss_den(x, self.mu, self.va) * self.w, 1)
+
     #=================
     # Plotting methods
     #=================
@@ -572,7 +594,7 @@
         # XXX refactor computing pdf
         dmu = self.mu[:, dim]
         dva = self._get_va(dim)
-        den = densities.multiple_gauss_den(xdata, dmu, dva) * self.w
+        den = D.multiple_gauss_den(xdata, dmu, dva) * self.w
         den = N.sum(den, 1)
         den = den.reshape(len(rangey), len(rangex))
 

Modified: trunk/Lib/sandbox/pyem/gmm_em.py
===================================================================
--- trunk/Lib/sandbox/pyem/gmm_em.py	2007-06-12 04:04:14 UTC (rev 3098)
+++ trunk/Lib/sandbox/pyem/gmm_em.py	2007-06-12 12:21:04 UTC (rev 3099)
@@ -1,5 +1,5 @@
 # /usr/bin/python
-# Last Change: Tue Jun 12 11:00 AM 2007 J
+# Last Change: Tue Jun 12 08:00 PM 2007 J
 
 """Module implementing GMM, a class to estimate Gaussian mixture models using
 EM, and EM, a class which use GMM instances to estimate models parameters using
@@ -159,7 +159,7 @@
         knowing the explicit data for the Gaussian model (w, mu, var): gamma(t,
         i) = P[state = i | observation = data(t); w, mu, va]
 
-        This is basically the E step of EM for GMM."""
+        This is basically the E step of EM for finite mixtures."""
         # compute the gaussian pdf
         tgd	= densities.multiple_gauss_den(data, self.gm.mu, self.gm.va)
         # multiply by the weight
@@ -169,6 +169,28 @@
 
         return gd, tgd
 
+    def compute_log_responsabilities(self, data):
+        """Compute log responsabilities.
+        
+        Return normalized and non-normalized responsabilities for the model (in
+        the log domain)
+        
+        Note
+        ----
+        Computes the latent variable distribution (a posteriori probability)
+        knowing the explicit data for the Gaussian model (w, mu, var): gamma(t,
+        i) = P[state = i | observation = data(t); w, mu, va]
+
+        This is basically the E step of EM for finite mixtures."""
+        # compute the gaussian pdf
+        tgd	= densities.multiple_gauss_den(data, self.gm.mu, self.gm.va, log = True)
+        # multiply by the weight
+        tgd	+= N.log(self.gm.w)
+        # Normalize to get a pdf
+        gd	= tgd  - densities.logsumexp(tgd)[:, N.newaxis]
+
+        return gd, tgd
+
     def update_em(self, data, gamma):
         """Computes update of the Gaussian Mixture Model (M step)
         from the a posteriori pdf, computed by gmm_posterior

Modified: trunk/Lib/sandbox/pyem/tests/test_densities.py
===================================================================
--- trunk/Lib/sandbox/pyem/tests/test_densities.py	2007-06-12 04:04:14 UTC (rev 3098)
+++ trunk/Lib/sandbox/pyem/tests/test_densities.py	2007-06-12 12:21:04 UTC (rev 3099)
@@ -1,5 +1,5 @@
 #! /usr/bin/env python
-# Last Change: Tue Jun 12 12:00 PM 2007 J
+# Last Change: Tue Jun 12 08:00 PM 2007 J
 
 # TODO:
 #   - having "fake tests" to check that all mode (scalar, diag and full) are
@@ -21,7 +21,7 @@
 # import modules that are located in the same directory as this file.
 restore_path()
 
-DEF_DEC = 12
+from testcommon import DEF_DEC
 
 class TestDensities(NumpyTestCase):
     def _generate_test_data_1d(self):

Modified: trunk/Lib/sandbox/pyem/tests/test_gauss_mix.py
===================================================================
--- trunk/Lib/sandbox/pyem/tests/test_gauss_mix.py	2007-06-12 04:04:14 UTC (rev 3098)
+++ trunk/Lib/sandbox/pyem/tests/test_gauss_mix.py	2007-06-12 12:21:04 UTC (rev 3099)
@@ -1,5 +1,5 @@
 #! /usr/bin/env python
-# Last Change: Mon Jun 11 03:00 PM 2007 J
+# Last Change: Tue Jun 12 03:00 PM 2007 J
 
 # For now, just test that all mode/dim execute correctly
 
@@ -10,6 +10,7 @@
 
 set_package_path()
 from pyem import GM
+from pyem.densities import multiple_gauss_den
 restore_path()
 
 class test_BasicFunc(NumpyTestCase):
@@ -58,6 +59,16 @@
         sva = gm._get_va(dim)
         assert N.all(sva == tva)
 
+    def test_2d_diag_pdf(self):
+        d = 2
+        w = N.array([0.4, 0.6])
+        mu = N.array([[0., 2], [-1, -2]])
+        va = N.array([[1, 0.5], [0.5, 1]])
+        x = N.random.randn(100, 2)
+        gm = GM.fromvalues(w, mu, va)
+        y1 = N.sum(multiple_gauss_den(x, mu, va) * w, 1)
+        y2 = gm.pdf(x)
+        assert_array_almost_equal(y1, y2)
 
 if __name__ == "__main__":
     NumpyTest().run()

Modified: trunk/Lib/sandbox/pyem/tests/test_gmm_em.py
===================================================================
--- trunk/Lib/sandbox/pyem/tests/test_gmm_em.py	2007-06-12 04:04:14 UTC (rev 3098)
+++ trunk/Lib/sandbox/pyem/tests/test_gmm_em.py	2007-06-12 12:21:04 UTC (rev 3099)
@@ -1,5 +1,5 @@
 #! /usr/bin/env python
-# Last Change: Tue Jun 12 11:00 AM 2007 J
+# Last Change: Tue Jun 12 09:00 PM 2007 J
 
 # For now, just test that all mode/dim execute correctly
 
@@ -12,6 +12,8 @@
 from pyem import GMM, GM, EM
 restore_path()
 
+from testcommon import DEF_DEC
+
 def load_dataset(filename):
     from scipy.io import loadmat
     dic = loadmat(filename, squeeze_me = False)
@@ -162,5 +164,46 @@
         assert_array_equal(gmm.gm.mu, dic['mu'])
         assert_array_equal(gmm.gm.va, dic['va'])
 
+class test_log_domain(EmTest):
+    """This class tests whether the GMM works in log domain."""
+    def _test_common(self, d, k, mode):
+        dic = load_dataset('%s_%dd_%dk.mat' % (mode, d, k))
+
+        gm = GM.fromvalues(dic['w0'], dic['mu0'], dic['va0'])
+        gmm = GMM(gm, 'test')
+
+        a, na = gmm.compute_responsabilities(dic['data'])
+        la, nla = gmm.compute_log_responsabilities(dic['data'])
+
+        ta = N.log(a)
+        tna = N.log(na)
+        if not N.all(N.isfinite(ta)):
+            print "precision problem for %s, %dd, %dk, need fixing" % (mode, d, k)
+        else:
+            assert_array_almost_equal(ta, la, DEF_DEC)
+
+        if not N.all(N.isfinite(tna)):
+            print "precision problem for %s, %dd, %dk, need fixing" % (mode, d, k)
+        else:
+            assert_array_almost_equal(tna, nla, DEF_DEC)
+
+    def test_2d_diag(self, level = 1):
+        d = 2
+        k = 3
+        mode = 'diag'
+        self._test_common(d, k, mode)
+
+    def test_1d_full(self, level = 1):
+        d = 1
+        k = 4
+        mode = 'diag'
+        self._test_common(d, k, mode)
+
+    def test_2d_full(self, level = 1):
+        d = 2
+        k = 3
+        mode = 'full'
+        self._test_common(d, k, mode)
+
 if __name__ == "__main__":
     NumpyTest().run()