[Scipy-svn] r2317 - in trunk/Lib/sandbox/pyem: . profile_data tests

scipy-svn at scipy.org
Thu Nov 16 00:41:54 EST 2006


Author: cdavid
Date: 2006-11-15 23:41:35 -0600 (Wed, 15 Nov 2006)
New Revision: 2317

Added:
   trunk/Lib/sandbox/pyem/misc.py
   trunk/Lib/sandbox/pyem/test_reg.py
Modified:
   trunk/Lib/sandbox/pyem/Changelog
   trunk/Lib/sandbox/pyem/TODO
   trunk/Lib/sandbox/pyem/_c_densities.py
   trunk/Lib/sandbox/pyem/densities.py
   trunk/Lib/sandbox/pyem/gauss_mix.py
   trunk/Lib/sandbox/pyem/gmm_em.py
   trunk/Lib/sandbox/pyem/info.py
   trunk/Lib/sandbox/pyem/profile_data/profile_densities.py
   trunk/Lib/sandbox/pyem/profile_data/profile_gmm.py
   trunk/Lib/sandbox/pyem/setup.py
   trunk/Lib/sandbox/pyem/tests/test_densities.py
Log:
 * bump to 0.5.6
 * various cosmetic changes



Modified: trunk/Lib/sandbox/pyem/Changelog
===================================================================
--- trunk/Lib/sandbox/pyem/Changelog	2006-11-10 15:29:03 UTC (rev 2316)
+++ trunk/Lib/sandbox/pyem/Changelog	2006-11-16 05:41:35 UTC (rev 2317)
@@ -1,3 +1,11 @@
+pyem (0.5.6) Thu, 16 Nov 2006 14:18:19 +0900
+
+	* bump to 0.5.6
+	* Add __str__ and __repr__ for GM and GMM classes
+	* Add regularization method (but not used yet).
+	* Change '<f8' to N.float64 for ctypes-enabled densities
+	* Move 'Magic numbers' into a separate Python file, misc.py
+
 pyem (0.5.5) Tue, 24 Oct 2006 18:30:54 +0900
 
 	* Fix a bug in multiple_gaussian_den which prevents 

Modified: trunk/Lib/sandbox/pyem/TODO
===================================================================
--- trunk/Lib/sandbox/pyem/TODO	2006-11-10 15:29:03 UTC (rev 2316)
+++ trunk/Lib/sandbox/pyem/TODO	2006-11-16 05:41:35 UTC (rev 2317)
@@ -1,12 +1,12 @@
-# Last Change: Fri Oct 20 12:00 PM 2006 J
+# Last Change: Thu Nov 09 06:00 PM 2006 J
 
 Things which must be implemented for a 1.0 version (in importance order)
-    - A class for learning + a classifier
-    - test for various length and model size of gaussian densities/GM and GMM
-    - a small help/tutorial
-    - be complient with scipy module dev guidelines (DEVELOPERS.TXT)
+    - A classifier
+    - basic regularization
 
 Things which would be nice (after 1.0 version):
+    - Bayes prior (hard, suppose MCMC)
+    - variational Bayes (hard ? Not sure yet)
     - Integrate libem (libem should be modified so
     that it would be easy to package and distribute)
     - Other initialization schemes

Modified: trunk/Lib/sandbox/pyem/_c_densities.py
===================================================================
--- trunk/Lib/sandbox/pyem/_c_densities.py	2006-11-10 15:29:03 UTC (rev 2316)
+++ trunk/Lib/sandbox/pyem/_c_densities.py	2006-11-16 05:41:35 UTC (rev 2317)
@@ -1,7 +1,7 @@
 #! /usr/bin/python
 #
 # Copyrighted David Cournapeau
-# Last Change: Thu Oct 19 06:00 PM 2006 J
+# Last Change: Thu Nov 09 05:00 PM 2006 J
 
 # This module uses a C implementation through ctypes, for diagonal cases
 # TODO:
@@ -26,12 +26,12 @@
 
 # Requirements for diag gden
 _gden   = load_library('c_gden.so', __file__)
-arg1    = ndpointer(dtype='<f8')
+arg1    = ndpointer(dtype=N.float64)
 arg2    = c_uint
 arg3    = c_uint
-arg4    = ndpointer(dtype='<f8')
-arg5    = ndpointer(dtype='<f8')
-arg6    = ndpointer(dtype='<f8')
+arg4    = ndpointer(dtype=N.float64)
+arg5    = ndpointer(dtype=N.float64)
+arg6    = ndpointer(dtype=N.float64)
 _gden.gden_diag.argtypes    = [arg1, arg2, arg3, arg4, arg5, arg6]
 _gden.gden_diag.restype     = c_int
 

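Note on the dtype change above: '<f8' explicitly names a little-endian 8-byte float, whereas N.float64 is the native double on any platform, so the ctypes argtype declarations no longer assume a byte order. A minimal sketch of the equivalence (not part of the commit; the comparison only holds on little-endian hosts):

    import numpy as N

    # '<f8' and N.float64 describe the same dtype on a little-endian machine;
    # N.float64 remains correct on big-endian platforms as well.
    print N.dtype('<f8') == N.dtype(N.float64)
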
Modified: trunk/Lib/sandbox/pyem/densities.py
===================================================================
--- trunk/Lib/sandbox/pyem/densities.py	2006-11-10 15:29:03 UTC (rev 2316)
+++ trunk/Lib/sandbox/pyem/densities.py	2006-11-16 05:41:35 UTC (rev 2317)
@@ -1,7 +1,7 @@
 #! /usr/bin/python
 #
 # Copyrighted David Cournapeau
-# Last Change: Thu Oct 05 07:00 PM 2006 J
+# Last Change: Fri Nov 10 10:00 AM 2006 J
 
 import numpy as N
 import numpy.linalg as lin

Modified: trunk/Lib/sandbox/pyem/gauss_mix.py
===================================================================
--- trunk/Lib/sandbox/pyem/gauss_mix.py	2006-11-10 15:29:03 UTC (rev 2316)
+++ trunk/Lib/sandbox/pyem/gauss_mix.py	2006-11-16 05:41:35 UTC (rev 2317)
@@ -1,5 +1,5 @@
 # /usr/bin/python
-# Last Change: Tue Oct 03 06:00 PM 2006 J
+# Last Change: Thu Nov 09 06:00 PM 2006 J
 
 # Module to implement GaussianMixture class.
 
@@ -7,19 +7,14 @@
 from numpy.random import randn, rand
 import numpy.linalg as lin
 import densities
+from misc import _MAX_DBL_DEV
 
-MAX_DEV     = 1e-10
-MAX_COND    = 1e10
-
 # Right now, two main usages of a Gaussian Model are possible
 #   - init a Gaussian Model with meta-parameters, and trains it
 #   - set-up a Gaussian Model to sample it, draw ellipsoides 
 #   of confidences. In this case, we would like to init it with
 #   known values of parameters. This can be done with the class method 
 #   fromval
-#
-#   For now, we have to init with meta-parameters, and set 
-#   the parameters afterward. There should be a better way ?
 
 # TODO:
 #   - change bounds methods of GM class instanciations so that it cannot 
@@ -48,16 +43,20 @@
     """Gaussian Mixture class. This is a simple container class
     to hold Gaussian Mixture parameters (weights, mean, etc...).
     It can also draw itself (confidence ellipses) and samples itself.
+    """
 
-    Is initiated by giving dimension, number of components and 
-    covariance mode"""
-
     # I am not sure it is useful to have a spherical mode...
     _cov_mod    = ['diag', 'full']
 
+    #===============================
+    # Methods to construct a mixture
+    #===============================
     def __init__(self, d, k, mode = 'diag'):
-        """Init a Gaussian model of k components, each component being a 
-        d multi-variate Gaussian, with covariance matrix of style mode"""
+        """Init a Gaussian Mixture of k components, each component being a 
+        d multi-variate Gaussian, with covariance matrix of style mode.
+        
+        If you want to build a Gaussian Mixture with known weights, means
+        and variances, you can use the GM.fromvalues method directly"""
         if mode not in self._cov_mod:
             raise GmParamError("mode %s not recognized" + str(mode))
 
@@ -116,6 +115,9 @@
         res.set_param(weights, mu, sigma)
         return res
         
+    #=====================================================
+    # Fundamental facilities (sampling, confidence, etc..)
+    #=====================================================
     def sample(self, nframes):
         """ Sample nframes frames from the model """
         if not self.is_valid:
@@ -242,6 +244,15 @@
 
     gen_param = classmethod(gen_param)
 
+    #=======================
+    # Regularization methods
+    #=======================
+    def _regularize(self):
+        raise NotImplementedError("No regularization")
+
+    #=================
+    # Plotting methods
+    #=================
     def plot(self, *args, **kargs):
         """Plot the ellipsoides directly for the model
         
@@ -327,6 +338,22 @@
         except ImportError:
             raise GmParamError("matplotlib not found, cannot plot...")
 
+    # Syntactic sugar
+    def __repr__(self):
+        repr    = ""
+        repr    += "Gaussian Mixture:\n"
+        repr    += " -> %d dimensions\n" % self.d
+        repr    += " -> %d components\n" % self.k
+        repr    += " -> %s covariance \n" % self.mode
+        if self.is_valid:
+            repr    += "Has initial values"
+        else:
+            repr    += "Has no initial values yet"
+        return repr
+
+    def __str__(self):
+        return self.__repr__()
+
 # Function to generate a random index: this is kept outside any class,
 # as the function can be useful for other
 def gen_rand_index(p, n):
@@ -366,7 +393,7 @@
     """
         
     # Check that w is valid
-    if N.fabs(N.sum(w, 0)  - 1) > MAX_DEV:
+    if N.fabs(N.sum(w, 0)  - 1) > _MAX_DBL_DEV:
         raise GmParamError('weight does not sum to 1')
     
     if not len(w.shape) == 1:

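A quick sketch of what the new __repr__/__str__ reports (illustration only, using the in-tree import style that test_reg.py below also uses):

    from gauss_mix import GM

    gm = GM(2, 3, 'diag')   # 2-D mixture of 3 diagonal Gaussians, no parameters set
    print gm                # prints dimensions, components, covariance mode,
                            # and "Has no initial values yet"
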
Modified: trunk/Lib/sandbox/pyem/gmm_em.py
===================================================================
--- trunk/Lib/sandbox/pyem/gmm_em.py	2006-11-10 15:29:03 UTC (rev 2316)
+++ trunk/Lib/sandbox/pyem/gmm_em.py	2006-11-16 05:41:35 UTC (rev 2317)
@@ -1,5 +1,5 @@
 # /usr/bin/python
-# Last Change: Tue Oct 24 06:00 PM 2006 J
+# Last Change: Thu Nov 16 02:00 PM 2006 J
 
 # TODO:
 #   - which methods to avoid va shrinking to 0 ? There are several options, 
@@ -15,6 +15,8 @@
 from kmean import kmean
 from gauss_mix import GM
 
+from misc import _DEF_ALPHA, _MIN_DBL_DELTA, _MIN_INV_COND
+
 # Error classes
 class GmmError(Exception):
     """Base class for exceptions in this module."""
@@ -38,12 +40,9 @@
 # sense to use inheritance for # interface specification in python, since its 
 # dynamic type systeme.
 
-# Anyway, a mixture class should encapsulates all details concerning a mixture model:
-#   - internal parameters for the pdfs
-#   - can compute sufficient statistics for EM
-#   - can sample a model
-#   - can generate random valid parameters for a new pdf (using class method)
-class MixtureModel:
+# Anyway, a mixture model class should encapsulate all details
+# concerning getting sufficient statistics (SS), likelihood and bic.
+class MixtureModel(object):
     pass
 
 class ExpMixtureModel(MixtureModel):
@@ -90,7 +89,7 @@
         """ Init the model at random."""
         k   = self.gm.k
         d   = self.gm.d
-        if mode == 'diag':
+        if self.gm.mode == 'diag':
             w   = N.ones(k) / k
             mu  = randn(k, d)
             va  = N.fabs(randn(k, d))
@@ -120,6 +119,7 @@
 
         self.init   = init_methods[init]
         self.isinit = False
+        self.initst = init
 
     def sufficient_statistics(self, data):
         """ Return normalized and non-normalized sufficient statistics
@@ -168,7 +168,10 @@
         elif self.gm.mode == 'full':
             # In full mode, this is the bottleneck: the triple loop
             # kills performances. This is pretty straightforward
-            # algebra, so computing it in C should not be too difficult
+            # algebra, so computing it in C should not be too difficult. The
+            # real problem is to have valid covariance matrices, and to keep
+            # them positive definite, maybe with special storage... Not sure
+            # it is really worth the risk
             mu  = N.zeros((k, d))
             va  = N.zeros((k*d, d))
 
@@ -239,6 +242,13 @@
         n   = N.shape(data)[0]
         return bic(lk, free_deg, n)
 
+    # syntactic sugar
+    def __repr__(self):
+        repre   = ""
+        repre   += "Gaussian Mixture Model\n"
+        repre   += " -> initialized by %s\n" % str(self.initst)
+        repre   += self.gm.__repr__()
+        return repre
 
 class EM:
     """An EM trainer. An EM trainer
@@ -265,6 +275,8 @@
         Returns:
             likelihood (one value per iteration).
         """
+        if not isinstance(model, MixtureModel):
+            raise TypeError("expect a MixtureModel as a model")
 
         # Initialize the data (may do nothing depending on the model)
         model.init(data)
@@ -282,9 +294,62 @@
             model.update_em(data, g)
             if has_em_converged(like[i], like[i-1], thresh):
                 return like[0:i]
+        # # Em computation, with computation of the likelihood
+        # g, tgd      = model.sufficient_statistics(data)
+        # like[0]     = N.sum(N.log(N.sum(tgd, 1)), axis = 0)
+        # model.update_em(data, g)
+        # for i in range(1, maxiter):
+        #     print "=== Iteration %d ===" % i
+        #     isreg   = False
+        #     for j in range(model.gm.k):
+        #         va  = model.gm.va[j]
+        #         if va.any() < _MIN_INV_COND:
+        #             isreg   = True
+        #             print "\tregularization detected"
+        #             print "\t" + str(va)
+        #             model.gm.va[j]  = regularize_diag(va)
+        #             print "\t" + str(va) + ", " + str(model.gm.va[j])
+        #             print "\t" + str(gauss_den(data, model.gm.mu[j], model.gm.va[j]))
+        #             print "\tend regularization detected"
+        #             var = va
+        #         
+        #     g, tgd      = model.sufficient_statistics(data)
+        #     try:
+        #         assert not( (N.isnan(tgd)).any() )
+        #         if isreg:
+        #             print var
+        #     except AssertionError:
+        #         print "tgd is nan..."
+        #         print model.gm.va[13,:]
+        #         print 1/model.gm.va[13,:]
+        #         print densities.gauss_den(data, model.gm.mu[13], model.gm.va[13])
+        #         print N.isnan((multiple_gauss_den(data, model.gm.mu, model.gm.va))).any()
+        #         print "Exciting"
+        #         import sys
+        #         sys.exit(-1)
+        #     like[i]     = N.sum(N.log(N.sum(tgd, 1)), axis = 0)
+        #     model.update_em(data, g)
+        #     assert not( model.gm.va.any() < 1e-6)
+        #     if has_em_converged(like[i], like[i-1], thresh):
+        #         return like[0:i]
 
         return like
     
+def regularize_diag(variance, alpha = _DEF_ALPHA):
+    delta   = N.sum(variance) / variance.size
+    if delta > _MIN_DBL_DELTA:
+        return variance + alpha * delta
+    else:
+        return variance + alpha * _MIN_DBL_DELTA
+
+def regularize_full(variance, alpha = _DEF_ALPHA):
+    # Trace of a positive definite matrix is always > 0
+    delta   = N.trace(variance) / variance.shape[0]
+    if delta > _MIN_DBL_DELTA:
+        return variance + alpha * delta
+    else:
+        return variance + alpha * _MIN_DBL_DELTA
+
 # Misc functions
 def bic(lk, deg, n):
     """ Expects lk to be log likelihood """

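regularize_diag above adds alpha times the average variance to every diagonal element (with a floor when that average is itself below _MIN_DBL_DELTA), which is the mechanism the commented-out block in EM.train experiments with to keep variances from collapsing. A usage sketch, assuming the scipy.sandbox.pyem install path used by the profile scripts:

    import numpy as N
    from scipy.sandbox.pyem.gmm_em import regularize_diag

    va = N.array([1e-12, 0.5, 2.0])   # one variance has collapsed towards zero
    print regularize_diag(va)         # each entry gets + _DEF_ALPHA * mean(va)
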
Modified: trunk/Lib/sandbox/pyem/info.py
===================================================================
--- trunk/Lib/sandbox/pyem/info.py	2006-11-10 15:29:03 UTC (rev 2316)
+++ trunk/Lib/sandbox/pyem/info.py	2006-11-16 05:41:35 UTC (rev 2317)
@@ -60,7 +60,7 @@
 Copyright: David Cournapeau 2006
 License: BSD-style (see LICENSE.txt in main source directory)
 """
-version = '0.5.5'
+version = '0.5.6'
 
 depends = ['linalg', 'stats']
 ignore  = False

Added: trunk/Lib/sandbox/pyem/misc.py
===================================================================
--- trunk/Lib/sandbox/pyem/misc.py	2006-11-10 15:29:03 UTC (rev 2316)
+++ trunk/Lib/sandbox/pyem/misc.py	2006-11-16 05:41:35 UTC (rev 2317)
@@ -0,0 +1,21 @@
+# Last Change: Fri Nov 10 10:00 AM 2006 J
+
+#=====================================================================
+# "magic number", that is number used to control regularization and co
+# Change them at your risk !
+#=====================================================================
+
+# max deviation allowed when comparing doubles (this is actually stupid,
+# I should use a number of decimals instead)
+_MAX_DBL_DEV    = 1e-10
+
+# max conditional number allowed
+_MAX_COND       = 1e8
+_MIN_INV_COND   = 1/_MAX_COND
+
+# Default alpha for regularization
+_DEF_ALPHA  = 1e-1
+
+# Default min delta for regularization
+_MIN_DBL_DELTA  = 1e-5
+

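For reference, _MAX_DBL_DEV is the tolerance gauss_mix.py now imports for its weight-normalization check; a minimal sketch of that use:

    import numpy as N
    from misc import _MAX_DBL_DEV   # in-package import, as gauss_mix.py does

    w = N.array([0.2, 0.3, 0.5])
    print N.fabs(N.sum(w, 0) - 1) < _MAX_DBL_DEV   # weights sum to 1 within tolerance
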
Modified: trunk/Lib/sandbox/pyem/profile_data/profile_densities.py
===================================================================
--- trunk/Lib/sandbox/pyem/profile_data/profile_densities.py	2006-11-10 15:29:03 UTC (rev 2316)
+++ trunk/Lib/sandbox/pyem/profile_data/profile_densities.py	2006-11-16 05:41:35 UTC (rev 2317)
@@ -1,7 +1,7 @@
 import numpy as N
 from numpy.random import randn
-from pyem import densities as D
-from pyem import _c_densities as DC
+from scipy.sandbox.pyem import densities as D
+from scipy.sandbox.pyem import _c_densities as DC
 #import tables
 
 def bench(func, mode = 'diag'):

Modified: trunk/Lib/sandbox/pyem/profile_data/profile_gmm.py
===================================================================
--- trunk/Lib/sandbox/pyem/profile_data/profile_gmm.py	2006-11-10 15:29:03 UTC (rev 2316)
+++ trunk/Lib/sandbox/pyem/profile_data/profile_gmm.py	2006-11-16 05:41:35 UTC (rev 2317)
@@ -1,16 +1,14 @@
 import numpy as N
-from pyem import GM, GMM
+from scipy.sandbox.pyem import GM, GMM
 import copy
 
-from pyem._c_densities import gauss_den
-
 def bench1(mode = 'diag'):
     #===========================================
     # GMM of 20 comp, 20 dimension, 1e4 frames
     #===========================================
     d       = 15
     k       = 30
-    nframes = 1e4
+    nframes = 1e5
     niter   = 10
     mode    = 'diag'
 

Modified: trunk/Lib/sandbox/pyem/setup.py
===================================================================
--- trunk/Lib/sandbox/pyem/setup.py	2006-11-10 15:29:03 UTC (rev 2316)
+++ trunk/Lib/sandbox/pyem/setup.py	2006-11-16 05:41:35 UTC (rev 2317)
@@ -1,5 +1,5 @@
 #! /usr/bin/env python
-# Last Change: Thu Oct 19 07:00 PM 2006 J
+# Last Change: Thu Nov 09 06:00 PM 2006 J
 # TODO:
 #   - check how to handle cmd line build options with distutils and use
 #   it in the building process
@@ -16,14 +16,14 @@
 DESCRIPTION ='A python module for Expectation Maximization learning of mixtures pdf',
 AUTHOR      ='David Cournapeau',
 AUTHOR_EMAIL='david at ar.media.kyoto-u.ac.jp',
-URL         ='http://ar.media.kyoto-u.ac.jp/members/david',
+URL         ='http://ar.media.kyoto-u.ac.jp/members/david/softwares/pyem',
 
 def configuration(parent_package='',top_path=None, package_name='pyem'):
     from numpy.distutils.misc_util import Configuration
     config = Configuration(package_name,parent_package,top_path,
              version     = VERSION)
     config.add_data_dir('tests')
-    config.add_subpackage('profile_data')
+    config.add_data_dir('profile_data')
     config.add_extension('c_gden',
                          #define_macros=[('LIBSVM_EXPORTS', None),
                          #               ('LIBSVM_DLL', None)],
@@ -34,7 +34,8 @@
 if __name__ == "__main__":
     from numpy.distutils.core import setup
     #setup(**configuration(top_path='').todict())
-    setup(**configuration(top_path='',))
+    #setup(**configuration(top_path=''))
+    setup(configuration=configuration)
 # from distutils.core import setup, Extension
 # from pyem import version as pyem_version
 # 

Added: trunk/Lib/sandbox/pyem/test_reg.py
===================================================================
--- trunk/Lib/sandbox/pyem/test_reg.py	2006-11-10 15:29:03 UTC (rev 2316)
+++ trunk/Lib/sandbox/pyem/test_reg.py	2006-11-16 05:41:35 UTC (rev 2317)
@@ -0,0 +1,44 @@
+import numpy as N
+
+from gauss_mix import GM
+from gmm_em import GMM, EM
+
+from numpy.random import seed
+
+def test_reg():
+    seed(0)
+    # Generate data with a few components
+    d   = 2
+    k   = 1
+    n   = 500
+
+    w, mu, va   = GM.gen_param(d, k)
+    gm          = GM.fromvalues(w, mu, va)
+
+    data    = gm.sample(n)
+
+    # Try to learn with an insane number of components
+    gmm = GMM(GM(d, 30), 'random')
+
+    em  = EM()
+    like= em.train(data, gmm, 20, 1e-20)
+
+    # import pylab as P
+    # P.subplot(2, 1, 1)
+    # P.plot(data[:, 0], data[:, 1], '.')
+    # gmm.gm.plot()
+    # P.subplot(2, 1, 2)
+    # P.plot(like)
+    # print like
+    # P.show()
+
+if __name__ == "__main__":
+    # import hotshot, hotshot.stats
+    # profile_file    = 'manyk.prof'
+    # prof    = hotshot.Profile(profile_file, lineevents=1)
+    # prof.runcall(test_reg)
+    # p = hotshot.stats.load(profile_file)
+    # print p.sort_stats('cumulative').print_stats(20)
+    # prof.close()
+    test_reg()
+

Modified: trunk/Lib/sandbox/pyem/tests/test_densities.py
===================================================================
--- trunk/Lib/sandbox/pyem/tests/test_densities.py	2006-11-10 15:29:03 UTC (rev 2316)
+++ trunk/Lib/sandbox/pyem/tests/test_densities.py	2006-11-16 05:41:35 UTC (rev 2317)
@@ -1,5 +1,5 @@
 #! /usr/bin/env python
-# Last Change: Thu Oct 19 07:00 PM 2006 J
+# Last Change: Thu Nov 09 05:00 PM 2006 J
 
 # TODO:
 #   - having "fake tests" to check that all mode (scalar, diag and full) are
