[Scipy-svn] r2317 - in trunk/Lib/sandbox/pyem: . profile_data tests
scipy-svn at scipy.org
scipy-svn at scipy.org
Thu Nov 16 00:41:54 EST 2006
Author: cdavid
Date: 2006-11-15 23:41:35 -0600 (Wed, 15 Nov 2006)
New Revision: 2317
Added:
trunk/Lib/sandbox/pyem/misc.py
trunk/Lib/sandbox/pyem/test_reg.py
Modified:
trunk/Lib/sandbox/pyem/Changelog
trunk/Lib/sandbox/pyem/TODO
trunk/Lib/sandbox/pyem/_c_densities.py
trunk/Lib/sandbox/pyem/densities.py
trunk/Lib/sandbox/pyem/gauss_mix.py
trunk/Lib/sandbox/pyem/gmm_em.py
trunk/Lib/sandbox/pyem/info.py
trunk/Lib/sandbox/pyem/profile_data/profile_densities.py
trunk/Lib/sandbox/pyem/profile_data/profile_gmm.py
trunk/Lib/sandbox/pyem/setup.py
trunk/Lib/sandbox/pyem/tests/test_densities.py
Log:
* bump to 0.5.6
* various cosmetic changes
Modified: trunk/Lib/sandbox/pyem/Changelog
===================================================================
--- trunk/Lib/sandbox/pyem/Changelog 2006-11-10 15:29:03 UTC (rev 2316)
+++ trunk/Lib/sandbox/pyem/Changelog 2006-11-16 05:41:35 UTC (rev 2317)
@@ -1,3 +1,11 @@
+pyem (0.5.6) Thu, 16 Nov 2006 14:18:19 +0900
+
+ * bump to 0.5.6
+ * Add __str__ and __repr__ for GM and GMM classes
+ * Add regularization method (but not used yet).
+ * Change 'f<8' to N.float64 for ctype enabled densities
+ * Move 'Magic numbers' into a separated python file, misc.py
+
pyem (0.5.5) Tue, 24 Oct 2006 18:30:54 +0900
* Fix a bug inmultiple_gaussian_den which prevents
Modified: trunk/Lib/sandbox/pyem/TODO
===================================================================
--- trunk/Lib/sandbox/pyem/TODO 2006-11-10 15:29:03 UTC (rev 2316)
+++ trunk/Lib/sandbox/pyem/TODO 2006-11-16 05:41:35 UTC (rev 2317)
@@ -1,12 +1,12 @@
-# Last Change: Fri Oct 20 12:00 PM 2006 J
+# Last Change: Thu Nov 09 06:00 PM 2006 J
Things which must be implemented for a 1.0 version (in importante order)
- - A class for learning + a classifier
- - test for various length and model size of gaussian densities/GM and GMM
- - a small help/tutorial
- - be complient with scipy module dev guidelines (DEVELOPERS.TXT)
+ - A classifier
+ - basic regularization
Things which would be nice (after 1.0 version):
+ - Bayes prior (hard, suppose MCMC)
+ - variational Bayes (hard ? Not sure yet)
- Integrate libem (libem should be modified so
that it would be easy to package and distribute)
- Other initialization schemes
Modified: trunk/Lib/sandbox/pyem/_c_densities.py
===================================================================
--- trunk/Lib/sandbox/pyem/_c_densities.py 2006-11-10 15:29:03 UTC (rev 2316)
+++ trunk/Lib/sandbox/pyem/_c_densities.py 2006-11-16 05:41:35 UTC (rev 2317)
@@ -1,7 +1,7 @@
#! /usr/bin/python
#
# Copyrighted David Cournapeau
-# Last Change: Thu Oct 19 06:00 PM 2006 J
+# Last Change: Thu Nov 09 05:00 PM 2006 J
# This module uses a C implementation through ctypes, for diagonal cases
# TODO:
@@ -26,12 +26,12 @@
# Requirements for diag gden
_gden = load_library('c_gden.so', __file__)
-arg1 = ndpointer(dtype='<f8')
+arg1 = ndpointer(dtype=N.float64)
arg2 = c_uint
arg3 = c_uint
-arg4 = ndpointer(dtype='<f8')
-arg5 = ndpointer(dtype='<f8')
-arg6 = ndpointer(dtype='<f8')
+arg4 = ndpointer(dtype=N.float64)
+arg5 = ndpointer(dtype=N.float64)
+arg6 = ndpointer(dtype=N.float64)
_gden.gden_diag.argtypes = [arg1, arg2, arg3, arg4, arg5, arg6]
_gden.gden_diag.restype = c_int
Modified: trunk/Lib/sandbox/pyem/densities.py
===================================================================
--- trunk/Lib/sandbox/pyem/densities.py 2006-11-10 15:29:03 UTC (rev 2316)
+++ trunk/Lib/sandbox/pyem/densities.py 2006-11-16 05:41:35 UTC (rev 2317)
@@ -1,7 +1,7 @@
#! /usr/bin/python
#
# Copyrighted David Cournapeau
-# Last Change: Thu Oct 05 07:00 PM 2006 J
+# Last Change: Fri Nov 10 10:00 AM 2006 J
import numpy as N
import numpy.linalg as lin
Modified: trunk/Lib/sandbox/pyem/gauss_mix.py
===================================================================
--- trunk/Lib/sandbox/pyem/gauss_mix.py 2006-11-10 15:29:03 UTC (rev 2316)
+++ trunk/Lib/sandbox/pyem/gauss_mix.py 2006-11-16 05:41:35 UTC (rev 2317)
@@ -1,5 +1,5 @@
# /usr/bin/python
-# Last Change: Tue Oct 03 06:00 PM 2006 J
+# Last Change: Thu Nov 09 06:00 PM 2006 J
# Module to implement GaussianMixture class.
@@ -7,19 +7,14 @@
from numpy.random import randn, rand
import numpy.linalg as lin
import densities
+from misc import _MAX_DBL_DEV
-MAX_DEV = 1e-10
-MAX_COND = 1e10
-
# Right now, two main usages of a Gaussian Model are possible
# - init a Gaussian Model with meta-parameters, and trains it
# - set-up a Gaussian Model to sample it, draw ellipsoides
# of confidences. In this case, we would like to init it with
# known values of parameters. This can be done with the class method
# fromval
-#
-# For now, we have to init with meta-parameters, and set
-# the parameters afterward. There should be a better way ?
# TODO:
# - change bounds methods of GM class instanciations so that it cannot
@@ -48,16 +43,20 @@
"""Gaussian Mixture class. This is a simple container class
to hold Gaussian Mixture parameters (weights, mean, etc...).
It can also draw itself (confidence ellipses) and samples itself.
+ """
- Is initiated by giving dimension, number of components and
- covariance mode"""
-
# I am not sure it is useful to have a spherical mode...
_cov_mod = ['diag', 'full']
+ #===============================
+ # Methods to construct a mixture
+ #===============================
def __init__(self, d, k, mode = 'diag'):
- """Init a Gaussian model of k components, each component being a
- d multi-variate Gaussian, with covariance matrix of style mode"""
+ """Init a Gaussian Mixture of k components, each component being a
+ d multi-variate Gaussian, with covariance matrix of style mode.
+
+ If you want to build a Gaussian Mixture with knowns weights, means
+ and variances, you can use GM.fromvalues method directly"""
if mode not in self._cov_mod:
raise GmParamError("mode %s not recognized" + str(mode))
@@ -116,6 +115,9 @@
res.set_param(weights, mu, sigma)
return res
+ #=====================================================
+ # Fundamental facilities (sampling, confidence, etc..)
+ #=====================================================
def sample(self, nframes):
""" Sample nframes frames from the model """
if not self.is_valid:
@@ -242,6 +244,15 @@
gen_param = classmethod(gen_param)
+ #=======================
+ # Regularization methods
+ #=======================
+ def _regularize(self):
+ raise NotImplemented("No regularization")
+
+ #=================
+ # Plotting methods
+ #=================
def plot(self, *args, **kargs):
"""Plot the ellipsoides directly for the model
@@ -327,6 +338,22 @@
except ImportError:
raise GmParamError("matplotlib not found, cannot plot...")
+ # Syntactic sugar
+ def __repr__(self):
+ repr = ""
+ repr += "Gaussian Mixture:\n"
+ repr += " -> %d dimensions\n" % self.d
+ repr += " -> %d components\n" % self.k
+ repr += " -> %s covariance \n" % self.mode
+ if self.is_valid:
+ repr += "Has initial values"""
+ else:
+ repr += "Has no initial values yet"""
+ return repr
+
+ def __str__(self):
+ return self.__repr__()
+
# Function to generate a random index: this is kept outside any class,
# as the function can be useful for other
def gen_rand_index(p, n):
@@ -366,7 +393,7 @@
"""
# Check that w is valid
- if N.fabs(N.sum(w, 0) - 1) > MAX_DEV:
+ if N.fabs(N.sum(w, 0) - 1) > _MAX_DBL_DEV:
raise GmParamError('weight does not sum to 1')
if not len(w.shape) == 1:
Modified: trunk/Lib/sandbox/pyem/gmm_em.py
===================================================================
--- trunk/Lib/sandbox/pyem/gmm_em.py 2006-11-10 15:29:03 UTC (rev 2316)
+++ trunk/Lib/sandbox/pyem/gmm_em.py 2006-11-16 05:41:35 UTC (rev 2317)
@@ -1,5 +1,5 @@
# /usr/bin/python
-# Last Change: Tue Oct 24 06:00 PM 2006 J
+# Last Change: Thu Nov 16 02:00 PM 2006 J
# TODO:
# - which methods to avoid va shrinking to 0 ? There are several options,
@@ -15,6 +15,8 @@
from kmean import kmean
from gauss_mix import GM
+from misc import _DEF_ALPHA, _MIN_DBL_DELTA, _MIN_INV_COND
+
# Error classes
class GmmError(Exception):
"""Base class for exceptions in this module."""
@@ -38,12 +40,9 @@
# sense to use inheritance for # interface specification in python, since its
# dynamic type systeme.
-# Anyway, a mixture class should encapsulates all details concerning a mixture model:
-# - internal parameters for the pdfs
-# - can compute sufficient statistics for EM
-# - can sample a model
-# - can generate random valid parameters for a new pdf (using class method)
-class MixtureModel:
+# Anyway, a mixture model class should encapsulates all details
+# concerning getting sufficient statistics (SS), likelihood and bic.
+class MixtureModel(object):
pass
class ExpMixtureModel(MixtureModel):
@@ -90,7 +89,7 @@
""" Init the model at random."""
k = self.gm.k
d = self.gm.d
- if mode == 'diag':
+ if self.gm.mode == 'diag':
w = N.ones(k) / k
mu = randn(k, d)
va = N.fabs(randn(k, d))
@@ -120,6 +119,7 @@
self.init = init_methods[init]
self.isinit = False
+ self.initst = init
def sufficient_statistics(self, data):
""" Return normalized and non-normalized sufficient statistics
@@ -168,7 +168,10 @@
elif self.gm.mode == 'full':
# In full mode, this is the bottleneck: the triple loop
# kills performances. This is pretty straightforward
- # algebra, so computing it in C should not be too difficult
+ # algebra, so computing it in C should not be too difficult. The
+ # real problem is to have valid covariance matrices, and to keep
+ # them positive definite, maybe with special storage... Not sure
+ # it really worth the risk
mu = N.zeros((k, d))
va = N.zeros((k*d, d))
@@ -239,6 +242,13 @@
n = N.shape(data)[0]
return bic(lk, free_deg, n)
+ # syntactic sugar
+ def __repr__(self):
+ repre = ""
+ repre += "Gaussian Mixture Model\n"
+ repre += " -> initialized by %s\n" % str(self.initst)
+ repre += self.gm.__repr__()
+ return repre
class EM:
"""An EM trainer. An EM trainer
@@ -265,6 +275,8 @@
Returns:
likelihood (one value per iteration).
"""
+ if not isinstance(model, MixtureModel):
+ raise TypeError("expect a MixtureModel as a model")
# Initialize the data (may do nothing depending on the model)
model.init(data)
@@ -282,9 +294,62 @@
model.update_em(data, g)
if has_em_converged(like[i], like[i-1], thresh):
return like[0:i]
+ # # Em computation, with computation of the likelihood
+ # g, tgd = model.sufficient_statistics(data)
+ # like[0] = N.sum(N.log(N.sum(tgd, 1)), axis = 0)
+ # model.update_em(data, g)
+ # for i in range(1, maxiter):
+ # print "=== Iteration %d ===" % i
+ # isreg = False
+ # for j in range(model.gm.k):
+ # va = model.gm.va[j]
+ # if va.any() < _MIN_INV_COND:
+ # isreg = True
+ # print "\tregularization detected"
+ # print "\t" + str(va)
+ # model.gm.va[j] = regularize_diag(va)
+ # print "\t" + str(va) + ", " + str(model.gm.va[j])
+ # print "\t" + str(gauss_den(data, model.gm.mu[j], model.gm.va[j]))
+ # print "\tend regularization detected"
+ # var = va
+ #
+ # g, tgd = model.sufficient_statistics(data)
+ # try:
+ # assert not( (N.isnan(tgd)).any() )
+ # if isreg:
+ # print var
+ # except AssertionError:
+ # print "tgd is nan..."
+ # print model.gm.va[13,:]
+ # print 1/model.gm.va[13,:]
+ # print densities.gauss_den(data, model.gm.mu[13], model.gm.va[13])
+ # print N.isnan((multiple_gauss_den(data, model.gm.mu, model.gm.va))).any()
+ # print "Exciting"
+ # import sys
+ # sys.exit(-1)
+ # like[i] = N.sum(N.log(N.sum(tgd, 1)), axis = 0)
+ # model.update_em(data, g)
+ # assert not( model.gm.va.any() < 1e-6)
+ # if has_em_converged(like[i], like[i-1], thresh):
+ # return like[0:i]
return like
+def regularize_diag(variance, alpha = _DEF_ALPHA):
+ delta = N.sum(variance) / variance.size
+ if delta > _MIN_DBL_DELTA:
+ return variance + alpha * delta
+ else:
+ return variance + alpha * _MIN_DBL_DELTA
+
+def regularize_full(variance):
+ # Trace of a positive definite matrix is always > 0
+ delta = N.trace(variance) / variance.shape[0]
+ if delta > _MIN_DBL_DELTA:
+ return variance + alpha * delta
+ else:
+ return variance + alpha * _MIN_DBL_DELTA
+
# Misc functions
def bic(lk, deg, n):
""" Expects lk to be log likelihood """
Modified: trunk/Lib/sandbox/pyem/info.py
===================================================================
--- trunk/Lib/sandbox/pyem/info.py 2006-11-10 15:29:03 UTC (rev 2316)
+++ trunk/Lib/sandbox/pyem/info.py 2006-11-16 05:41:35 UTC (rev 2317)
@@ -60,7 +60,7 @@
Copyright: David Cournapeau 2006
License: BSD-style (see LICENSE.txt in main source directory)
"""
-version = '0.5.5'
+version = '0.5.6'
depends = ['linalg', 'stats']
ignore = False
Added: trunk/Lib/sandbox/pyem/misc.py
===================================================================
--- trunk/Lib/sandbox/pyem/misc.py 2006-11-10 15:29:03 UTC (rev 2316)
+++ trunk/Lib/sandbox/pyem/misc.py 2006-11-16 05:41:35 UTC (rev 2317)
@@ -0,0 +1,21 @@
+# Last Change: Fri Nov 10 10:00 AM 2006 J
+
+#=====================================================================
+# "magic number", that is number used to control regularization and co
+# Change them at your risk !
+#=====================================================================
+
+# max deviation allowed when comparing double (this is actually stupid,
+# I should actually use a number of decimals)
+_MAX_DBL_DEV = 1e-10
+
+# max conditional number allowed
+_MAX_COND = 1e8
+_MIN_INV_COND = 1/_MAX_COND
+
+# Default alpha for regularization
+_DEF_ALPHA = 1e-1
+
+# Default min delta for regularization
+_MIN_DBL_DELTA = 1e-5
+
Modified: trunk/Lib/sandbox/pyem/profile_data/profile_densities.py
===================================================================
--- trunk/Lib/sandbox/pyem/profile_data/profile_densities.py 2006-11-10 15:29:03 UTC (rev 2316)
+++ trunk/Lib/sandbox/pyem/profile_data/profile_densities.py 2006-11-16 05:41:35 UTC (rev 2317)
@@ -1,7 +1,7 @@
import numpy as N
from numpy.random import randn
-from pyem import densities as D
-from pyem import _c_densities as DC
+from scipy.sandbox.pyem import densities as D
+from scipy.sandbox.pyem import _c_densities as DC
#import tables
def bench(func, mode = 'diag'):
Modified: trunk/Lib/sandbox/pyem/profile_data/profile_gmm.py
===================================================================
--- trunk/Lib/sandbox/pyem/profile_data/profile_gmm.py 2006-11-10 15:29:03 UTC (rev 2316)
+++ trunk/Lib/sandbox/pyem/profile_data/profile_gmm.py 2006-11-16 05:41:35 UTC (rev 2317)
@@ -1,16 +1,14 @@
import numpy as N
-from pyem import GM, GMM
+from scipy.sandbox.pyem import GM, GMM
import copy
-from pyem._c_densities import gauss_den
-
def bench1(mode = 'diag'):
#===========================================
# GMM of 20 comp, 20 dimension, 1e4 frames
#===========================================
d = 15
k = 30
- nframes = 1e4
+ nframes = 1e5
niter = 10
mode = 'diag'
Modified: trunk/Lib/sandbox/pyem/setup.py
===================================================================
--- trunk/Lib/sandbox/pyem/setup.py 2006-11-10 15:29:03 UTC (rev 2316)
+++ trunk/Lib/sandbox/pyem/setup.py 2006-11-16 05:41:35 UTC (rev 2317)
@@ -1,5 +1,5 @@
#! /usr/bin/env python
-# Last Change: Thu Oct 19 07:00 PM 2006 J
+# Last Change: Thu Nov 09 06:00 PM 2006 J
# TODO:
# - check how to handle cmd line build options with distutils and use
# it in the building process
@@ -16,14 +16,14 @@
DESCRIPTION ='A python module for Expectation Maximization learning of mixtures pdf',
AUTHOR ='David Cournapeau',
AUTHOR_EMAIL='david at ar.media.kyoto-u.ac.jp',
-URL ='http://ar.media.kyoto-u.ac.jp/members/david',
+URL ='http://ar.media.kyoto-u.ac.jp/members/david/softwares/pyem',
def configuration(parent_package='',top_path=None, package_name='pyem'):
from numpy.distutils.misc_util import Configuration
config = Configuration(package_name,parent_package,top_path,
version = VERSION)
config.add_data_dir('tests')
- config.add_subpackage('profile_data')
+ config.add_data_dir('profile_data')
config.add_extension('c_gden',
#define_macros=[('LIBSVM_EXPORTS', None),
# ('LIBSVM_DLL', None)],
@@ -34,7 +34,8 @@
if __name__ == "__main__":
from numpy.distutils.core import setup
#setup(**configuration(top_path='').todict())
- setup(**configuration(top_path='',))
+ #setup(**configuration(top_path=''))
+ setup(configuration=configuration)
# from distutils.core import setup, Extension
# from pyem import version as pyem_version
#
Added: trunk/Lib/sandbox/pyem/test_reg.py
===================================================================
--- trunk/Lib/sandbox/pyem/test_reg.py 2006-11-10 15:29:03 UTC (rev 2316)
+++ trunk/Lib/sandbox/pyem/test_reg.py 2006-11-16 05:41:35 UTC (rev 2317)
@@ -0,0 +1,44 @@
+import numpy as N
+
+from gauss_mix import GM
+from gmm_em import GMM, EM
+
+from numpy.random import seed
+
+def test_reg():
+ seed(0)
+ # Generate data with a few components
+ d = 2
+ k = 1
+ n = 500
+
+ w, mu, va = GM.gen_param(d, k)
+ gm = GM.fromvalues(w, mu, va)
+
+ data = gm.sample(n)
+
+ # Try to learn with an insane number of components
+ gmm = GMM(GM(d, 30), 'random')
+
+ em = EM()
+ like= em.train(data, gmm, 20, 1e-20)
+
+ # import pylab as P
+ # P.subplot(2, 1, 1)
+ # P.plot(data[:, 0], data[:, 1], '.')
+ # gmm.gm.plot()
+ # P.subplot(2, 1, 2)
+ # P.plot(like)
+ # print like
+ # P.show()
+
+if __name__ == "__main__":
+ # import hotshot, hotshot.stats
+ # profile_file = 'manyk.prof'
+ # prof = hotshot.Profile(profile_file, lineevents=1)
+ # prof.runcall(test_reg)
+ # p = hotshot.stats.load(profile_file)
+ # print p.sort_stats('cumulative').print_stats(20)
+ # prof.close()
+ test_reg()
+
Modified: trunk/Lib/sandbox/pyem/tests/test_densities.py
===================================================================
--- trunk/Lib/sandbox/pyem/tests/test_densities.py 2006-11-10 15:29:03 UTC (rev 2316)
+++ trunk/Lib/sandbox/pyem/tests/test_densities.py 2006-11-16 05:41:35 UTC (rev 2317)
@@ -1,5 +1,5 @@
#! /usr/bin/env python
-# Last Change: Thu Oct 19 07:00 PM 2006 J
+# Last Change: Thu Nov 09 05:00 PM 2006 J
# TODO:
# - having "fake tests" to check that all mode (scalar, diag and full) are
More information about the Scipy-svn
mailing list