[Scipy-svn] r3082 - in trunk/Lib/sandbox/pyem: . examples

Sat Jun 9 04:05:29 EDT 2007

Author: cdavid
Date: 2007-06-09 03:05:03 -0500 (Sat, 09 Jun 2007)
New Revision: 3082

Added:
   trunk/Lib/sandbox/pyem/examples/pdfestimation.py
   trunk/Lib/sandbox/pyem/examples/pdfestimation1d.py
   trunk/Lib/sandbox/pyem/examples/utils.py
Modified:
   trunk/Lib/sandbox/pyem/TODO
Log:
Add example of pdf estimation with EM

Modified: trunk/Lib/sandbox/pyem/TODO
===================================================================

--- trunk/Lib/sandbox/pyem/TODO	2007-06-09 06:42:03 UTC (rev 3081)
+++ trunk/Lib/sandbox/pyem/TODO	2007-06-09 08:05:03 UTC (rev 3082)
@@ -1,12 +1,12 @@
-# Last Change: Mon Jun 04 07:00 PM 2007 J
+# Last Change: Sat Jun 09 04:00 PM 2007 J
 
-
 Things which must be implemented for a 1.0 version (in importante order)
     - A classifier
     - handle rank 1 for 1d data
     - basic regularization
     - docstrings
-    - demo for pdf estimtation, discriminant analysis and clustering
+    - demo for pdf estimation, discriminant analysis and clustering
+    - scaling of data: maybe something to handle scaling internally ?
 
 Things which would be nice (after 1.0 version):
     - Bayes prior (hard, suppose MCMC)

Added: trunk/Lib/sandbox/pyem/examples/pdfestimation.py
===================================================================
--- trunk/Lib/sandbox/pyem/examples/pdfestimation.py	2007-06-09 06:42:03 UTC (rev 3081)
+++ trunk/Lib/sandbox/pyem/examples/pdfestimation.py	2007-06-09 08:05:03 UTC (rev 3082)
@@ -0,0 +1,50 @@
+#! /usr/bin/env python
+# Last Change: Sat Jun 09 03:00 PM 2007 J
+
+# Example of doing pdf estimation with EM algorithm. Requires matplotlib.
+import numpy as N
+from numpy.testing import set_package_path, restore_path
+
+import pylab as P
+
+set_package_path()
+import pyem
+restore_path()
+import utils
+
+oldfaithful = utils.get_faithful()
+
+# We want the relationship between d(t) and w(t+1), but get_faithful gives
+# d(t), w(t), so we have to shift to get the "usual" faithful data
+waiting = oldfaithful[1:, 1:]
+duration = oldfaithful[:len(waiting), :1]
+dt = N.concatenate((duration, waiting), 1)
+
+# Scale the data so that each component is in [0..1]
+dt = utils.scale(dt)
+
+# This function train a mixture model with k components, returns the trained
+# model and the BIC
+def cluster(data, k, mode = 'full'):
+    d = data.shape[1]
+    gm = pyem.GM(d, k, mode)
+    gmm = pyem.GMM(gm)
+    em = pyem.EM()
+    em.train(data, gmm, maxiter = 20)
+    return gm, gmm.bic(data)
+
+# bc will contain a list of BIC values for each model trained
+bc = []
+mode = 'full'
+for k in range(1, 5):
+    # Train a model of k component, and plots isodensity curve
+    P.subplot(2, 2, k)
+    gm, b = cluster(dt, k = k, mode = mode)
+    bc.append(b)
+
+    X, Y, Z, V = gm.density_on_grid()
+    P.contour(X, Y, Z, V)
+    P.plot(dt[:, 0], dt[:, 1], '.')
+
+print "According to the BIC, model with %d components is better" % (N.argmax(bc) + 1)
+P.show()

Added: trunk/Lib/sandbox/pyem/examples/pdfestimation1d.py
===================================================================
--- trunk/Lib/sandbox/pyem/examples/pdfestimation1d.py	2007-06-09 06:42:03 UTC (rev 3081)
+++ trunk/Lib/sandbox/pyem/examples/pdfestimation1d.py	2007-06-09 08:05:03 UTC (rev 3082)
@@ -0,0 +1,69 @@
+#! /usr/bin/env python
+# Last Change: Sat Jun 09 04:00 PM 2007 J
+
+# Example of doing pdf estimation with EM algorithm. Requires matplotlib.
+import numpy as N
+from numpy.testing import set_package_path, restore_path
+
+import pylab as P
+import matplotlib as MPL
+
+set_package_path()
+import pyem
+restore_path()
+import utils
+
+oldfaithful = utils.get_faithful()
+
+duration = oldfaithful[:, :1]
+waiting = oldfaithful[:, 1:]
+
+#dt = utils.scale(duration)
+#dt = duration / 60.
+dt = waiting / 60.
+
+# This function train a mixture model with k components, returns the trained
+# model and the BIC
+def cluster(data, k):
+    d = data.shape[1]
+    gm = pyem.GM(d, k)
+    gmm = pyem.GMM(gm)
+    em = pyem.EM()
+    em.train(data, gmm, maxiter = 20)
+    return gm, gmm.bic(data)
+
+# bc will contain a list of BIC values for each model trained, gml the
+# corresponding mixture model
+bc = []
+gml = []
+
+for k in range(1, 8):
+    gm, b = cluster(dt, k = k)
+    bc.append(b)
+    gml.append(gm)
+
+mbic = N.argmax(bc)
+
+# Below is code to display a figure with histogram and best model (in the BIC sense)
+# pdf, with the BIC as a function of the number of components on the right.
+P.figure(figsize = [12, 7])
+P.subplot(1, 2, 1)
+h = gml[mbic].plot1d(gpdf=True)
+h['gpdf'][0].set_linestyle('-')
+h['gpdf'][0].set_label('pdf of the mixture')
+h['pdf'][0].set_label('pdf of individual component')
+[l.set_linestyle('-') for l in h['pdf']]
+[l.set_color('g') for l in h['pdf']]
+
+prop = MPL.font_manager.FontProperties(size='smaller')
+P.legend(loc = 'best', prop = prop)
+
+P.hist(dt, 25, normed = 1, fill = False)
+P.xlabel('waiting time between consecutive eruption (in min)')
+
+P.subplot(1, 2, 2)
+P.plot(N.arange(1, 8), bc, 'o:')
+P.xlabel("number of components")
+P.ylabel("BIC")
+print "According to the BIC, model with %d components is better" % (N.argmax(bc) + 1)
+P.show()

Added: trunk/Lib/sandbox/pyem/examples/utils.py
===================================================================
--- trunk/Lib/sandbox/pyem/examples/utils.py	2007-06-09 06:42:03 UTC (rev 3081)
+++ trunk/Lib/sandbox/pyem/examples/utils.py	2007-06-09 08:05:03 UTC (rev 3082)
@@ -0,0 +1,44 @@
+#! /usr/bin/env python
+# Last Change: Fri Jun 08 04:00 PM 2007 J
+
+# Various utilities for examples 
+
+import numpy as N
+from numpy.testing import set_package_path, restore_path
+
+# XXX: Bouah, hackish... Will go away once scipydata found its way
+set_package_path()
+from pyem.data import oldfaithful
+restore_path()
+
+def get_faithful():
+    """Return faithful data as a nx2 array, first column being duration, second
+    being waiting time."""
+    # Load faithful data, convert waiting into integer, remove L, M and S data
+    data = oldfaithful.load()
+    tmp1 = []
+    tmp2 = []
+    for i in data:
+        if not (i[0] == 'L' or i[0] == 'M' or i[0] == 'S'):
+            tmp1.append(i[0])
+            tmp2.append(i[1])
+            
+    waiting = N.array([int(i) for i in tmp1], dtype = N.float)
+    duration = N.array([i for i in tmp2], dtype = N.float)
+
+    waiting = waiting[:, N.newaxis]
+    duration = duration[:, N.newaxis]
+
+    return N.concatenate((waiting, duration), 1)
+
+def scale(data):
+    """ Scale data such as each col is in the range [0..1].
+
+    Note: inplace."""
+    n = N.min(data, 0)
+    m = N.max(data, 0)
+
+    data -= n
+    data /= (m-n)
+    return data
+