[Scipy-svn] r3139 - trunk/Lib/sandbox/pyem/examples

scipy-svn at scipy.org
Mon Jul 2 08:22:38 EDT 2007


Author: cdavid
Date: 2007-07-02 07:22:31 -0500 (Mon, 02 Jul 2007)
New Revision: 3139

Added:
   trunk/Lib/sandbox/pyem/examples/discriminant_analysis.py
Modified:
   trunk/Lib/sandbox/pyem/examples/utils.py
Log:
Add discriminant analysis example

Added: trunk/Lib/sandbox/pyem/examples/discriminant_analysis.py
===================================================================
--- trunk/Lib/sandbox/pyem/examples/discriminant_analysis.py	2007-07-02 10:22:48 UTC (rev 3138)
+++ trunk/Lib/sandbox/pyem/examples/discriminant_analysis.py	2007-07-02 12:22:31 UTC (rev 3139)
@@ -0,0 +1,129 @@
+#! /usr/bin/env python
+# Last Change: Mon Jul 02 09:00 PM 2007 J
+
+__doc__ = """Example of classification with a mixture of Gaussians. Note that
+this is really a toy example: we use a simple 50/50 split instead of a proper
+validation protocol or cross-validation.
+
+We use the famous iris dataset collected by Sir R.A. Fisher. You can try
+changing the attributes used for classification, the number of components of
+the mixtures, etc."""
+
+import numpy as N
+import pylab as P
+
+from scipy.sandbox import pyem
+import utils
+
+data = utils.iris.load()
+# cnames are the class names
+cnames = data.keys()
+
+#--------------------
+# Data pre processing
+#--------------------
+# We use 25 samples of each class (i.e. half of the iris data) for
+# learning, and the other half for testing. We use sepal width and petal
+# width only.
+ln = 25
+tn = 25
+xdata = {}
+ydata = {}
+# learning data
+ldata = {}
+
+# You can change the attributes used here (sepal vs petal, width vs length)
+for i in cnames:
+    xdata[i] = data[i]['sepal width']
+    ydata[i] = data[i]['petal width']
+    ldata[i] = N.concatenate((xdata[i][:ln, N.newaxis], 
+                              ydata[i][:ln, N.newaxis]), 
+                             axis = 1)
+
+tx = N.concatenate([xdata[i][ln:] for i in cnames])
+ty = N.concatenate([ydata[i][ln:] for i in cnames])
+tdata = N.concatenate((tx[:, N.newaxis], ty[:, N.newaxis]), axis = 1)
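+# tdata is now a (3 * tn, 2) array, one test sample per row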
+
+# tclass maps each class to the indices of its samples in the testing data tdata
+tclass = {}
+for i in range(3):
+    tclass[cnames[i]] = N.arange(tn * i, tn * (i+1))
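+# e.g. with tn = 25, the first class gets indices 0..24, the second 25..49, etc.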
+
+#----------------------------
+# Learning and classification
+#----------------------------
+# Train a mixture model with k components on the given data and return it
+def cluster(data, k, mode = 'full'):
+    d = data.shape[1]
+    # Build a model of k d-dimensional Gaussians (full or diagonal
+    # covariances) and estimate its parameters with EM (at most 20 iterations)
+    gm = pyem.GM(d, k, mode)
+    gmm = pyem.GMM(gm)
+    em = pyem.EM()
+    em.train(data, gmm, maxiter = 20)
+    return gm
+
+# Estimate each class with a mixture of nc components
+nc = 2
+mode = 'diag'
+lmod = {}
+for i in cnames:
+    lmod[i] = cluster(ldata[i], nc, mode)
+
+# Classify the testing data. Of course, the data are not really IID, because
+# we did not shuffle the testing data, but in this case this does not change
+# anything.
+p = N.empty((len(tdata), 3))
+for i in range(3):
+    # For each class, compute the likelihood of the testing data
+    p[:, i] = lmod[cnames[i]].pdf(tdata)
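+# p[j, i] now holds the likelihood of test sample j under the model of class i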
+
+# We then take the Maximum A Posteriori class (the same as the most likely
+# model in this case, since the classes are equiprobable)
+cid = N.argmax(p, 1)
+classification = {}
+for i in range(3):
+    classification[cnames[i]] = N.where(cid == i)[0]
+
+correct = {}
+incorrect = {}
+for i in cnames:
+    correct[i] = N.intersect1d(classification[i], tclass[i])
+    incorrect[i] = N.setdiff1d(classification[i], tclass[i])
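+
+# Optional quick check (a minimal sketch, using only the variables defined
+# above): print how many test points of each class were misclassified
+for i in cnames:
+    print "%s: %d of %d test samples misclassified" % (i, len(incorrect[i]), tn)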
+
+#-----------------
+# Plot the results
+#-----------------
+csym = {'setosa' : 's', 'versicolor' : 'x', 'virginica' : 'o'}
+P.figure()
+
+# Plot the learning data with the mixtures
+P.subplot(2, 1, 1)
+for i in lmod.values():
+    #i.plot()
+    X, Y, Z, V = i.density_on_grid()
+    P.contourf(X, Y, Z, V)
+
+for i in cnames:
+    P.plot(ldata[i][:, 0], ldata[i][:, 1], csym[i], label = i + ' (learning)')
+P.legend(loc = 'best')
+
+# Plot the results on test dataset (green for correctly classified, red for
+# incorrectly classified)
+P.subplot(2, 1, 2)
+for i in cnames:
+    P.plot(tx[correct[i]], ty[correct[i]], 'g' + csym[i], 
+           label = '%s (correctly classified)' % i)
+    if len(incorrect[i]) > 0:
+        P.plot(tx[incorrect[i]], ty[incorrect[i]], 'r' + csym[i], 
+               label = '%s (incorrectly classified)' % i)
+P.legend(loc = 'best')
+P.show()

Modified: trunk/Lib/sandbox/pyem/examples/utils.py
===================================================================
--- trunk/Lib/sandbox/pyem/examples/utils.py	2007-07-02 10:22:48 UTC (rev 3138)
+++ trunk/Lib/sandbox/pyem/examples/utils.py	2007-07-02 12:22:31 UTC (rev 3139)
@@ -1,15 +1,11 @@
 #! /usr/bin/env python
-# Last Change: Mon Jul 02 02:00 PM 2007 J
+# Last Change: Mon Jul 02 08:00 PM 2007 J
 
 # Various utilities for examples 
 
 import numpy as N
-from numpy.testing import set_package_path, restore_path
 
-# XXX: Bouah, hackish... Will go away once scipydata found its way
-set_package_path()
 from scikits.learn.datasets import oldfaithful, pendigits, iris
-restore_path()
 
 def get_faithful():
     """Return faithful data as a nx2 array, first column being duration, second



