[Scipy-svn] r2067 - in trunk/Lib/sandbox/svm: . tests

Mon Jul 10 16:40:15 EDT 2006

Author: fullung
Date: 2006-07-10 15:39:54 -0500 (Mon, 10 Jul 2006)
New Revision: 2067

Added:
   trunk/Lib/sandbox/svm/tests/test_all.py
Modified:
   trunk/Lib/sandbox/svm/classification.py
   trunk/Lib/sandbox/svm/dataset.py
   trunk/Lib/sandbox/svm/model.py
   trunk/Lib/sandbox/svm/regression.py
   trunk/Lib/sandbox/svm/tests/test_classification.py
   trunk/Lib/sandbox/svm/tests/test_regression.py
Log:
Cross-validation for classification and regression.


Modified: trunk/Lib/sandbox/svm/classification.py
===================================================================

--- trunk/Lib/sandbox/svm/classification.py	2006-07-10 15:01:03 UTC (rev 2066)
+++ trunk/Lib/sandbox/svm/classification.py	2006-07-10 20:39:54 UTC (rev 2067)
@@ -118,6 +118,27 @@
             self.param.weight = \
                 cast(self.weights.ctypes.data, POINTER(c_double))
 
+    def cross_validate(self, dataset, nr_fold):
+        """
+        Perform cross-validation to determine the suitability of
+        chosen model parameters.
+
+        Data are separated into nr_fold folds. Each fold is validated
+        against a model trained using the data from the remaining
+        (nr_fold-1) folds.
+
+        This function returns the percentage of data that was
+        classified correctly over all the experiments.
+        """
+        problem, y, x = self._create_problem(dataset)
+        target = N.empty((len(dataset.data),), dtype=N.float64)
+        tp = cast(target.ctypes.data, POINTER(c_double))
+        libsvm.svm_cross_validation(problem, self.param, nr_fold, tp)
+        total_correct = 0.
+        for x, t in zip(dataset.data, target):
+            if x[0] == int(t): total_correct += 1
+        return 100.0 * total_correct / len(dataset.data)
+
 class LibSvmCClassificationModel(LibSvmClassificationModel):
     """
     A model for C-SV classification.

Modified: trunk/Lib/sandbox/svm/dataset.py
===================================================================
--- trunk/Lib/sandbox/svm/dataset.py	2006-07-10 15:01:03 UTC (rev 2066)
+++ trunk/Lib/sandbox/svm/dataset.py	2006-07-10 20:39:54 UTC (rev 2067)
@@ -28,8 +28,6 @@
 class LibSvmClassificationDataSet(LibSvmDataSet):
     def __init__(self, origdata):
         labels = N.array(map(lambda x: x[0], origdata), dtype=N.intc)
-        assert N.alltrue(labels >= 0), \
-            'labels must be non-negative integers'
         labels.sort()
         self.labels = labels
 

Modified: trunk/Lib/sandbox/svm/model.py
===================================================================
--- trunk/Lib/sandbox/svm/model.py	2006-07-10 15:01:03 UTC (rev 2066)
+++ trunk/Lib/sandbox/svm/model.py	2006-07-10 20:39:54 UTC (rev 2067)
@@ -57,7 +57,7 @@
 
         self.param = param
 
-    def fit(self, dataset):
+    def _create_problem(self, dataset):
         # XXX don't poke around in dataset's internals
         problem = libsvm.svm_problem()
         problem.l = len(dataset.data)
@@ -69,7 +69,13 @@
         problem.x = cast(addressof(x), POINTER(POINTER(libsvm.svm_node)))
         problem.y = cast(addressof(y), POINTER(c_double))
         self._check_problem_param(problem, self.param)
+        # XXX keep references to y and x inside problem, if ctypes allows
+        # it (need to confirm this)
+        return problem, y, x
 
+    def fit(self, dataset):
+        problem, y, x = self._create_problem(dataset)
+
         model = libsvm.svm_train(problem, self.param)
 
         # weight parametes are no longer required, so remove to them

Modified: trunk/Lib/sandbox/svm/regression.py
===================================================================
--- trunk/Lib/sandbox/svm/regression.py	2006-07-10 15:01:03 UTC (rev 2066)
+++ trunk/Lib/sandbox/svm/regression.py	2006-07-10 20:39:54 UTC (rev 2067)
@@ -3,7 +3,8 @@
     'LibSvmNuRegressionModel'
     ]
 
-from ctypes import cast, POINTER
+import numpy as N
+from ctypes import cast, POINTER, c_double
 
 from model import LibSvmModel
 import libsvm
@@ -46,7 +47,51 @@
         """
         return self.sigma
 
-class LibSvmEpsilonRegressionModel(LibSvmModel):
+class LibSvmRegressionModel(LibSvmModel):
+    Results = LibSvmRegressionResults
+
+    def __init__(self, kernel, **kwargs):
+        LibSvmModel.__init__(self, kernel, **kwargs)
+
+    def cross_validate(self, dataset, nr_fold):
+        """
+        Perform cross-validation to determine the suitability of
+        chosen model parameters.
+
+        Data are separated into nr_fold folds. Each fold is validated
+        against a model trained using the data from the remaining
+        (nr_fold-1) folds.
+
+        This function returns a 2-tuple containing the mean squared
+        error and the squared correlation coefficient.
+        """
+
+        problem, y, x = self._create_problem(dataset)
+        target = N.empty((len(dataset.data),), dtype=N.float64)
+        tp = cast(target.ctypes.data, POINTER(c_double))
+        libsvm.svm_cross_validation(problem, self.param, nr_fold, tp)
+
+        total_error = sumv = sumy = sumvv = sumyy = sumvy = 0.
+        for i in range(len(dataset.data)):
+            v = target[i]
+            y = dataset.data[i][0]
+            sumv = sumv + v
+            sumy = sumy + y
+            sumvv = sumvv + v * v
+            sumyy = sumyy + y * y
+            sumvy = sumvy + v * y
+            total_error = total_error + (v-y) * (v-y)
+
+        # mean squared error
+        mse = total_error / len(dataset.data)
+        # squared correlation coefficient
+        l = len(dataset.data)
+        scc = ((l*sumvy - sumv*sumy) * (l*sumvy - sumv*sumy)) / \
+            ((l*sumvv - sumv*sumv) * (l*sumyy - sumy*sumy))
+
+        return mse, scc
+
+class LibSvmEpsilonRegressionModel(LibSvmRegressionModel):
     """
     A model for epsilon-SV regression.
 
@@ -58,10 +103,8 @@
       Prediction.
     """
 
-    Results = LibSvmRegressionResults
-
     def __init__(self, kernel, epsilon=0.1, cost=1.0, **kwargs):
-        LibSvmModel.__init__(self, kernel, **kwargs)
+        LibSvmRegressionModel.__init__(self, kernel, **kwargs)
         self.epsilon = epsilon
         self.cost = cost
         self.param.svm_type = libsvm.EPSILON_SVR
@@ -69,17 +112,15 @@
         self.param.C = cost
         self.param.probability = True
 
-class LibSvmNuRegressionModel(LibSvmModel):
+class LibSvmNuRegressionModel(LibSvmRegressionModel):
     """
     A model for nu-SV regression.
 
     See also: Schoelkopf, et al. New Support Vector Algorithms.
     """
 
-    Results = LibSvmRegressionResults
-
     def __init__(self, kernel, nu=0.5, cost=1.0, **kwargs):
-        LibSvmModel.__init__(self, kernel, **kwargs)
+        LibSvmRegressionModel.__init__(self, kernel, **kwargs)
         self.nu = nu
         self.cost = cost
         self.param.svm_type = libsvm.NU_SVR

Added: trunk/Lib/sandbox/svm/tests/test_all.py
===================================================================
--- trunk/Lib/sandbox/svm/tests/test_all.py	2006-07-10 15:01:03 UTC (rev 2066)
+++ trunk/Lib/sandbox/svm/tests/test_all.py	2006-07-10 20:39:54 UTC (rev 2067)
@@ -0,0 +1,8 @@
+from test_regression import *
+from test_classification import *
+from test_dataset import *
+from test_oneclass import *
+from test_libsvm import *
+
+if __name__ == '__main__':
+    NumpyTest().run()

Modified: trunk/Lib/sandbox/svm/tests/test_classification.py
===================================================================
--- trunk/Lib/sandbox/svm/tests/test_classification.py	2006-07-10 15:01:03 UTC (rev 2066)
+++ trunk/Lib/sandbox/svm/tests/test_classification.py	2006-07-10 20:39:54 UTC (rev 2067)
@@ -15,8 +15,14 @@
         weights = [(2, 10.0), (1, 20.0), (0, 30.0)]
         Model(Kernel, weights=weights)
         Model(Kernel, 1.0, weights)
-        model = Model(Kernel, cost=1.0, weights=weights)
+        Model(Kernel, cost=1.0, weights=weights)
 
+        Model = LibSvmNuClassificationModel
+        Model(Kernel)
+        Model(Kernel, nu=0.5)
+        Model(Kernel, weights=weights)
+        Model(Kernel, 0.5, weights)
+
     def check_c_basics(self):
         labels = [0, 1, 1, 2]
         x = [N.array([0, 0]),
@@ -94,6 +100,17 @@
             results = model.fit(traindata)
             results.predict_probability(testdata)
 
+    def check_cross_validate(self):
+        labels = ([-1] * 50) + ([1] * 50)
+        x = N.randn(len(labels), 10)
+        traindata = LibSvmClassificationDataSet(zip(labels, x))
+        kernel = LinearKernel()
+        model = LibSvmCClassificationModel(kernel)
+        nr_fold = 10
+        pcorr = model.cross_validate(traindata, nr_fold)
+        # XXX check cross-validation with and without probability
+        # output enabled
+
     def check_nu_train(self):
         pass
 

Modified: trunk/Lib/sandbox/svm/tests/test_regression.py
===================================================================
--- trunk/Lib/sandbox/svm/tests/test_regression.py	2006-07-10 15:01:03 UTC (rev 2066)
+++ trunk/Lib/sandbox/svm/tests/test_regression.py	2006-07-10 20:39:54 UTC (rev 2067)
@@ -1,8 +1,3 @@
-import sys
-import os
-sys.path.insert(0, '..')
-sys.path.insert(0, os.path.join('..','..'))
-
 from numpy.testing import *
 import numpy as N
 
@@ -21,6 +16,13 @@
         model = Model(Kernel, shrinking=False)
         self.assert_(not model.shrinking)
 
+        Model = LibSvmNuRegressionModel
+        Model(Kernel)
+        Model(Kernel, nu=0.5)
+        model = Model(Kernel, 0.5, cache_size=60, tolerance=0.005)
+        self.assertEqual(model.cache_size, 60)
+        self.assertAlmostEqual(model.tolerance, 0.005)
+
     def check_epsilon_train(self):
         y = [10., 20., 30., 40.]
         x = [N.array([0, 0]),
@@ -65,6 +67,15 @@
             predictions = results.predict(testdata)
             assert_array_almost_equal(predictions, expected_y)
 
+    def check_cross_validate(self):
+        y = N.randn(100)
+        x = N.randn(len(y), 10)
+        traindata = LibSvmRegressionDataSet(zip(y, x))
+        kernel = LinearKernel()
+        model = LibSvmEpsilonRegressionModel(kernel)
+        nr_fold = 10
+        mse, scc = model.cross_validate(traindata, nr_fold)
+
     def check_nu_train(self):
         pass