[Scipy-svn] r2067 - in trunk/Lib/sandbox/svm: . tests
scipy-svn at scipy.org
scipy-svn at scipy.org
Mon Jul 10 16:40:15 EDT 2006
Author: fullung
Date: 2006-07-10 15:39:54 -0500 (Mon, 10 Jul 2006)
New Revision: 2067
Added:
trunk/Lib/sandbox/svm/tests/test_all.py
Modified:
trunk/Lib/sandbox/svm/classification.py
trunk/Lib/sandbox/svm/dataset.py
trunk/Lib/sandbox/svm/model.py
trunk/Lib/sandbox/svm/regression.py
trunk/Lib/sandbox/svm/tests/test_classification.py
trunk/Lib/sandbox/svm/tests/test_regression.py
Log:
Cross-validation for classification and regression.
Modified: trunk/Lib/sandbox/svm/classification.py
===================================================================
--- trunk/Lib/sandbox/svm/classification.py 2006-07-10 15:01:03 UTC (rev 2066)
+++ trunk/Lib/sandbox/svm/classification.py 2006-07-10 20:39:54 UTC (rev 2067)
@@ -118,6 +118,27 @@
self.param.weight = \
cast(self.weights.ctypes.data, POINTER(c_double))
+ def cross_validate(self, dataset, nr_fold):
+ """
+ Perform cross-validation to determine the suitability of
+ chosen model parameters.
+
+ Data are separated to nr_fold folds. Each fold is validated
+ against a model trained using the data from the remaining
+ (nr_fold-1) folds.
+
+ This function returns the percentage of data that was
+ classified correctly over all the experiments.
+ """
+ problem, y, x = self._create_problem(dataset)
+ target = N.empty((len(dataset.data),), dtype=N.float64)
+ tp = cast(target.ctypes.data, POINTER(c_double))
+ libsvm.svm_cross_validation(problem, self.param, nr_fold, tp)
+ total_correct = 0.
+ for x, t in zip(dataset.data, target):
+ if x[0] == int(t): total_correct += 1
+ return 100.0 * total_correct / len(dataset.data)
+
class LibSvmCClassificationModel(LibSvmClassificationModel):
"""
A model for C-SV classification.
Modified: trunk/Lib/sandbox/svm/dataset.py
===================================================================
--- trunk/Lib/sandbox/svm/dataset.py 2006-07-10 15:01:03 UTC (rev 2066)
+++ trunk/Lib/sandbox/svm/dataset.py 2006-07-10 20:39:54 UTC (rev 2067)
@@ -28,8 +28,6 @@
class LibSvmClassificationDataSet(LibSvmDataSet):
def __init__(self, origdata):
labels = N.array(map(lambda x: x[0], origdata), dtype=N.intc)
- assert N.alltrue(labels >= 0), \
- 'labels must be non-negative integers'
labels.sort()
self.labels = labels
Modified: trunk/Lib/sandbox/svm/model.py
===================================================================
--- trunk/Lib/sandbox/svm/model.py 2006-07-10 15:01:03 UTC (rev 2066)
+++ trunk/Lib/sandbox/svm/model.py 2006-07-10 20:39:54 UTC (rev 2067)
@@ -57,7 +57,7 @@
self.param = param
- def fit(self, dataset):
+ def _create_problem(self, dataset):
# XXX don't poke around in dataset's internals
problem = libsvm.svm_problem()
problem.l = len(dataset.data)
@@ -69,7 +69,13 @@
problem.x = cast(addressof(x), POINTER(POINTER(libsvm.svm_node)))
problem.y = cast(addressof(y), POINTER(c_double))
self._check_problem_param(problem, self.param)
+ # XXX keep references to y and x inside problem, if ctypes allows
+ # it (need to confirm this)
+ return problem, y, x
+ def fit(self, dataset):
+ problem, y, x = self._create_problem(dataset)
+
model = libsvm.svm_train(problem, self.param)
# weight parameters are no longer required, so remove them
Modified: trunk/Lib/sandbox/svm/regression.py
===================================================================
--- trunk/Lib/sandbox/svm/regression.py 2006-07-10 15:01:03 UTC (rev 2066)
+++ trunk/Lib/sandbox/svm/regression.py 2006-07-10 20:39:54 UTC (rev 2067)
@@ -3,7 +3,8 @@
'LibSvmNuRegressionModel'
]
-from ctypes import cast, POINTER
+import numpy as N
+from ctypes import cast, POINTER, c_double
from model import LibSvmModel
import libsvm
@@ -46,7 +47,51 @@
"""
return self.sigma
-class LibSvmEpsilonRegressionModel(LibSvmModel):
+class LibSvmRegressionModel(LibSvmModel):
+ Results = LibSvmRegressionResults
+
+ def __init__(self, kernel, **kwargs):
+ LibSvmModel.__init__(self, kernel, **kwargs)
+
+ def cross_validate(self, dataset, nr_fold):
+ """
+ Perform cross-validation to determine the suitability of
+ chosen model parameters.
+
+ Data are separated to nr_fold folds. Each fold is validated
+ against a model trained using the data from the remaining
+ (nr_fold-1) folds.
+
+ This function returns a 2-tuple containing the mean squared
+ error and the squared correlation coefficient.
+ """
+
+ problem, y, x = self._create_problem(dataset)
+ target = N.empty((len(dataset.data),), dtype=N.float64)
+ tp = cast(target.ctypes.data, POINTER(c_double))
+ libsvm.svm_cross_validation(problem, self.param, nr_fold, tp)
+
+ total_error = sumv = sumy = sumvv = sumyy = sumvy = 0.
+ for i in range(len(dataset.data)):
+ v = target[i]
+ y = dataset.data[i][0]
+ sumv = sumv + v
+ sumy = sumy + y
+ sumvv = sumvv + v * v
+ sumyy = sumyy + y * y
+ sumvy = sumvy + v * y
+ total_error = total_error + (v-y) * (v-y)
+
+ # mean squared error
+ mse = total_error / len(dataset.data)
+ # squared correlation coefficient
+ l = len(dataset.data)
+ scc = ((l*sumvy - sumv*sumy) * (l*sumvy - sumv*sumy)) / \
+ ((l*sumvv - sumv*sumv) * (l*sumyy - sumy*sumy))
+
+ return mse, scc
+
+class LibSvmEpsilonRegressionModel(LibSvmRegressionModel):
"""
A model for epsilon-SV regression.
@@ -58,10 +103,8 @@
Prediction.
"""
- Results = LibSvmRegressionResults
-
def __init__(self, kernel, epsilon=0.1, cost=1.0, **kwargs):
- LibSvmModel.__init__(self, kernel, **kwargs)
+ LibSvmRegressionModel.__init__(self, kernel, **kwargs)
self.epsilon = epsilon
self.cost = cost
self.param.svm_type = libsvm.EPSILON_SVR
@@ -69,17 +112,15 @@
self.param.C = cost
self.param.probability = True
-class LibSvmNuRegressionModel(LibSvmModel):
+class LibSvmNuRegressionModel(LibSvmRegressionModel):
"""
A model for nu-SV regression.
See also: Schoelkopf, et al. New Support Vector Algorithms.
"""
- Results = LibSvmRegressionResults
-
def __init__(self, kernel, nu=0.5, cost=1.0, **kwargs):
- LibSvmModel.__init__(self, kernel, **kwargs)
+ LibSvmRegressionModel.__init__(self, kernel, **kwargs)
self.nu = nu
self.cost = cost
self.param.svm_type = libsvm.NU_SVR
Added: trunk/Lib/sandbox/svm/tests/test_all.py
===================================================================
--- trunk/Lib/sandbox/svm/tests/test_all.py 2006-07-10 15:01:03 UTC (rev 2066)
+++ trunk/Lib/sandbox/svm/tests/test_all.py 2006-07-10 20:39:54 UTC (rev 2067)
@@ -0,0 +1,8 @@
+from test_regression import *
+from test_classification import *
+from test_dataset import *
+from test_oneclass import *
+from test_libsvm import *
+
+if __name__ == '__main__':
+ NumpyTest().run()
Modified: trunk/Lib/sandbox/svm/tests/test_classification.py
===================================================================
--- trunk/Lib/sandbox/svm/tests/test_classification.py 2006-07-10 15:01:03 UTC (rev 2066)
+++ trunk/Lib/sandbox/svm/tests/test_classification.py 2006-07-10 20:39:54 UTC (rev 2067)
@@ -15,8 +15,14 @@
weights = [(2, 10.0), (1, 20.0), (0, 30.0)]
Model(Kernel, weights=weights)
Model(Kernel, 1.0, weights)
- model = Model(Kernel, cost=1.0, weights=weights)
+ Model(Kernel, cost=1.0, weights=weights)
+ Model = LibSvmNuClassificationModel
+ Model(Kernel)
+ Model(Kernel, nu=0.5)
+ Model(Kernel, weights=weights)
+ Model(Kernel, 0.5, weights)
+
def check_c_basics(self):
labels = [0, 1, 1, 2]
x = [N.array([0, 0]),
@@ -94,6 +100,17 @@
results = model.fit(traindata)
results.predict_probability(testdata)
+ def check_cross_validate(self):
+ labels = ([-1] * 50) + ([1] * 50)
+ x = N.randn(len(labels), 10)
+ traindata = LibSvmClassificationDataSet(zip(labels, x))
+ kernel = LinearKernel()
+ model = LibSvmCClassificationModel(kernel)
+ nr_fold = 10
+ pcorr = model.cross_validate(traindata, nr_fold)
+ # XXX check cross-validation with and without probability
+ # output enabled
+
def check_nu_train(self):
pass
Modified: trunk/Lib/sandbox/svm/tests/test_regression.py
===================================================================
--- trunk/Lib/sandbox/svm/tests/test_regression.py 2006-07-10 15:01:03 UTC (rev 2066)
+++ trunk/Lib/sandbox/svm/tests/test_regression.py 2006-07-10 20:39:54 UTC (rev 2067)
@@ -1,8 +1,3 @@
-import sys
-import os
-sys.path.insert(0, '..')
-sys.path.insert(0, os.path.join('..','..'))
-
from numpy.testing import *
import numpy as N
@@ -21,6 +16,13 @@
model = Model(Kernel, shrinking=False)
self.assert_(not model.shrinking)
+ Model = LibSvmNuRegressionModel
+ Model(Kernel)
+ Model(Kernel, nu=0.5)
+ model = Model(Kernel, 0.5, cache_size=60, tolerance=0.005)
+ self.assertEqual(model.cache_size, 60)
+ self.assertAlmostEqual(model.tolerance, 0.005)
+
def check_epsilon_train(self):
y = [10., 20., 30., 40.]
x = [N.array([0, 0]),
@@ -65,6 +67,15 @@
predictions = results.predict(testdata)
assert_array_almost_equal(predictions, expected_y)
+ def check_cross_validate(self):
+ y = N.randn(100)
+ x = N.randn(len(y), 10)
+ traindata = LibSvmRegressionDataSet(zip(y, x))
+ kernel = LinearKernel()
+ model = LibSvmEpsilonRegressionModel(kernel)
+ nr_fold = 10
+ mse, scc = model.cross_validate(traindata, nr_fold)
+
def check_nu_train(self):
pass
More information about the Scipy-svn
mailing list