[Scipy-svn] r2100 - in trunk/Lib/sandbox/svm: . tests
scipy-svn at scipy.org
scipy-svn at scipy.org
Thu Jul 13 20:06:32 EDT 2006
Author: fullung
Date: 2006-07-13 19:06:15 -0500 (Thu, 13 Jul 2006)
New Revision: 2100
Modified:
trunk/Lib/sandbox/svm/classification.py
trunk/Lib/sandbox/svm/dataset.py
trunk/Lib/sandbox/svm/kernel.py
trunk/Lib/sandbox/svm/model.py
trunk/Lib/sandbox/svm/regression.py
trunk/Lib/sandbox/svm/tests/test_dataset.py
Log:
Working on support for precomputed kernels.
Modified: trunk/Lib/sandbox/svm/classification.py
===================================================================
--- trunk/Lib/sandbox/svm/classification.py 2006-07-13 14:45:21 UTC (rev 2099)
+++ trunk/Lib/sandbox/svm/classification.py 2006-07-14 00:06:15 UTC (rev 2100)
@@ -130,7 +130,7 @@
This function returns the percentage of data that was
classified correctly over all the experiments.
"""
- problem = self._create_problem(dataset)
+ problem = dataset.create_svm_problem()
target = N.empty((len(dataset.data),), dtype=N.float64)
tp = cast(target.ctypes.data, POINTER(c_double))
libsvm.svm_cross_validation(problem, self.param, nr_fold, tp)
Modified: trunk/Lib/sandbox/svm/dataset.py
===================================================================
--- trunk/Lib/sandbox/svm/dataset.py 2006-07-13 14:45:21 UTC (rev 2099)
+++ trunk/Lib/sandbox/svm/dataset.py 2006-07-14 00:06:15 UTC (rev 2100)
@@ -5,6 +5,7 @@
'LibSvmTestDataSet'
]
+from ctypes import c_double, POINTER, cast
import numpy as N
import libsvm
@@ -23,33 +24,96 @@
def precompute(self, kernel):
return LibSvmPrecomputedDataSet(kernel, self.data)
+ def create_svm_problem(self):
+ problem = libsvm.svm_problem()
+ problem.l = len(self.data)
+ y = (c_double*problem.l)()
+ x = (POINTER(libsvm.svm_node)*problem.l)()
+ for i, (yi, xi) in enumerate(self.data):
+ y[i] = yi
+ x[i] = cast(xi.ctypes.data, POINTER(libsvm.svm_node))
+ problem.x = x
+ problem.y = y
+ return problem
+
class LibSvmPrecomputedDataSet:
- def __init__(self, kernel, origdata):
+ def __init__(self, kernel, origdata=None):
self.kernel = kernel
+ self.origdata = origdata
+ if origdata is None: return
- # XXX look at using a list of vectors instead of a matrix when
- # the size of the precomputed dataset gets huge. This should
- # avoid problems with heap fragmentation, especially on
- # Windows.
+ self.iddatamap = {}
+ # Create Gram matrix as a list of vectors that have extra
+ # entries for id and end of record marker.
n = len(origdata)
- # extra columns for id and end of record marker
- grammat = N.empty((n, n+2), dtype=libsvm.svm_node_dtype)
- # calculate Gram matrix
+ grammat = [N.empty((n+2,), dtype=libsvm.svm_node_dtype)
+ for i in range(n)]
+ self.grammat = grammat
+
+ # Calculate Gram matrix. Refer to Kernel::kernel_precomputed
+ # in svm.cpp to see how this precomputed setup works.
for i, (y1, x1) in enumerate(origdata):
- # set id and end of record fields
- grammat[i,0], grammat[i,-1] = (0, i), (-1, 0.0)
+ id = i + 1
+ # XXX possible numpy bug
+ #grammat[i][[0,-1]] = (0, id), (-1, 0.0)
+ grammat[i][0] = 0, id
+ grammat[i][-1] = -1, 0.0
for j, (y2, x2) in enumerate(origdata[i:]):
# Gram matrix is symmetric, so calculate dot product
# once and store it in both required locations
z = kernel(x1, x2, svm_node_dot)
# fix index so we assign to the right place
j += i
- grammat[i, j+1]['value'] = z
- grammat[j, i+1]['value'] = z
- self.grammat = grammat
- self.data = zip(map(lambda x: x[0], origdata), grammat)
+ grammat[i][j+1] = 0, z
+ grammat[j][i+1] = 0, z
+ # Map id to original vector so that we can find it again
+ # after the model has been trained. libsvm essentially
+ # provides the ids of the support vectors.
+ self.iddatamap[id] = x1
+
+ def getdata(self):
+ return zip(map(lambda x: x[0], self.origdata), self.grammat)
+ data = property(getdata)
+ def combine_inplace(self, dataset):
+ """
+ Combine this dataset with another dataset by calculating the
+ new part of the Gram matrix in place.
+ """
+ # XXX N.resize is our friend here
+ raise NotImplementedError
+
+ def combine(self, dataset):
+ """
+ Combine this dataset with another dataset by extending the
+ Gram matrix with the new inner products into a new matrix.
+ """
+ n = len(self.origdata) + len(dataset.data)
+ newgrammat = []
+
+ # copy original Gram matrix
+ for i in range(len(self.origdata)):
+ row = N.empty((n,), dtype=libsvm.svm_node_dtype)
+ row[:-1] = self.grammat[i]
+ newgrammat.append(row)
+
+ # copy id->vector map
+ newiddatamap = dict(self.iddatamap.items())
+
+ # prepare Gram matrix for new data
+ for i in range(len(dataset.data)):
+ id = i + len(self.origdata) + 1
+ row = N.empty((n,), dtype=libsvm.svm_node_dtype)
+ row[[0,-1]] = (0, id), (-1, 0.0)
+ newgrammat.append(row)
+ newiddatamap[id] = dataset.data[i][1]
+
+ newdataset = self.__class__(self.kernel)
+ newdataset.origdata = self.origdata + dataset.data
+ newdataset.iddatamap = newiddatamap
+ newdataset.grammat = newgrammat
+
class LibSvmRegressionDataSet(LibSvmDataSet):
def __init__(self, origdata):
data = map(lambda x: (x[0], convert_to_svm_node(x[1])), origdata)
@@ -75,7 +139,7 @@
def convert_to_svm_node(x):
y = N.empty(len(x)+1, dtype=libsvm.svm_node_dtype)
- y[-1] = (-1, 0.)
+ y[-1] = -1, 0.
if isinstance(x, dict):
x = x.items()
if isinstance(x, list):
Modified: trunk/Lib/sandbox/svm/kernel.py
===================================================================
--- trunk/Lib/sandbox/svm/kernel.py 2006-07-13 14:45:21 UTC (rev 2099)
+++ trunk/Lib/sandbox/svm/kernel.py 2006-07-14 00:06:15 UTC (rev 2100)
@@ -3,17 +3,24 @@
'PolynomialKernel',
'RBFKernel',
'SigmoidKernel',
- 'CustomKernel'
+ 'CustomKernel',
+ 'PrecomputedKernel'
]
import numpy as N
+import libsvm
+
class LinearKernel:
+ def __init__(self):
+ self.kernel_type = libsvm.LINEAR
+
def __call__(self, x, y, dot):
return dot(x, y)
class PolynomialKernel:
def __init__(self, degree, gamma, coef0):
+ self.kernel_type = libsvm.POLY
self.degree = degree
self.gamma = gamma
self.coef0 = coef0
@@ -31,6 +38,7 @@
class RBFKernel:
def __init__(self, gamma):
+ self.kernel_type = libsvm.RBF
self.gamma = gamma
def __call__(self, x, y, dot):
@@ -39,6 +47,7 @@
class SigmoidKernel:
def __init__(self, gamma, coef0):
+ self.kernel_type = libsvm.SIGMOID
self.gamma = gamma
self.coef0 = coef0
@@ -47,7 +56,12 @@
class CustomKernel:
def __init__(self, f):
+ self.kernel_type = libsvm.PRECOMPUTED
self.f = f
def __call__(self, x, y, dot):
return self.f(x, y, dot)
+
+class PrecomputedKernel:
+ def __init__(self):
+ self.kernel_type = libsvm.PRECOMPUTED
Modified: trunk/Lib/sandbox/svm/model.py
===================================================================
--- trunk/Lib/sandbox/svm/model.py 2006-07-13 14:45:21 UTC (rev 2099)
+++ trunk/Lib/sandbox/svm/model.py 2006-07-14 00:06:15 UTC (rev 2100)
@@ -24,65 +24,35 @@
self.shrinking = shrinking
self.cache_size = cache_size
+ # kernel parameters
param = libsvm.svm_parameter()
+ param.kernel_type = kernel.kernel_type
+ param.degree = getattr(kernel, 'degree', 0)
+ param.gamma = getattr(kernel, 'gamma', 0.0)
+ param.coef0 = getattr(kernel, 'coef0', 0.0)
- if isinstance(kernel, LinearKernel):
- param.kernel_type = libsvm.LINEAR
- elif isinstance(kernel, PolynomialKernel):
- param.kernel_type = libsvm.POLY
- param.degree = kernel.degree
- param.gamma = kernel.gamma
- param.coef0 = kernel.coef0
- elif isinstance(kernel, RBFKernel):
- param.kernel_type = libsvm.RBF
- param.gamma = kernel.gamma
- elif isinstance(kernel, SigmoidKernel):
- param.kernel_type = libsvm.SIGMOID
- param.gamma = kernel.gamma
- param.coef0 = kernel.coef0
- else:
- raise ValueError, 'unknown kernel type'
-
+ # other parameters
param.eps = tolerance
param.shrinking = shrinking
param.cache_size = cache_size
- # set defaults for optional parameters
+
+ # defaults for optional parameters
param.nr_weight = 0
- #param.weight = None
- #param.weight_label = None
- # XXX workaround for bug in ctypes 0.9.9.6
param.weight = c_double_null_ptr
param.weight_label = c_int_null_ptr
param.probability = False
self.param = param
- def _create_problem(self, dataset):
- # XXX don't poke around in dataset's internals
- problem = libsvm.svm_problem()
- problem.l = len(dataset.data)
- y = (c_double*problem.l)()
- x = (POINTER(libsvm.svm_node)*problem.l)()
- for i, (yi, xi) in enumerate(dataset.data):
- y[i] = yi
- x[i] = cast(xi.ctypes.data, POINTER(libsvm.svm_node))
- problem.x = x
- problem.y = y
- self._check_problem_param(problem, self.param)
- return problem
-
def fit(self, dataset):
- problem = self._create_problem(dataset)
+ problem = dataset.create_svm_problem()
+ self._check_problem_param(problem, self.param)
model = libsvm.svm_train(problem, self.param)
- # weight parametes are no longer required, so remove to them
- # as the data they point to might disappear when this object
- # is deallocated
+ # weights are no longer required, so remove them as the
+ # data they point to might disappear
model.contents.param.nr_weight = 0
- # XXX workaround for bug in ctypes 0.9.9.6
- #model.contents.param.weight = None
- #model.contents.param.weight_label = None
model.contents.param.weight = c_double_null_ptr
model.contents.param.weight_label = c_int_null_ptr
Modified: trunk/Lib/sandbox/svm/regression.py
===================================================================
--- trunk/Lib/sandbox/svm/regression.py 2006-07-13 14:45:21 UTC (rev 2099)
+++ trunk/Lib/sandbox/svm/regression.py 2006-07-14 00:06:15 UTC (rev 2100)
@@ -66,7 +66,7 @@
error and the squared correlation coefficient.
"""
- problem = self._create_problem(dataset)
+ problem = dataset.create_svm_problem()
target = N.empty((len(dataset.data),), dtype=N.float64)
tp = cast(target.ctypes.data, POINTER(c_double))
libsvm.svm_cross_validation(problem, self.param, nr_fold, tp)
Modified: trunk/Lib/sandbox/svm/tests/test_dataset.py
===================================================================
--- trunk/Lib/sandbox/svm/tests/test_dataset.py 2006-07-13 14:45:21 UTC (rev 2099)
+++ trunk/Lib/sandbox/svm/tests/test_dataset.py 2006-07-14 00:06:15 UTC (rev 2100)
@@ -82,7 +82,7 @@
RBFKernel(gamma),
SigmoidKernel(gamma, coef0)
]
- y = N.random.randn(20)
+ y = N.random.randn(10)
x = N.random.randn(len(y), 10)
origdata = LibSvmRegressionDataSet(zip(y, x))
@@ -94,8 +94,24 @@
expt_grammat[i, j] = kernel(xi, xj, N.dot)
# get a new dataset containing the precomputed data
pcdata = origdata.precompute(kernel)
- actual_grammat = pcdata.grammat[:,1:-1]['value']
- assert_array_almost_equal(actual_grammat, expt_grammat)
+ for i, row in enumerate(pcdata.grammat):
+ valuerow = row[1:-1]['value']
+ assert_array_almost_equal(valuerow, expt_grammat[i])
+ def check_combine(self):
+ kernel = LinearKernel()
+
+ y1 = N.random.randn(2)
+ x1 = N.random.randn(len(y1), 2)
+ origdata = LibSvmRegressionDataSet(zip(y1, x1))
+ pcdata = origdata.precompute(kernel)
+
+ y2 = N.random.randn(1)
+ x2 = N.random.randn(len(y2), x1.shape[1])
+ moredata = LibSvmRegressionDataSet(zip(y2, x2))
+
+ #pcdata.combine(moredata)
+ #pcdata.copy_and_extend(moredata)
+
if __name__ == '__main__':
NumpyTest().run()
More information about the Scipy-svn mailing list