[Scipy-svn] r2100 - in trunk/Lib/sandbox/svm: . tests

Thu Jul 13 20:06:32 EDT 2006

Author: fullung
Date: 2006-07-13 19:06:15 -0500 (Thu, 13 Jul 2006)
New Revision: 2100

Modified:
   trunk/Lib/sandbox/svm/classification.py
   trunk/Lib/sandbox/svm/dataset.py
   trunk/Lib/sandbox/svm/kernel.py
   trunk/Lib/sandbox/svm/model.py
   trunk/Lib/sandbox/svm/regression.py
   trunk/Lib/sandbox/svm/tests/test_dataset.py
Log:
Working on support for precomputed kernels.


Modified: trunk/Lib/sandbox/svm/classification.py
===================================================================

--- trunk/Lib/sandbox/svm/classification.py	2006-07-13 14:45:21 UTC (rev 2099)
+++ trunk/Lib/sandbox/svm/classification.py	2006-07-14 00:06:15 UTC (rev 2100)
@@ -130,7 +130,7 @@
         This function returns the percentage of data that was
         classified correctly over all the experiments.
         """
-        problem = self._create_problem(dataset)
+        problem = dataset.create_svm_problem()
         target = N.empty((len(dataset.data),), dtype=N.float64)
         tp = cast(target.ctypes.data, POINTER(c_double))
         libsvm.svm_cross_validation(problem, self.param, nr_fold, tp)

Modified: trunk/Lib/sandbox/svm/dataset.py
===================================================================
--- trunk/Lib/sandbox/svm/dataset.py	2006-07-13 14:45:21 UTC (rev 2099)
+++ trunk/Lib/sandbox/svm/dataset.py	2006-07-14 00:06:15 UTC (rev 2100)
@@ -5,6 +5,7 @@
     'LibSvmTestDataSet'
     ]
 
+from ctypes import c_double, POINTER, cast
 import numpy as N
 
 import libsvm
@@ -23,33 +24,96 @@
     def precompute(self, kernel):
         return LibSvmPrecomputedDataSet(kernel, self.data)
 
+    def create_svm_problem(self):
+        problem = libsvm.svm_problem()
+        problem.l = len(self.data)
+        y = (c_double*problem.l)()
+        x = (POINTER(libsvm.svm_node)*problem.l)()
+        for i, (yi, xi) in enumerate(self.data):
+            y[i] = yi
+            x[i] = cast(xi.ctypes.data, POINTER(libsvm.svm_node))
+        problem.x = x
+        problem.y = y
+        return problem
+
 class LibSvmPrecomputedDataSet:
-    def __init__(self, kernel, origdata):
+    def __init__(self, kernel, origdata=None):
         self.kernel = kernel
+        self.origdata = origdata
+        if origdata is None: return
 
-        # XXX look at using a list of vectors instead of a matrix when
-        # the size of the precomputed dataset gets huge. This should
-        # avoid problems with heap fragmentation, especially on
-        # Windows.
+        self.iddatamap = {}
 
+        # Create Gram matrix as a list of vectors that have extra
+        # entries for id and end of record marker.
         n = len(origdata)
-        # extra columns for id and end of record marker
-        grammat = N.empty((n, n+2), dtype=libsvm.svm_node_dtype)
-        # calculate Gram matrix
+        grammat = [N.empty((n+2,), dtype=libsvm.svm_node_dtype)
+                   for i in range(n)]
+        self.grammat = grammat
+
+        # Calculate Gram matrix. Refer to Kernel::kernel_precomputed
+        # in svm.cpp to see how this precomputed setup works.
         for i, (y1, x1) in enumerate(origdata):
-            # set id and end of record fields
-            grammat[i,0], grammat[i,-1] = (0, i), (-1, 0.0)
+            id = i + 1
+            # XXX possible numpy bug
+            #grammat[i][[0,-1]] = (0, id), (-1, 0.0)
+            grammat[i][0] = 0, id
+            grammat[i][-1] = -1, 0.0
             for j, (y2, x2) in enumerate(origdata[i:]):
                 # Gram matrix is symmetric, so calculate dot product
                 # once and store it in both required locations
                 z = kernel(x1, x2, svm_node_dot)
                 # fix index so we assign to the right place
                 j += i
-                grammat[i, j+1]['value'] = z
-                grammat[j, i+1]['value'] = z
-        self.grammat = grammat
-        self.data = zip(map(lambda x: x[0], origdata), grammat)
+                grammat[i][j+1] = 0, z
+                grammat[j][i+1] = 0, z
+            # Map id to original vector so that we can find it again
+            # after the model has been trained. libsvm essentially
+            # provides the ids of the support vectors.
+            self.iddatamap[id] = x1
+    
+    def getdata(self):
+        return zip(map(lambda x: x[0], self.origdata), self.grammat)
+    data = property(getdata)
 
+    def combine_inplace(self, dataset):
+        """
+        Combine this dataset with another dataset by calculating the
+        new part of the Gram matrix in place.
+        """
+        # XXX N.resize is our friend here
+        raise NotImplementedError
+
+    def combine(self, dataset):
+        """
+        Combine this dataset with another dataset by extending the
+        Gram matrix with the new inner products into a new matrix.
+        """
+        n = len(self.origdata) + len(dataset.data)
+        newgrammat = []
+
+        # copy original Gram matrix
+        for i in range(len(self.origdata)):
+            row = N.empty((n,), dtype=libsvm.svm_node_dtype)
+            row[:-1] = self.grammat[i]
+            newgrammat.append(row)
+
+        # copy id->vector map
+        newiddatamap = dict(self.iddatamap.items())
+
+        # prepare Gram matrix for new data
+        for i in range(len(dataset.data)):
+            id = i + len(self.origdata) + 1
+            row = N.empty((n,), dtype=libsvm.svm_node_dtype)
+            row[[0,-1]] = (0, id), (-1, 0.0)
+            newgrammat.append(row)
+            newiddatamap[id] = dataset.data[i][1]
+
+        newdataset = self.__class__(self.kernel)
+        newdataset.origdata = self.origdata + dataset.data
+        newdataset.iddatamap = newiddatamap
+        newdataset.grammat = newgrammat
+
 class LibSvmRegressionDataSet(LibSvmDataSet):
     def __init__(self, origdata):
         data = map(lambda x: (x[0], convert_to_svm_node(x[1])), origdata)
@@ -75,7 +139,7 @@
 
 def convert_to_svm_node(x):
     y = N.empty(len(x)+1, dtype=libsvm.svm_node_dtype)
-    y[-1] = (-1, 0.)
+    y[-1] = -1, 0.
     if isinstance(x, dict):
         x = x.items()
     if isinstance(x, list):

Modified: trunk/Lib/sandbox/svm/kernel.py
===================================================================
--- trunk/Lib/sandbox/svm/kernel.py	2006-07-13 14:45:21 UTC (rev 2099)
+++ trunk/Lib/sandbox/svm/kernel.py	2006-07-14 00:06:15 UTC (rev 2100)
@@ -3,17 +3,24 @@
     'PolynomialKernel',
     'RBFKernel',
     'SigmoidKernel',
-    'CustomKernel'
+    'CustomKernel',
+    'PrecomputedKernel'
     ]
 
 import numpy as N
 
+import libsvm
+
 class LinearKernel:
+    def __init__(self):
+        self.kernel_type = libsvm.LINEAR
+
     def __call__(self, x, y, dot):
         return dot(x, y)
 
 class PolynomialKernel:
     def __init__(self, degree, gamma, coef0):
+        self.kernel_type = libsvm.POLY
         self.degree = degree
         self.gamma = gamma
         self.coef0 = coef0
@@ -31,6 +38,7 @@
 
 class RBFKernel:
     def __init__(self, gamma):
+        self.kernel_type = libsvm.RBF
         self.gamma = gamma
 
     def __call__(self, x, y, dot):
@@ -39,6 +47,7 @@
 
 class SigmoidKernel:
     def __init__(self, gamma, coef0):
+        self.kernel_type = libsvm.SIGMOID
         self.gamma = gamma
         self.coef0 = coef0
 
@@ -47,7 +56,12 @@
 
 class CustomKernel:
     def __init__(self, f):
+        self.kernel_type = libsvm.PRECOMPUTED
         self.f = f
 
     def __call__(self, x, y, dot):
         return self.f(x, y, dot)
+
+class PrecomputedKernel:
+    def __init__(self):
+        self.kernel_type = libsvm.PRECOMPUTED

Modified: trunk/Lib/sandbox/svm/model.py
===================================================================
--- trunk/Lib/sandbox/svm/model.py	2006-07-13 14:45:21 UTC (rev 2099)
+++ trunk/Lib/sandbox/svm/model.py	2006-07-14 00:06:15 UTC (rev 2100)
@@ -24,65 +24,35 @@
         self.shrinking = shrinking
         self.cache_size = cache_size
 
+        # kernel parameters
         param = libsvm.svm_parameter()
+        param.kernel_type = kernel.kernel_type
+        param.degree = getattr(kernel, 'degree', 0)
+        param.gamma = getattr(kernel, 'gamma', 0.0)
+        param.coef0 = getattr(kernel, 'coef0', 0.0)
 
-        if isinstance(kernel, LinearKernel):
-            param.kernel_type = libsvm.LINEAR
-        elif isinstance(kernel, PolynomialKernel):
-            param.kernel_type = libsvm.POLY
-            param.degree = kernel.degree
-            param.gamma = kernel.gamma
-            param.coef0 = kernel.coef0
-        elif isinstance(kernel, RBFKernel):
-            param.kernel_type = libsvm.RBF
-            param.gamma = kernel.gamma
-        elif isinstance(kernel, SigmoidKernel):
-            param.kernel_type = libsvm.SIGMOID
-            param.gamma = kernel.gamma
-            param.coef0 = kernel.coef0
-        else:
-            raise ValueError, 'unknown kernel type'
-
+        # other parameters
         param.eps = tolerance
         param.shrinking = shrinking
         param.cache_size = cache_size
-        # set defaults for optional parameters
+
+        # defaults for optional parameters
         param.nr_weight = 0
-        #param.weight = None
-        #param.weight_label = None
-        # XXX workaround for bug in ctypes 0.9.9.6
         param.weight = c_double_null_ptr
         param.weight_label = c_int_null_ptr
         param.probability = False
 
         self.param = param
 
-    def _create_problem(self, dataset):
-        # XXX don't poke around in dataset's internals
-        problem = libsvm.svm_problem()
-        problem.l = len(dataset.data)
-        y = (c_double*problem.l)()
-        x = (POINTER(libsvm.svm_node)*problem.l)()
-        for i, (yi, xi) in enumerate(dataset.data):
-            y[i] = yi
-            x[i] = cast(xi.ctypes.data, POINTER(libsvm.svm_node))
-        problem.x = x
-        problem.y = y
-        self._check_problem_param(problem, self.param)
-        return problem
-
     def fit(self, dataset):
-        problem = self._create_problem(dataset)
+        problem = dataset.create_svm_problem()
 
+        self._check_problem_param(problem, self.param)
         model = libsvm.svm_train(problem, self.param)
 
-        # weight parametes are no longer required, so remove to them
-        # as the data they point to might disappear when this object
-        # is deallocated
+        # weights are no longer required, so remove them as the
+        # data they point to might disappear
         model.contents.param.nr_weight = 0
-        # XXX workaround for bug in ctypes 0.9.9.6
-        #model.contents.param.weight = None
-        #model.contents.param.weight_label = None
         model.contents.param.weight = c_double_null_ptr
         model.contents.param.weight_label = c_int_null_ptr
 

Modified: trunk/Lib/sandbox/svm/regression.py
===================================================================
--- trunk/Lib/sandbox/svm/regression.py	2006-07-13 14:45:21 UTC (rev 2099)
+++ trunk/Lib/sandbox/svm/regression.py	2006-07-14 00:06:15 UTC (rev 2100)
@@ -66,7 +66,7 @@
         error and the squared correlation coefficient.
         """
 
-        problem = self._create_problem(dataset)
+        problem = dataset.create_svm_problem()
         target = N.empty((len(dataset.data),), dtype=N.float64)
         tp = cast(target.ctypes.data, POINTER(c_double))
         libsvm.svm_cross_validation(problem, self.param, nr_fold, tp)

Modified: trunk/Lib/sandbox/svm/tests/test_dataset.py
===================================================================
--- trunk/Lib/sandbox/svm/tests/test_dataset.py	2006-07-13 14:45:21 UTC (rev 2099)
+++ trunk/Lib/sandbox/svm/tests/test_dataset.py	2006-07-14 00:06:15 UTC (rev 2100)
@@ -82,7 +82,7 @@
             RBFKernel(gamma),
             SigmoidKernel(gamma, coef0)
             ]
-        y = N.random.randn(20)
+        y = N.random.randn(10)
         x = N.random.randn(len(y), 10)
         origdata = LibSvmRegressionDataSet(zip(y, x))
 
@@ -94,8 +94,24 @@
                     expt_grammat[i, j] = kernel(xi, xj, N.dot)
             # get a new dataset containing the precomputed data
             pcdata = origdata.precompute(kernel)
-            actual_grammat = pcdata.grammat[:,1:-1]['value']
-            assert_array_almost_equal(actual_grammat, expt_grammat)
+            for i, row in enumerate(pcdata.grammat):
+                valuerow = row[1:-1]['value']
+                assert_array_almost_equal(valuerow, expt_grammat[i])
 
+    def check_combine(self):
+        kernel = LinearKernel()
+
+        y1 = N.random.randn(2)
+        x1 = N.random.randn(len(y1), 2)
+        origdata = LibSvmRegressionDataSet(zip(y1, x1))
+        pcdata = origdata.precompute(kernel)
+
+        y2 = N.random.randn(1)
+        x2 = N.random.randn(len(y2), x1.shape[1])
+        moredata = LibSvmRegressionDataSet(zip(y2, x2))
+
+        #pcdata.combine(moredata)
+        #pcdata.copy_and_extend(moredata)
+
 if __name__ == '__main__':
     NumpyTest().run()