[Scipy-svn] r2422 - trunk/Lib/sandbox/models
scipy-svn at scipy.org
scipy-svn at scipy.org
Fri Dec 15 13:02:08 EST 2006
Author: jonathan.taylor
Date: 2006-12-15 12:02:06 -0600 (Fri, 15 Dec 2006)
New Revision: 2422
Added:
trunk/Lib/sandbox/models/smoothers.py
Log:
scatterplot smoothers
Added: trunk/Lib/sandbox/models/smoothers.py
===================================================================
--- trunk/Lib/sandbox/models/smoothers.py 2006-12-15 18:00:40 UTC (rev 2421)
+++ trunk/Lib/sandbox/models/smoothers.py 2006-12-15 18:02:06 UTC (rev 2422)
@@ -0,0 +1,257 @@
+"""
+This module contains scatterplot smoothers, that is classes
+who generate a smooth fit of a set of (x,y) pairs.
+
+"""
+
+import numpy as N
+import numpy.linalg as L
+
+from scipy.optimize import golden
+from scipy.linalg import solveh_banded
+
+from bspline import bspline
+from utils import band2array
+
+from scipy.sandbox.models import _bspline
+
+
+class poly_smoother:
+
+ """
+ Polynomial smoother up to a given order.
+ Fit based on weighted least squares.
+
+ The x values can be specified at instantiation or when called.
+ """
+
+ def df_fit(self):
+ """
+ Degrees of freedom used in the fit.
+ """
+ return self.order + 1
+
+ def df_resid(self):
+ """
+ Residual degrees of freedom from last fit.
+ """
+ return self.N - self.order - 1
+
+ def __init__(self, order, x=None):
+ self.order = order
+ self.coef = N.zeros((order+1,), N.float64)
+ if x is not None:
+ self.X = N.array([x**i for i in range(order+1)]).T
+
+ def __call__(self, x=None):
+ if x is not None:
+ X = N.array([(x**i) for i in range(self.order+1)])
+ else: X = self.X
+ return N.squeeze(N.dot(X.T, self.coef))
+
+ def fit(self, y, x=None, weights=None):
+ self.N = y.shape[0]
+ if weights is None:
+ weights = 1
+ _w = N.sqrt(weights)
+ if x is None:
+ if not hasattr(self, "X"):
+ raise ValueError, "x needed to fit poly_smoother"
+ else:
+ self.X = N.array([(x**i) for i in range(self.order+1)])
+
+ X = self.X * _w
+
+ _y = y * _w
+ self.coef = N.dot(L.pinv(X).T, _y)
+
+class smoothing_spline(bspline):
+
+ penmax = 30.
+
+ def fit(self, y, x=None, weights=None, pen=0.):
+ banded = True
+
+ if x is None:
+ x = self.tau[(self.M-1):-(self.M-1)] # internal knots
+
+ if pen == 0.: # can't use cholesky for singular matrices
+ banded = False
+
+ if x.shape != y.shape:
+ raise ValueError, 'x and y shape do not agree, by default x are the Bspline\'s internal knots'
+
+ bt = self.basis(x)
+ if pen >= self.penmax:
+ pen = self.penmax
+
+ if weights is None:
+ weights = N.array(1.)
+
+ wmean = weights.mean()
+ _w = N.sqrt(weights / wmean)
+ bt *= _w
+
+ # throw out rows with zeros (this happens at boundary points!)
+
+ mask = N.flatnonzero(1 - N.alltrue(N.equal(bt, 0), axis=0))
+
+ bt = bt[:,mask]
+ y = y[mask]
+
+ self.df_total = y.shape[0]
+
+ if bt.shape[1] != y.shape[0]:
+ raise ValueError, "some x values are outside range of B-spline knots"
+ bty = N.dot(bt, _w * y)
+ self.N = y.shape[0]
+ if not banded:
+ self.btb = N.dot(bt, bt.T)
+ _g = band2array(self.g, lower=1, symmetric=True)
+ self.coef, _, self.rank = L.lstsq(self.btb + pen*_g, bty)[0:3]
+ self.rank = min(self.rank, self.btb.shape[0])
+ else:
+ self.btb = N.zeros(self.g.shape, N.float64)
+ nband, nbasis = self.g.shape
+ for i in range(nbasis):
+ for k in range(min(nband, nbasis-i)):
+ self.btb[k,i] = (bt[i] * bt[i+k]).sum()
+
+ bty.shape = (1,bty.shape[0])
+ self.chol, self.coef = solveh_banded(self.btb +
+ pen*self.g,
+ bty, lower=1)
+
+ self.coef = N.squeeze(self.coef)
+ self.resid = N.sqrt(wmean) * (y * _w - N.dot(self.coef, bt))
+ self.pen = pen
+
+ def gcv(self):
+ """
+ Generalized cross-validation score of current fit.
+ """
+
+ norm_resid = (self.resid**2).sum()
+ return norm_resid / (self.df_total - self.trace())
+
+ def df_resid(self):
+ """
+ self.N - self.trace()
+
+ where self.N is the number of observations of last fit.
+ """
+
+ return self.N - self.trace()
+
+ def df_fit(self):
+ """
+ = self.trace()
+
+ How many degrees of freedom used in the fit?
+ """
+ return self.trace()
+
+ def trace(self):
+ """
+ Trace of the smoothing matrix S(pen)
+ """
+
+ if self.pen > 0:
+ _invband = _bspline.invband(self.chol.copy())
+ tr = _trace_symbanded(_invband, self.btb, lower=1)
+ return tr
+ else:
+ return self.rank
+
+class smoothing_spline_fixeddf(smoothing_spline):
+
+ """
+ Fit smoothing spline with approximately df degrees of freedom
+ used in the fit, i.e. so that self.trace() is approximately df.
+
+ In general, df must be greater than the dimension of the null space
+ of the Gram inner product. For cubic smoothing splines, this means
+ that df > 2.
+
+ """
+
+ target_df = 5
+
+ def __init__(self, knots, order=4, coef=None, M=None, target_df=None):
+ if target_df is not None:
+ self.target_df = target_df
+ bspline.__init__(self, knots, order=order, coef=coef, M=M)
+ self.target_reached = False
+
+ def fit(self, y, x=None, df=None, weights=None, tol=1.0e-03):
+
+ df = df or self.target_df
+
+ apen, bpen = 0, 1.0e-03
+ olddf = y.shape[0] - self.m
+
+ if not self.target_reached:
+ while True:
+ curpen = 0.5 * (apen + bpen)
+ smoothing_spline.fit(self, y, x=x, weights=weights, pen=curpen)
+ curdf = self.trace()
+ if curdf > df:
+ apen, bpen = curpen, 2 * curpen
+ else:
+ apen, bpen = apen, curpen
+ if apen >= self.penmax:
+ raise ValueError, "penalty too large, try setting penmax higher or decreasing df"
+ if N.fabs(curdf - df) / df < tol:
+ self.target_reached = True
+ break
+ else:
+ smoothing_spline.fit(self, y, x=x, weights=weights, pen=self.pen)
+
+class smoothing_spline_gcv(smoothing_spline):
+
+ """
+ Fit smoothing spline trying to optimize GCV.
+
+ Try to find a bracketing interval for scipy.optimize.golden
+ based on bracket.
+
+ It is probably best to use target_df instead, as it is
+ sometimes difficult to find a bracketing interval.
+
+ """
+
+ def fit(self, y, x=None, weights=None, tol=1.0e-03,
+ bracket=(0,1.0e-03)):
+
+ def _gcv(pen, y, x):
+ smoothing_spline.fit(y, x=x, pen=N.exp(pen), weights=weights)
+ a = self.gcv()
+ return a
+
+ a = golden(_gcv, args=(y,x), brack=(-100,20), tol=tol)
+
+def _trace_symbanded(a,b, lower=0):
+ """
+ Compute the trace(a*b) for two upper or lower banded real symmetric matrices.
+ """
+
+ if lower:
+ t = _zero_triband(a * b, lower=1)
+ return t[0].sum() + 2 * t[1:].sum()
+ else:
+ t = _zero_triband(a * b, lower=0)
+ return t[-1].sum() + 2 * t[:-1].sum()
+
+
+
+def _zero_triband(a, lower=0):
+ """
+ Zero out unnecessary elements of a real symmetric banded matrix.
+ """
+
+ nrow, ncol = a.shape
+ if lower:
+ for i in range(nrow): a[i,(ncol-i):] = 0.
+ else:
+ for i in range(nrow): a[i,0:i] = 0.
+ return a
More information about the Scipy-svn
mailing list