[Scipy-svn] r5213 - in trunk/scipy/stats: . tests
scipy-svn at scipy.org
scipy-svn at scipy.org
Sun Nov 30 23:08:11 EST 2008
Author: josef
Date: 2008-11-30 22:08:07 -0600 (Sun, 30 Nov 2008)
New Revision: 5213
Modified:
trunk/scipy/stats/stats.py
trunk/scipy/stats/tests/test_discrete_basic.py
trunk/scipy/stats/tests/test_stats.py
Log:
correct and enhance stats.kstest, see ticket:395
Modified: trunk/scipy/stats/stats.py
===================================================================
--- trunk/scipy/stats/stats.py 2008-12-01 03:54:14 UTC (rev 5212)
+++ trunk/scipy/stats/stats.py 2008-12-01 04:08:07 UTC (rev 5213)
@@ -1946,23 +1946,101 @@
import scipy.stats
import distributions
-def kstest(rvs, cdf, args=(), N=20):
- """Return the D-value and the p-value for a Kolmogorov-Smirnov test of
- the null that N RV's generated by the rvs fits the cdf given the extra
- arguments. rvs needs to accept the size= keyword if a function. rvs
- can also be a vector of RVs.
+def kstest(rvs, cdf, args=(), N=20, alternative = 'unequal', mode='approx'):
+ """Return the D-value and the p-value for a Kolmogorov-Smirnov test
- cdf can be a function or a string indicating the distribution type.
- if the p-value is greater than the significance level (say 5%), then we
+ This performs a test of the distribution of random variables G(x) against
+ a given distribution F(x). Under the null hypothesis the two distributions
+ are identical, G(x)=F(x). The alternative hypothesis can be either
+ 'unequal' (default), 'smaller' or 'larger'. In the two one-sided test,
+ the alternative is that the empirical cumulative distribution function,
+ of the random variable is "smaller" or "larger" then the cumulative
+ distribution function of the hypothesis F(x), G(x)<=F(x), resp. G(x)>=F(x).
+
+ If the p-value is greater than the significance level (say 5%), then we
cannot reject the hypothesis that the data come from the given
distribution.
+
+ Parameters
+ ----------
+ rvs : string or array or callable
+ string: name of a distribution in scipy.stats
+ array: random variables
+ callable: function to generate random variables,
+ requires keyword argument size
+ cdf : string or callable
+ string: name of a distribution in scipy.stats
+ if rvs is a string then cdf can evaluate to False
+ or be the same as rvs
+ callable: function to evaluate cdf
+
+ args : distribution parameters used if rvs or cdf are strings
+ N : sample size if rvs is string or callable
+ alternative : 'unequal' (default), 'smaller' or 'larger'
+ defines the alternative hypothesis (see explanation)
+ mode : 'approx' (default) or 'asymp'
+ defines distribution used for calculating p-value
+ 'approx' : use approximation to exact distribution of test statistic
+ 'asymp' : use asymptotic distribution of test statistic
+
+
+ Returns
+ -------
+ D: test statistic either D, D+ or D-
+ p-value
+
+ Examples
+ --------
+
+ >>> x = np.linspace(-15,15,9)
+ >>> kstest(x,'norm')
+ (0.44435602715924361, 0.038850142705171065)
+
+ >>> np.random.seed(987654321)
+ >>> kstest('norm','',N=100)
+ (0.058352892479417884, 0.88531190944151261)
+
+ is equivalent to this
+ >>> np.random.seed(987654321)
+ >>> kstest(stats.norm.rvs(size=100),'norm')
+ (0.058352892479417884, 0.88531190944151261)
+
+ test against one-sided alternative hypothesis
+ ---------------------------------------------
+ >>> np.random.seed(987654321)
+ >>> # shift distribution to larger values, so that cdf_dgp(x)< norm.cdf(x)
+ >>> x = stats.norm.rvs(loc=0.2, size=100)
+ >>> kstest(x,'norm', alternative = 'smaller')
+ (0.12464329735846891, 0.040989164077641749)
+ >>> # reject equal distribution against alternative hypothesis: smaller
+ >>> kstest(x,'norm', alternative = 'larger')
+ (0.0072115233216311081, 0.98531158590396395)
+ >>> # don't reject equal distribution against alternative hypothesis: larger
+ >>> kstest(x,'norm', mode='asymp')
+ (0.12464329735846891, 0.08944488871182088)
+
+
+ testing t distributed random variables against normal distribution
+ ------------------------------------------------------------------
+ >>> np.random.seed(987654321)
+ >>> stats.kstest(stats.t.rvs(100,size=100),'norm')
+ (0.062018929165471248, 0.44505373063343567)
+ >>> np.random.seed(987654321)
+ >>> stats.kstest(stats.t.rvs(3,size=100),'norm')
+ (0.12101689575982888, 0.049143106661937996)
"""
if isinstance(rvs, basestring):
- cdf = getattr(scipy.stats, rvs).cdf
- rvs = getattr(scipy.stats, rvs).rvs
+ #cdf = getattr(stats, rvs).cdf
+ if (not cdf) or (cdf == rvs):
+ cdf = getattr(distributions, rvs).cdf
+ rvs = getattr(distributions, rvs).rvs
+ else:
+ raise AttributeError, 'if rvs is string, cdf has to be the same distribution'
+
+
if isinstance(cdf, basestring):
- cdf = getattr(scipy.stats, cdf).cdf
+ cdf = getattr(distributions, cdf).cdf
if callable(rvs):
kwds = {'size':N}
vals = np.sort(rvs(*args,**kwds))
@@ -1970,11 +2048,27 @@
vals = np.sort(rvs)
N = len(vals)
cdfvals = cdf(vals, *args)
- D1 = np.absolute(cdfvals - np.arange(1.0, N+1)/N).max()
-# D2 = amax(abs(cdfvals - arange(0.0,N)/N))
-# D = max(D1,D2)
- D = D1
- return D, distributions.ksone.sf(D,N)
+
+ if alternative in ['unequal', 'larger']:
+ Dplus = (np.arange(1.0, N+1)/N - cdfvals).max()
+ if alternative == 'larger':
+ return Dplus, distributions.ksone.sf(Dplus,N)
+
+ if alternative in ['unequal', 'smaller']:
+ Dmin = (cdfvals - np.arange(0.0, N)/N).max()
+ if alternative == 'smaller':
+ return Dmin, distributions.ksone.sf(Dmin,N)
+
+ if alternative == 'unequal':
+ D = np.max([Dplus,Dmin])
+ if mode == 'asymp':
+ return D, distributions.kstwobign.sf(D*np.sqrt(N))
+ if mode == 'approx':
+ pval_two = distributions.kstwobign.sf(D*np.sqrt(N))
+ if N > 2666 or pval_two > 0.80 - N*0.3/1000.0 :
+ return D, distributions.kstwobign.sf(D*np.sqrt(N))
+ else:
+ return D, distributions.ksone.sf(D,N)*2
def chisquare(f_obs, f_exp=None):
""" Calculates a one-way chi square for array of observed frequencies
Modified: trunk/scipy/stats/tests/test_discrete_basic.py
===================================================================
--- trunk/scipy/stats/tests/test_discrete_basic.py 2008-12-01 03:54:14 UTC (rev 5212)
+++ trunk/scipy/stats/tests/test_discrete_basic.py 2008-12-01 04:08:07 UTC (rev 5213)
@@ -244,12 +244,6 @@
'at arg = %s with pval = %s' % (msg,str(arg),str(pval))
-
-
-
-
-
-
if __name__ == "__main__":
#nose.run(argv=['', __file__])
nose.runmodule(argv=[__file__,'-s'], exit=False)
Modified: trunk/scipy/stats/tests/test_stats.py
===================================================================
--- trunk/scipy/stats/tests/test_stats.py 2008-12-01 03:54:14 UTC (rev 5212)
+++ trunk/scipy/stats/tests/test_stats.py 2008-12-01 04:08:07 UTC (rev 5213)
@@ -997,7 +997,32 @@
assert_array_almost_equal(stats.mstats.friedmanchisquare(x3[0],x3[1],x3[2],x3[3]),(10.68, 0.0135882729582176))
np.testing.assert_raises(ValueError,stats.mstats.friedmanchisquare,x3[0],x3[1])
+def test_kstest():
+ #from numpy.testing import assert_almost_equal
+
+ # comparing with values from R
+ x = np.linspace(-1,1,9)
+ D,p = stats.kstest(x,'norm')
+ assert_almost_equal( D, 0.15865525393145705, 12)
+ assert_almost_equal( p, 0.95164069201518386, 1)
+ x = np.linspace(-15,15,9)
+ D,p = stats.kstest(x,'norm')
+ assert_almost_equal( D, 0.44435602715924361, 15)
+ assert_almost_equal( p, 0.038850140086788665, 8)
+ # the following tests rely on deterministicaly replicated rvs
+ np.random.seed(987654321)
+ x = stats.norm.rvs(loc=0.2, size=100)
+ D,p = stats.kstest(x, 'norm', mode='asymp')
+ assert_almost_equal( D, 0.12464329735846891, 15)
+ assert_almost_equal( p, 0.089444888711820769, 15)
+ assert_almost_equal( np.array(stats.kstest(x, 'norm', mode='asymp')),
+ np.array((0.12464329735846891, 0.089444888711820769)), 15)
+ assert_almost_equal( np.array(stats.kstest(x,'norm', alternative = 'smaller')),
+ np.array((0.12464329735846891, 0.040989164077641749)), 15)
+ assert_almost_equal( np.array(stats.kstest(x,'norm', alternative = 'larger')),
+ np.array((0.0072115233216310994, 0.98531158590396228)), 14)
+
if __name__ == "__main__":
run_module_suite()
More information about the Scipy-svn
mailing list