[Scipy-svn] r5213 - in trunk/scipy/stats: . tests

Sun Nov 30 23:08:11 EST 2008

Author: josef
Date: 2008-11-30 22:08:07 -0600 (Sun, 30 Nov 2008)
New Revision: 5213

Modified:
   trunk/scipy/stats/stats.py
   trunk/scipy/stats/tests/test_discrete_basic.py
   trunk/scipy/stats/tests/test_stats.py
Log:
correct and enhance stats.kstest, see ticket:395

Modified: trunk/scipy/stats/stats.py
===================================================================

--- trunk/scipy/stats/stats.py	2008-12-01 03:54:14 UTC (rev 5212)
+++ trunk/scipy/stats/stats.py	2008-12-01 04:08:07 UTC (rev 5213)
@@ -1946,23 +1946,101 @@
 
 import scipy.stats
 import distributions
-def kstest(rvs, cdf, args=(), N=20):
-    """Return the D-value and the p-value for a Kolmogorov-Smirnov test of
-    the null that N RV's generated by the rvs fits the cdf given the extra
-    arguments.  rvs needs to accept the size= keyword if a function.  rvs
-    can also be a vector of RVs.
+def kstest(rvs, cdf, args=(), N=20, alternative = 'unequal', mode='approx'):
+    """Return the D-value and the p-value for a Kolmogorov-Smirnov test
 
-    cdf can be a function or a string indicating the distribution type.
 
-    if the p-value is greater than the significance level (say 5%), then we
+    This performs a test of the distribution of random variables G(x) against
+    a given distribution F(x). Under the null hypothesis the two distributions
+    are identical, G(x)=F(x). The alternative hypothesis can be either
+    'unequal' (default), 'smaller' or 'larger'. In the two one-sided test,
+    the alternative is that the empirical cumulative distribution function,
+    of the random variable is "smaller" or "larger" then the cumulative
+    distribution function of the hypothesis F(x), G(x)<=F(x), resp. G(x)>=F(x).
+
+    If the p-value is greater than the significance level (say 5%), then we
     cannot reject the hypothesis that the data come from the given
     distribution.
+
+    Parameters
+    ----------
+    rvs : string or array or callable
+        string: name of a distribution in scipy.stats
+        array: random variables
+        callable: function to generate random variables,
+                requires keyword argument  size
+    cdf : string or callable
+        string: name of a distribution in scipy.stats
+            if rvs is a string then cdf can evaluate to False
+            or be the same as rvs
+        callable: function to evaluate cdf
+
+    args : distribution parameters used if rvs or cdf are strings
+    N : sample size if rvs is string or callable
+    alternative : 'unequal' (default), 'smaller' or 'larger'
+        defines the alternative hypothesis (see explanation)
+    mode : 'approx' (default) or 'asymp'
+        defines distribution used for calculating p-value
+        'approx' : use approximation to exact distribution of test statistic
+        'asymp' : use asymptotic distribution of test statistic
+    
+
+    Returns
+    -------
+    D: test statistic either D, D+ or D-
+    p-value
+
+    Examples
+    --------
+
+    >>> x = np.linspace(-15,15,9)
+    >>> kstest(x,'norm')
+    (0.44435602715924361, 0.038850142705171065)
+
+    >>> np.random.seed(987654321)
+    >>> kstest('norm','',N=100)
+    (0.058352892479417884, 0.88531190944151261)
+
+    is equivalent to this
+    >>> np.random.seed(987654321)
+    >>> kstest(stats.norm.rvs(size=100),'norm')
+    (0.058352892479417884, 0.88531190944151261)
+
+    test against one-sided alternative hypothesis
+    ---------------------------------------------
+    >>> np.random.seed(987654321)
+    >>> # shift distribution to larger values, so that cdf_dgp(x)< norm.cdf(x)
+    >>> x = stats.norm.rvs(loc=0.2, size=100) 
+    >>> kstest(x,'norm', alternative = 'smaller') 
+    (0.12464329735846891, 0.040989164077641749)
+    >>> # reject equal distribution against alternative hypothesis: smaller
+    >>> kstest(x,'norm', alternative = 'larger')
+    (0.0072115233216311081, 0.98531158590396395)
+    >>> # don't reject equal distribution against alternative hypothesis: larger
+    >>> kstest(x,'norm', mode='asymp')
+    (0.12464329735846891, 0.08944488871182088)
+    
+
+    testing t distributed random variables against normal distribution
+    ------------------------------------------------------------------
+    >>> np.random.seed(987654321)
+    >>> stats.kstest(stats.t.rvs(100,size=100),'norm')
+    (0.062018929165471248, 0.44505373063343567)
+    >>> np.random.seed(987654321)
+    >>> stats.kstest(stats.t.rvs(3,size=100),'norm')
+    (0.12101689575982888, 0.049143106661937996)
     """
     if isinstance(rvs, basestring):
-        cdf = getattr(scipy.stats, rvs).cdf
-        rvs = getattr(scipy.stats, rvs).rvs
+        #cdf = getattr(stats, rvs).cdf
+        if (not cdf) or (cdf == rvs):
+            cdf = getattr(distributions, rvs).cdf
+            rvs = getattr(distributions, rvs).rvs       
+        else:
+            raise AttributeError, 'if rvs is string, cdf has to be the same distribution'
+        
+        
     if isinstance(cdf, basestring):
-        cdf = getattr(scipy.stats, cdf).cdf
+        cdf = getattr(distributions, cdf).cdf
     if callable(rvs):
         kwds = {'size':N}
         vals = np.sort(rvs(*args,**kwds))
@@ -1970,11 +2048,27 @@
         vals = np.sort(rvs)
         N = len(vals)
     cdfvals = cdf(vals, *args)
-    D1 = np.absolute(cdfvals - np.arange(1.0, N+1)/N).max()
-#    D2 = amax(abs(cdfvals - arange(0.0,N)/N))
-#    D = max(D1,D2)
-    D = D1
-    return D, distributions.ksone.sf(D,N)
+    
+    if alternative in ['unequal', 'larger']:
+        Dplus = (np.arange(1.0, N+1)/N - cdfvals).max()
+        if alternative == 'larger':
+            return Dplus, distributions.ksone.sf(Dplus,N)
+            
+    if alternative in ['unequal', 'smaller']:
+        Dmin = (cdfvals - np.arange(0.0, N)/N).max()
+        if alternative == 'smaller':
+            return Dmin, distributions.ksone.sf(Dmin,N)
+        
+    if alternative == 'unequal':
+        D = np.max([Dplus,Dmin])
+        if mode == 'asymp':
+             return D, distributions.kstwobign.sf(D*np.sqrt(N))
+        if mode == 'approx':
+            pval_two = distributions.kstwobign.sf(D*np.sqrt(N))
+            if N > 2666 or pval_two > 0.80 - N*0.3/1000.0 :
+                return D, distributions.kstwobign.sf(D*np.sqrt(N))
+            else:
+                return D, distributions.ksone.sf(D,N)*2
 
 def chisquare(f_obs, f_exp=None):
     """ Calculates a one-way chi square for array of observed frequencies

Modified: trunk/scipy/stats/tests/test_discrete_basic.py
===================================================================
--- trunk/scipy/stats/tests/test_discrete_basic.py	2008-12-01 03:54:14 UTC (rev 5212)
+++ trunk/scipy/stats/tests/test_discrete_basic.py	2008-12-01 04:08:07 UTC (rev 5213)
@@ -244,12 +244,6 @@
            'at arg = %s with pval = %s' % (msg,str(arg),str(pval))
 
 
-
-
-
-
-
-
 if __name__ == "__main__":
     #nose.run(argv=['', __file__])
     nose.runmodule(argv=[__file__,'-s'], exit=False)

Modified: trunk/scipy/stats/tests/test_stats.py
===================================================================
--- trunk/scipy/stats/tests/test_stats.py	2008-12-01 03:54:14 UTC (rev 5212)
+++ trunk/scipy/stats/tests/test_stats.py	2008-12-01 04:08:07 UTC (rev 5213)
@@ -997,7 +997,32 @@
     assert_array_almost_equal(stats.mstats.friedmanchisquare(x3[0],x3[1],x3[2],x3[3]),(10.68, 0.0135882729582176))
     np.testing.assert_raises(ValueError,stats.mstats.friedmanchisquare,x3[0],x3[1])
 
+def test_kstest():
+    #from numpy.testing import assert_almost_equal
+    
+    # comparing with values from R
+    x = np.linspace(-1,1,9)
+    D,p = stats.kstest(x,'norm')
+    assert_almost_equal( D, 0.15865525393145705, 12)
+    assert_almost_equal( p, 0.95164069201518386, 1)
 
+    x = np.linspace(-15,15,9)
+    D,p = stats.kstest(x,'norm')
+    assert_almost_equal( D, 0.44435602715924361, 15)
+    assert_almost_equal( p, 0.038850140086788665, 8)
 
+    # the following tests rely on deterministicaly replicated rvs
+    np.random.seed(987654321)
+    x = stats.norm.rvs(loc=0.2, size=100)
+    D,p = stats.kstest(x, 'norm', mode='asymp')
+    assert_almost_equal( D, 0.12464329735846891, 15)
+    assert_almost_equal( p, 0.089444888711820769, 15)
+    assert_almost_equal( np.array(stats.kstest(x, 'norm', mode='asymp')),
+                np.array((0.12464329735846891, 0.089444888711820769)), 15)
+    assert_almost_equal( np.array(stats.kstest(x,'norm', alternative = 'smaller')),
+                np.array((0.12464329735846891, 0.040989164077641749)), 15)
+    assert_almost_equal( np.array(stats.kstest(x,'norm', alternative = 'larger')),
+                np.array((0.0072115233216310994, 0.98531158590396228)), 14)
+
 if __name__ == "__main__":
     run_module_suite()