[Scipy-svn] r6903 - in trunk/scipy/stats: . tests
scipy-svn at scipy.org
scipy-svn at scipy.org
Wed Nov 17 09:35:26 EST 2010
Author: rgommers
Date: 2010-11-17 08:35:25 -0600 (Wed, 17 Nov 2010)
New Revision: 6903
Modified:
trunk/scipy/stats/stats.py
trunk/scipy/stats/tests/test_stats.py
Log:
ENH: Add a Fisher exact test to stats. Closes #956.
Thanks to:
David Simcha, for original implementation.
Josef, for code review and testing against R.
Modified: trunk/scipy/stats/stats.py
===================================================================
--- trunk/scipy/stats/stats.py 2010-11-16 21:18:37 UTC (rev 6902)
+++ trunk/scipy/stats/stats.py 2010-11-17 14:35:25 UTC (rev 6903)
@@ -86,6 +86,7 @@
CORRELATION FCNS: paired
pearsonr
+ fisher_exact
spearmanr
pointbiserialr
kendalltau
@@ -219,8 +220,8 @@
'obrientransform', 'samplevar', 'samplestd', 'signaltonoise',
'var', 'std', 'stderr', 'sem', 'z', 'zs', 'zmap', 'zscore',
'threshold', 'sigmaclip', 'trimboth', 'trim1', 'trim_mean',
- 'cov', 'corrcoef', 'f_oneway', 'pearsonr', 'spearmanr',
- 'pointbiserialr', 'kendalltau', 'linregress',
+ 'cov', 'corrcoef', 'f_oneway', 'pearsonr', 'fisher_exact',
+ 'spearmanr', 'pointbiserialr', 'kendalltau', 'linregress',
'ttest_1samp', 'ttest_ind', 'ttest_rel',
'kstest', 'chisquare', 'ks_2samp', 'mannwhitneyu',
'tiecorrect', 'ranksums', 'kruskal', 'friedmanchisquare',
@@ -2373,6 +2374,127 @@
return r,prob
+def fisher_exact(c) :
+ """Performs a Fisher exact test on a 2x2 contingency table.
+
+ Parameters
+ ----------
+ c : array_like of ints
+ A 2x2 contingency table.
+
+ Returns
+ -------
+ oddsratio : float
+ This is prior odds ratio and not a posterior estimate.
+ p_value : float
+ P-value for 2-sided hypothesis of independence.
+
+ Notes
+ -----
+ The calculated odds ratio is different from the one R uses. In R language,
+ this implementation returns the (more common) "unconditional Maximum
+ Likelihood Estimate", while R uses the "conditional Maximum Likelihood
+ Estimate".
+
+ Examples
+ --------
+ >>> fisher_exact([[100, 2], [1000, 5]])
+ (0.25, 0.13007593634330314)
+ """
+ hypergeom = distributions.hypergeom
+ c = np.asarray(c, dtype=np.int64) # int32 is not enough for the algorithm
+
+ if c[1,0] > 0 and c[0,1] > 0:
+ odssratio = c[0,0] * c[1,1] / float(c[1,0] * c[0,1])
+ else:
+ odssratio = np.inf
+
+ n1 = c[0,0] + c[0,1]
+ n2 = c[1,0] + c[1,1]
+ n = c[0,0] + c[1,0]
+
+ mode = int(float((n + 1) * (n1 + 1)) / (n1 + n2 + 2))
+ pexact = hypergeom.pmf(c[0,0], n1 + n2, n1, n)
+ pmode = hypergeom.pmf(mode, n1 + n2, n1, n)
+
+ epsilon = 1 - 1e-4
+ if float(np.abs(pexact - pmode)) / np.abs(np.max(pexact, pmode)) <= 1 - epsilon:
+ return odssratio, 1
+
+ elif c[0,0] < mode:
+ plower = hypergeom.cdf(c[0,0], n1 + n2, n1, n)
+
+ if hypergeom.pmf(n, n1 + n2, n1, n) > pexact / epsilon:
+ return odssratio, plower
+
+ # Binary search for where to begin upper half.
+ minval = mode
+ maxval = n
+ guess = -1
+ while maxval - minval > 1:
+ if maxval == minval + 1 and guess == minval:
+ guess = maxval
+ else:
+ guess = (maxval + minval) / 2
+
+ pguess = hypergeom.pmf(guess, n1 + n2, n1, n)
+ if pguess <= pexact and hypergeom.pmf(guess - 1, n1 + n2, n1, n) > pexact:
+ break
+ elif pguess < pexact:
+ maxval = guess
+ else:
+ minval = guess
+
+ if guess == -1:
+ guess = minval
+
+ while guess > 0 and hypergeom.pmf(guess, n1 + n2, n1, n) < pexact * epsilon:
+ guess -= 1
+
+ while hypergeom.pmf(guess, n1 + n2, n1, n) > pexact / epsilon:
+ guess += 1
+
+ p = plower + hypergeom.sf(guess - 1, n1 + n2, n1, n)
+ if p > 1.0:
+ p = 1.0
+ return odssratio, p
+ else:
+ pupper = hypergeom.sf(c[0,0] - 1, n1 + n2, n1, n)
+ if hypergeom.pmf(0, n1 + n2, n1, n) > pexact / epsilon:
+ return odssratio, pupper
+
+ # Binary search for where to begin lower half.
+ minval = 0
+ maxval = mode
+ guess = -1
+ while maxval - minval > 1:
+ if maxval == minval + 1 and guess == minval:
+ guess = maxval
+ else:
+ guess = (maxval + minval) / 2
+ pguess = hypergeom.pmf(guess, n1 + n2, n1, n)
+ if pguess <= pexact and hypergeom.pmf(guess + 1, n1 + n2, n1, n) > pexact:
+ break
+ elif pguess <= pexact:
+ minval = guess
+ else:
+ maxval = guess
+
+ if guess == -1:
+ guess = minval
+
+ while hypergeom.pmf(guess, n1 + n2, n1, n) < pexact * epsilon:
+ guess += 1
+
+ while guess > 0 and hypergeom.pmf(guess, n1 + n2, n1, n) > pexact / epsilon:
+ guess -= 1
+
+ p = pupper + hypergeom.cdf(guess, n1 + n2, n1, n)
+ if p > 1.0:
+ p = 1.0
+ return odssratio, p
+
+
def spearmanr(a, b=None, axis=0):
"""
Calculates a Spearman rank-order correlation coefficient and the p-value
Modified: trunk/scipy/stats/tests/test_stats.py
===================================================================
--- trunk/scipy/stats/tests/test_stats.py 2010-11-16 21:18:37 UTC (rev 6902)
+++ trunk/scipy/stats/tests/test_stats.py 2010-11-17 14:35:25 UTC (rev 6903)
@@ -368,6 +368,90 @@
assert_approx_equal(r,1.0)
+def test_fisher_exact():
+ """Some tests to show that fisher_exact() works correctly.
+
+ Testing the hypergeometric survival function against R's, showing that one
+ of them (probably Scipy's) is slightly defective (see the test with
+ significant=1). This is probably because, in distributions.py, Scipy
+ uses 1.0 - cdf as the sf instead of calculating the sf more directly
+ for improved numerical accuracy.
+
+ Also note that R and Scipy have different argument formats for their
+ hypergeometric distrib functions.
+
+ R:
+ > phyper(18999, 99000, 110000, 39000, lower.tail = FALSE)
+ [1] 1.701815e-09
+ """
+ fisher_exact = stats.fisher_exact
+
+ res = fisher_exact([[18000, 80000], [20000, 90000]])[1]
+ assert_approx_equal(res, 0.2751, significant=4)
+ res = fisher_exact([[14500, 20000], [30000, 40000]])[1]
+ assert_approx_equal(res, 0.01106, significant=4)
+ res = fisher_exact([[100, 2], [1000, 5]])[1]
+ assert_approx_equal(res, 0.1301, significant=4)
+ res = fisher_exact([[2, 7], [8, 2]])[1]
+ assert_approx_equal(res, 0.0230141, significant=6)
+ res = fisher_exact([[5, 1], [10, 10]])[1]
+ assert_approx_equal(res, 0.1973244, significant=6)
+ res = fisher_exact([[5, 15], [20, 20]])[1]
+ assert_approx_equal(res, 0.0958044, significant=6)
+ res = fisher_exact([[5, 16], [20, 25]])[1]
+ assert_approx_equal(res, 0.1725862, significant=6)
+ res = fisher_exact([[10, 5], [10, 1]])[1]
+ assert_approx_equal(res, 0.1973244, significant=6)
+ res = fisher_exact([[5, 0], [1, 4]])[1]
+ assert_approx_equal(res, 0.04761904, significant=6)
+ res = fisher_exact([[0, 1], [3, 2]])[1]
+ assert_approx_equal(res, 1.0)
+ res = fisher_exact([[0, 2], [6, 4]])[1]
+ assert_approx_equal(res, 0.4545454545)
+ res = fisher_exact([[2, 7], [8, 2]])
+ assert_approx_equal(res[1], 0.0230141, significant=6)
+ assert_approx_equal(res[0], 4.0 / 56)
+
+ # High tolerance due to survival function inaccuracy.
+ res = fisher_exact([[19000, 80000], [20000, 90000]])[1]
+ assert_approx_equal(res, 3.319e-9, significant=1)
+
+ tablelist = ( [[100, 2], [1000, 5]],
+ [[2, 7], [8, 2]],
+ [[5, 1], [10, 10]],
+ [[5, 15], [20, 20]],
+ [[5, 16], [20, 25]],
+ [[10, 5], [10, 1]],
+ [[10, 5], [10, 0]],
+ [[5,0], [1, 4]],
+ [[0,5], [1, 4]],
+ [[5,1], [0, 4]],
+ [[0, 1], [3, 2]] )
+ for table in tablelist:
+ # results from R
+ #
+ # R defines oddsratio differently (see Notes section of fisher_exact
+ # docstring), so those will not match. We leave them in anyway, in
+ # case they will be useful later on. We test only the p-value.
+ tablist = [
+ ([[100, 2], [1000, 5]], (2.505583993422285e-001, 1.300759363430016e-001)),
+ ([[2, 7], [8, 2]], (8.586235135736206e-002, 2.301413756522114e-002)),
+ ([[5, 1], [10, 10]], (4.725646047336584e+000, 1.973244147157190e-001)),
+ ([[5, 15], [20, 20]], (3.394396617440852e-001, 9.580440012477637e-002)),
+ ([[5, 16], [20, 25]], (3.960558326183334e-001, 1.725864953812994e-001)),
+ ([[10, 5], [10, 1]], (2.116112781158483e-001, 1.973244147157190e-001)),
+ ([[10, 5], [10, 0]], (0.000000000000000e+000, 6.126482213438734e-002)),
+ ([[5, 0], [1, 4]], (np.inf, 4.761904761904762e-002)),
+ ([[0, 5], [1, 4]], (0.000000000000000e+000, 1.000000000000000e+000)),
+ ([[5, 1], [0, 4]], (np.inf, 4.761904761904758e-002)),
+ ([[0, 1], [3, 2]], (0.000000000000000e+000, 1.000000000000000e+000))
+ ]
+ for table, res_r in tablist:
+ res = fisher_exact(np.asarray(table))
+ np.testing.assert_almost_equal(res[1], res_r[1], decimal=11,
+ verbose=True)
+
+
class TestCorrSpearmanr(TestCase):
""" W.II.D. Compute a correlation matrix on all the variables.
More information about the Scipy-svn
mailing list