[Scipy-svn] r6903 - in trunk/scipy/stats: . tests

Wed Nov 17 09:35:26 EST 2010

Author: rgommers
Date: 2010-11-17 08:35:25 -0600 (Wed, 17 Nov 2010)
New Revision: 6903

Modified:
   trunk/scipy/stats/stats.py
   trunk/scipy/stats/tests/test_stats.py
Log:
ENH: Add a Fisher exact test to stats. Closes #956.

Thanks to:
David Simcha, for original implementation.
Josef, for code review and testing against R.

Modified: trunk/scipy/stats/stats.py
===================================================================

--- trunk/scipy/stats/stats.py	2010-11-16 21:18:37 UTC (rev 6902)
+++ trunk/scipy/stats/stats.py	2010-11-17 14:35:25 UTC (rev 6903)
@@ -86,6 +86,7 @@
 
 CORRELATION FCNS:  paired
                    pearsonr
+                   fisher_exact
                    spearmanr
                    pointbiserialr
                    kendalltau
@@ -219,8 +220,8 @@
            'obrientransform', 'samplevar', 'samplestd', 'signaltonoise',
            'var', 'std', 'stderr', 'sem', 'z', 'zs', 'zmap', 'zscore',
            'threshold', 'sigmaclip', 'trimboth', 'trim1', 'trim_mean',
-           'cov', 'corrcoef', 'f_oneway', 'pearsonr', 'spearmanr',
-           'pointbiserialr', 'kendalltau', 'linregress',
+           'cov', 'corrcoef', 'f_oneway', 'pearsonr', 'fisher_exact',
+           'spearmanr', 'pointbiserialr', 'kendalltau', 'linregress',
            'ttest_1samp', 'ttest_ind', 'ttest_rel',
            'kstest', 'chisquare', 'ks_2samp', 'mannwhitneyu',
            'tiecorrect', 'ranksums', 'kruskal', 'friedmanchisquare',
@@ -2373,6 +2374,127 @@
     return r,prob
 
 
+def fisher_exact(c) :
+    """Performs a Fisher exact test on a 2x2 contingency table.
+
+    Parameters
+    ----------
+    c : array_like of ints
+        A 2x2 contingency table.
+
+    Returns
+    -------
+    oddsratio : float
+        This is prior odds ratio and not a posterior estimate.
+    p_value : float
+        P-value for 2-sided hypothesis of independence.
+
+    Notes
+    -----
+    The calculated odds ratio is different from the one R uses. In R language,
+    this implementation returns the (more common) "unconditional Maximum
+    Likelihood Estimate", while R uses the "conditional Maximum Likelihood
+    Estimate".
+
+    Examples
+    --------
+    >>> fisher_exact([[100, 2], [1000, 5]])
+    (0.25, 0.13007593634330314)
+    """
+    hypergeom = distributions.hypergeom
+    c = np.asarray(c, dtype=np.int64)  # int32 is not enough for the algorithm
+
+    if c[1,0] > 0 and c[0,1] > 0:
+        odssratio = c[0,0] * c[1,1] / float(c[1,0] * c[0,1])
+    else:
+        odssratio = np.inf
+
+    n1 = c[0,0] + c[0,1]
+    n2 = c[1,0] + c[1,1]
+    n  = c[0,0] + c[1,0]
+
+    mode = int(float((n + 1) * (n1 + 1)) / (n1 + n2 + 2))
+    pexact = hypergeom.pmf(c[0,0], n1 + n2, n1, n)
+    pmode = hypergeom.pmf(mode, n1 + n2, n1, n)
+
+    epsilon = 1 - 1e-4
+    if float(np.abs(pexact - pmode)) / np.abs(np.max(pexact, pmode)) <= 1 - epsilon:
+        return odssratio, 1
+
+    elif c[0,0] < mode:
+        plower = hypergeom.cdf(c[0,0], n1 + n2, n1, n)
+
+        if hypergeom.pmf(n, n1 + n2, n1, n) > pexact / epsilon:
+            return odssratio, plower
+
+        # Binary search for where to begin upper half.
+        minval = mode
+        maxval = n
+        guess = -1
+        while maxval - minval > 1:
+            if maxval == minval + 1 and guess == minval:
+                guess = maxval
+            else:
+                guess = (maxval + minval) / 2
+
+            pguess = hypergeom.pmf(guess, n1 + n2, n1, n)
+            if pguess <= pexact and hypergeom.pmf(guess - 1, n1 + n2, n1, n) > pexact:
+                break
+            elif pguess < pexact:
+                maxval = guess
+            else:
+                minval = guess
+
+        if guess == -1:
+            guess = minval
+
+        while guess > 0 and hypergeom.pmf(guess, n1 + n2, n1, n) < pexact * epsilon:
+            guess -= 1
+
+        while hypergeom.pmf(guess, n1 + n2, n1, n) > pexact / epsilon:
+            guess += 1
+
+        p = plower + hypergeom.sf(guess - 1, n1 + n2, n1, n)
+        if p > 1.0:
+            p = 1.0
+        return odssratio, p
+    else:
+        pupper = hypergeom.sf(c[0,0] - 1, n1 + n2, n1, n)
+        if hypergeom.pmf(0, n1 + n2, n1, n) > pexact / epsilon:
+            return odssratio, pupper
+
+        # Binary search for where to begin lower half.
+        minval = 0
+        maxval = mode
+        guess = -1
+        while maxval - minval > 1:
+            if maxval == minval + 1 and guess == minval:
+                guess = maxval
+            else:
+                guess = (maxval + minval) / 2
+            pguess = hypergeom.pmf(guess, n1 + n2, n1, n)
+            if pguess <= pexact and hypergeom.pmf(guess + 1, n1 + n2, n1, n) > pexact:
+                break
+            elif pguess <= pexact:
+                minval = guess
+            else:
+                maxval = guess
+
+        if guess == -1:
+            guess = minval
+
+        while hypergeom.pmf(guess, n1 + n2, n1, n) < pexact * epsilon:
+            guess += 1
+
+        while guess > 0 and hypergeom.pmf(guess, n1 + n2, n1, n) > pexact / epsilon:
+            guess -= 1
+
+        p = pupper + hypergeom.cdf(guess, n1 + n2, n1, n)
+        if p > 1.0:
+            p = 1.0
+        return odssratio, p
+
+
 def spearmanr(a, b=None, axis=0):
     """
     Calculates a Spearman rank-order correlation coefficient and the p-value

Modified: trunk/scipy/stats/tests/test_stats.py
===================================================================
--- trunk/scipy/stats/tests/test_stats.py	2010-11-16 21:18:37 UTC (rev 6902)
+++ trunk/scipy/stats/tests/test_stats.py	2010-11-17 14:35:25 UTC (rev 6903)
@@ -368,6 +368,90 @@
         assert_approx_equal(r,1.0)
 
 
+def test_fisher_exact():
+    """Some tests to show that fisher_exact() works correctly.
+
+    Testing the hypergeometric survival function against R's, showing that one
+    of them (probably Scipy's) is slightly defective (see the test with
+    significant=1).  This is probably because, in distributions.py, Scipy
+    uses 1.0 - cdf as the sf instead of calculating the sf more directly
+    for improved numerical accuracy.
+
+    Also note that R and Scipy have different argument formats for their
+    hypergeometric distrib functions.
+
+    R:
+    > phyper(18999, 99000, 110000, 39000, lower.tail = FALSE)
+    [1] 1.701815e-09
+    """
+    fisher_exact = stats.fisher_exact
+
+    res = fisher_exact([[18000, 80000], [20000, 90000]])[1]
+    assert_approx_equal(res, 0.2751, significant=4)
+    res = fisher_exact([[14500, 20000], [30000, 40000]])[1]
+    assert_approx_equal(res, 0.01106, significant=4)
+    res = fisher_exact([[100, 2], [1000, 5]])[1]
+    assert_approx_equal(res, 0.1301, significant=4)
+    res = fisher_exact([[2, 7], [8, 2]])[1]
+    assert_approx_equal(res, 0.0230141, significant=6)
+    res = fisher_exact([[5, 1], [10, 10]])[1]
+    assert_approx_equal(res, 0.1973244, significant=6)
+    res = fisher_exact([[5, 15], [20, 20]])[1]
+    assert_approx_equal(res, 0.0958044, significant=6)
+    res = fisher_exact([[5, 16], [20, 25]])[1]
+    assert_approx_equal(res, 0.1725862, significant=6)
+    res = fisher_exact([[10, 5], [10, 1]])[1]
+    assert_approx_equal(res, 0.1973244, significant=6)
+    res = fisher_exact([[5, 0], [1, 4]])[1]
+    assert_approx_equal(res, 0.04761904, significant=6)
+    res = fisher_exact([[0, 1], [3, 2]])[1]
+    assert_approx_equal(res, 1.0)
+    res = fisher_exact([[0, 2], [6, 4]])[1]
+    assert_approx_equal(res, 0.4545454545)
+    res = fisher_exact([[2, 7], [8, 2]])
+    assert_approx_equal(res[1], 0.0230141, significant=6)
+    assert_approx_equal(res[0], 4.0 / 56)
+
+    # High tolerance due to survival function inaccuracy.
+    res = fisher_exact([[19000, 80000], [20000, 90000]])[1]
+    assert_approx_equal(res, 3.319e-9, significant=1)
+
+    tablelist = ( [[100, 2], [1000, 5]],
+                  [[2, 7], [8, 2]],
+                  [[5, 1], [10, 10]],
+                  [[5, 15], [20, 20]],
+                  [[5, 16], [20, 25]],
+                  [[10, 5], [10, 1]],
+                  [[10, 5], [10, 0]],
+                  [[5,0], [1, 4]],
+                  [[0,5], [1, 4]],
+                  [[5,1], [0, 4]],
+                  [[0, 1], [3, 2]] )
+    for table in tablelist:
+        # results from R
+        #
+        # R defines oddsratio differently (see Notes section of fisher_exact
+        # docstring), so those will not match.  We leave them in anyway, in
+        # case they will be useful later on. We test only the p-value.
+        tablist = [
+            ([[100, 2], [1000, 5]], (2.505583993422285e-001,  1.300759363430016e-001)),
+            ([[2, 7], [8, 2]], (8.586235135736206e-002,  2.301413756522114e-002)),
+            ([[5, 1], [10, 10]], (4.725646047336584e+000,  1.973244147157190e-001)),
+            ([[5, 15], [20, 20]], (3.394396617440852e-001,  9.580440012477637e-002)),
+            ([[5, 16], [20, 25]], (3.960558326183334e-001,  1.725864953812994e-001)),
+            ([[10, 5], [10, 1]], (2.116112781158483e-001,  1.973244147157190e-001)),
+            ([[10, 5], [10, 0]], (0.000000000000000e+000,  6.126482213438734e-002)),
+            ([[5, 0], [1, 4]], (np.inf,  4.761904761904762e-002)),
+            ([[0, 5], [1, 4]], (0.000000000000000e+000,  1.000000000000000e+000)),
+            ([[5, 1], [0, 4]], (np.inf,  4.761904761904758e-002)),
+            ([[0, 1], [3, 2]], (0.000000000000000e+000,  1.000000000000000e+000))
+            ]
+    for table, res_r in tablist:
+        res = fisher_exact(np.asarray(table))
+        np.testing.assert_almost_equal(res[1], res_r[1], decimal=11,
+                                       verbose=True)
+
+
 class TestCorrSpearmanr(TestCase):
     """ W.II.D. Compute a correlation matrix on all the variables.