[SciPy-Dev] scipy.stats: algorithm to for ticket 1493

Mon Apr 23 09:21:31 EDT 2012

On Sun, Apr 22, 2012 at 8:44 PM,  <josef.pktd at gmail.com> wrote:
> On Sun, Apr 22, 2012 at 8:27 PM,  <josef.pktd at gmail.com> wrote:
>> On Sun, Apr 22, 2012 at 2:42 PM, nicky van foreest <vanforeest at gmail.com> wrote:
>>> I just realized, xa may be too large... hence we should search such
>>> that cdf(left) < q < cdf(right).
>>>
>>> *Assuming* that xa < 0 and xb > 0 the following should be better
>>>
>>> def findppf(q):
>>>    # search  until cdf(left) < q < cdf(right)
>>>    left, right = invnorm.xa, invnorm.xb
>>>    while invnorm.cdf(left, 7.24000019602, scale=2.51913630166) > q:
>>>        right = left
>>>        left *= 2
>>>    while invnorm.cdf(right, 7.24000019602, scale=2.51913630166) < q:
>>>        left = right
>>>        right *= 2
>>>    return optimize.brentq(lambda x: \
>>>                           invnorm.cdf(x, 7.24000019602,
>>> scale=2.51913630166) - q,\
>>>                           left, right)
>>>
>>> Should a test on xa < 0 and xb>0 be added?
>>
>> for xa, xb it doesn't matter whether they are larger or smaller than
>> zero, so I don't think we need a special check
>>
>> it looks good in a few more example cases.
>>
>> The difficult cases will be where cdf also doesn't exist and we need
>> to get it through integrate.quad, but I don't remember which
>> distribution is a good case.
>> There is a testcase in the test suite, where I tried to roundtrip
>> close to the 0, 1 boundary before running into failures with some
>> distributions
>> https://github.com/scipy/scipy/blob/master/scipy/stats/tests/test_continuous_basic.py#L307
>>
>> to try out how well tyour solution works, the roundtrip could be done
>> with, for example, q= [1e-8, 1-1e-8] and see at which distribution it
>> breaks and why (if any)
>
> I might not have been clear. I think the patch is good, and a good
> improvement over the current situation with fixed xa, xb.
> I just think that we are not able to reach the q=0, q=1 boundaries,
> since for some distributions we will run into other numerical
> problems. And I'm curious how far we can get with this.
>
> I don't have much of an opinion about the factor, times 2 or larger.
> Similarly, I don't know whether the default xa and xb are good. I
> changed them for a few distributions, but only where I saw obvious
> improvements.
> (There is a similar expansion of the trial space in the discrete
> distributions where I also was just guessing how fast to go and when
> to stop.)

I thought that using some adaptive heuristic to expand the trial
interval might end up doing something similar to fsolve.
However, your solution is faster than using fsolve.

Just a quick timing in the adjusted script:
brentq is much faster than fsolve for invgauss, and only a little bit
slower for cauchy (fat tails)

Josef

>
> Josef
>
>>
>> Note: I removed the scale in your example, because internal _ppf works
>> on the standard distribution, loc=0, scale=1. loc and scale are added
>> generically in .ppf
>>
>> Thanks,
>>
>> Josef
>>
>>
>> from scipy import stats, optimize
>>
>> def findppf(dist, q, *args):
>>    # search  until cdf(left) < q < cdf(right)
>>
>>    left, right = dist.xa, dist.xb
>>
>>    counter = 0
>>    while dist.cdf(left, *args) > q:
>>        right = left
>>        left *= 2
>>        counter += 1
>>        print counter, left, right
>>
>>    while dist.cdf(right, *args) < q:
>>        left = right
>>        right *= 2
>>        counter += 1
>>        print counter, left, right
>>
>>    return optimize.brentq(lambda x: dist.cdf(x, *args) - q, left, right)
>>
>> print
>> print 'invgauss'
>> s = 7.24000019602
>> sol =  findppf(stats.invgauss, 0.8455, s)
>> print sol
>> sol = findppf(stats.invgauss, 1-1e-8, s)
>> print 'roundtrip', 1-1e-8, sol, stats.invgauss.cdf(sol, s)
>> print 1e-30, stats.invgauss.cdf(findppf(stats.invgauss, 1e-30, s), s)
>>
>> print '\nt'
>> print  findppf(stats.t, 1-1e-8, s), stats.t.ppf(1-1e-8, s)
>> print  findppf(stats.t, 1e-8, s), stats.t.ppf(1e-8, s)
>> print '\ncauchy'
>> print  findppf(stats.cauchy, 1e-8), stats.cauchy.ppf(1e-8)
>> print '\nf'
>> print findppf(stats.f, 1-1e-8, 2, 10), stats.f.ppf(1-1e-8, 2, 10)
>>
>>
>>
>>
>>> _______________________________________________
>>> SciPy-Dev mailing list
>>> SciPy-Dev at scipy.org
>>> http://mail.scipy.org/mailman/listinfo/scipy-dev
-------------- next part --------------
# -*- coding: utf-8 -*-
"""

Created on Sun Apr 22 19:39:34 2012

Author: Nicky van Foreest, Josef Perktold

"""

from scipy import stats, optimize

def findppf(dist, q, *args):
    # search  until cdf(left) < q < cdf(right)

    left, right = dist.xa, dist.xb

    #counter = 0
    while dist.cdf(left, *args) > q:
        right = left
        left *= 2
        #counter += 1
        #print counter, left, right

    while dist.cdf(right, *args) < q:
        left = right
        right *= 2
        #counter += 1
        #print counter, left, right

    return optimize.brentq(lambda x: dist.cdf(x, *args) - q, left, right)

def findppf2(dist, q, *args):
    # search  until cdf(left) < q < cdf(right)
    left, right = dist.xa, dist.xb

    if dist.cdf(left, *args) > q:
        start = left
    elif dist.cdf(right, *args) < q:
        start = right
    else:
        return optimize.brentq(lambda x: dist.cdf(x, *args) - q, left, right)
    return optimize.fsolve(lambda x: dist.cdf(x, *args) - q, start)

print
print 'invgauss'
s = 7.24000019602
sol =  findppf(stats.invgauss, 0.8455, s)
print sol
sol = findppf(stats.invgauss, 1-1e-8, s)
print 'roundtrip', 1-1e-8, sol, stats.invgauss.cdf(sol, s)
print 1e-30, stats.invgauss.cdf(findppf(stats.invgauss, 1e-30, s), s)

print '\nt'
print  findppf(stats.t, 1-1e-8, s), stats.t.ppf(1-1e-8, s)
print  findppf(stats.t, 1e-8, s), stats.t.ppf(1e-8, s)
print '\ncauchy'
print  findppf(stats.cauchy, 1e-8), stats.cauchy.ppf(1e-8)
print  findppf2(stats.cauchy, 1e-8), stats.cauchy.ppf(1e-8)
print '\nf'
print findppf(stats.f, 1-1e-8, 2, 10), stats.f.ppf(1-1e-8, 2, 10)

import time
n_repl = 200
s = 10
t0 = time.time()
for _ in xrange(n_repl):
    findppf(stats.invgauss, 1 - 1e-8, s)
    findppf(stats.cauchy, 1e-8)
t1 = time.time()
for _ in xrange(n_repl):
    findppf2(stats.invgauss, 1 - 1e-8, s)
    findppf2(stats.cauchy, 1e-8)
t2 = time.time()

print 'time', t1-t0, t2-t1