[SciPy-User] expectation function for stats.distributions

Mon Aug 3 16:50:40 EDT 2009

On Sat, Aug 1, 2009 at 5:39 PM, <josef.pktd at gmail.com> wrote:
> On Sat, Aug 1, 2009 at 4:44 PM, Pierre GM<pgmdevlist at gmail.com> wrote:
>>
>> On Jul 31, 2009, at 4:10 PM, josef.pktd at gmail.com wrote:
>>
>>> On Fri, Jul 31, 2009 at 3:10 PM, nicky van foreest<vanforeest at gmail.com
>>> > wrote:
>>>> Hi Josef,
>>>>
>>>> I would like such a function.
>>>
>>> Thanks for the interest.
>>
>> Count me in as well
>>
>>> To the name: I'm up for ideas
>>
>> rv_generic.expect ?
>
> I think that's my favorite so far.
> It will be attached to continuous and discrete separately (not to
> generic) because of the difference in implementation.
> I added loc and scale to the expectation function.
> but for discrete distributions following the pattern of _drv2_moment
> for discrete might not be the best implementation.
>

Here's an updated version, one function for continuous, one for
discrete distributions
includes loc and scale,
expect_discrete is tested with all discrete distribution in stats for
mean and variance and is very accurate except for zipf (only 1e-5)
can be used as function or attached to the distributions as method.

Josef
-------------- next part --------------
'''
copy from webpage, where is the original?

Author jpktd

'''

import numpy as np
from scipy import stats, integrate

def expect(self, fn=None, args=(), loc=0, scale=1, lb=None, ub=None, conditional=False):
    '''calculate expected value of a function with respect to the distribution

    location and scale only tested on a few examples

    Parameters
    ----------
        all parameters are keyword parameters
        fn : function (default: identity mapping)
           Function for which integral is calculated. Takes only one argument.
        args : tuple
           argument (parameters) of the distribution
        lb, ub : numbers
           lower and upper bound for integration, default is set to the support
           of the distribution
        conditional : boolean (False)
           If true then the integral is corrected by the conditional probability
           of the integration interval. The return value is the expectation
           of the function, conditional on being in the given interval.

    Returns
    -------
        expected value : float
    '''
    if fn is None:
        def fun(x, *args):
            return x*self.pdf(x, loc=loc, scale=scale,*args)
    else:
        def fun(x, *args):
            return fn(x)*self.pdf(x, loc=loc, scale=scale, *args)
    if lb is None:
        lb = (self.a - loc)/(1.0*scale)
    if ub is None:
        ub = (self.b - loc)/(1.0*scale)
    if conditional:
        invfac = self.sf(lb,*args) - self.sf(ub,*args)
    else:
        invfac = 1.0
    return integrate.quad(fun, lb, ub,
                                args=args)[0]/invfac

print expect(stats.norm, lambda(x): (x)**4)
print expect(stats.norm, lambda(x): min((x)**2,1.5**2))

#Jonathans version
from scipy.stats import norm as Gaussian
c = 1.5 # our "cutoff" point
c = 0.5 # try another value
tmp = 2 * Gaussian.cdf(c) - 1
gamma = tmp + c**2 * (1 - tmp) - 2 * c * Gaussian.pdf(c)
print gamma

print expect(stats.norm, lambda(x): min((x)**2,c**2))

print 'check loc and scale - continuous'

m = expect(stats.norm, lambda(x): (x), loc=2, scale=2)
mnc2 = expect(stats.norm, lambda(x): (x)**2, loc=2, scale=2)
v = mnc2 - m**2
print m, v
print stats.norm.stats(loc=2, scale=2)
print expect(stats.norm, lambda(x): (x)**2, scale=2)
print expect(stats.norm, lambda(x): (x), loc=2)

### for discrete distributions

#based on _drv2_moment(self, n, *args), but streamlined
def expect_discrete(self, fn=None, args=(), loc=0, lb=None, ub=None, conditional=False):
    '''calculate expected value of a function with respect to the distribution
    for discrete distribution

    Parameters
    ----------
        (self : distribution instance as defined in scipy stats)
        fn : function (default: identity mapping)
           Function for which integral is calculated. Takes only one argument.
        args : tuple
           argument (parameters) of the distribution
        optional keyword parameters
        lb, ub : numbers
           lower and upper bound for integration, default is set to the support
           of the distribution, lb and ub are inclusive (ul<=k<=ub)
        conditional : boolean (False)
           If true then the expectation is corrected by the conditional
           probability of the integration interval. The return value is the
           expectation of the function, conditional on being in the given
           interval (k such that ul<=k<=ub).

    Returns
    -------
        expected value : float

    Notes
    -----
    * function is not vectorized
    * accuracy: uses self.moment_tol as stop criteria
        for heavy tailed distribution e.g. zipf(4), accuracy for
        mean, var example is only 1e-5,
        increasing precision (moment_tol) makes zipf very slow
    * suppnmin=100 internal parameter for minimum number of points to evaluate
        could be added as keyword parameter, to evaluate functions with
        non-monotonic shapes, points include integers in (-suppnmin, suppnmin)
    * uses maxcount=1000 limits the number of points that are evaluated
        to break loop for infinite sums
        (maximum of suppnmin+1000 positive plus suppnmin+1000 negative integers
        are evaluated)

    '''

    #moment_tol = 1e-12 # increase compared to self.moment_tol,
    # too slow for only small gain in precision for zipf

    #avoid endless loop with unbound integral, eg. var of zipf(2)
    maxcount = 1000
    suppnmin = 100  #minimum number of points to evaluate (+ and -)

    if fn is None:
        def fun(x):
            #loc and args from outer scope
            return (x+loc)*self._pmf(x, *args)
    else:
        def fun(x):
            #loc and args from outer scope
            return fn(x+loc)*self._pmf(x, *args)
    # used pmf because _pmf does not check support in randint
    # and there might be problems(?) with correct self.a, self.b at this stage
    # maybe not anymore, seems to work now with _pmf

    self._argcheck(*args) # (re)generate scalar self.a and self.b
    if lb is None:
        lb = (self.a)
    if ub is None:
        ub = (self.b)
    if conditional:
        invfac = self.sf(lb,*args) - self.sf(ub+1,*args)
    else:
        invfac = 1.0

    tot = 0.0
    low, upp = self._ppf(0.001, *args), self._ppf(0.999, *args)
    low = max(min(-suppnmin, low), lb)
    upp = min(max(suppnmin, upp), ub)
    supp = np.arange(low, upp+1, self.inc) #check limits
    #print 'low, upp', low, upp
    tot = np.sum(fun(supp))
    diff = 1e100
    pos = upp + self.inc
    count = 0

    #handle cases with infinite support

    while (pos <= ub) and (diff > self.moment_tol) and count <= maxcount: 
        diff = fun(pos)
        tot += diff
        pos += self.inc
        count += 1

    if self.a < 0: #handle case when self.a = -inf
        diff = 1e100
        pos = low - self.inc
        while (pos >= lb) and (diff > self.moment_tol) and count <= maxcount: 
            diff = fun(pos)
            tot += diff
            pos -= self.inc
            count += 1
    if count > maxcount:
        # replace with proper warning
        print 'sum did not converge'
    return tot/invfac

distdiscrete = [
    ['bernoulli',(0.3,)],
    ['binom',    (5, 0.4)],
    ['boltzmann',(1.4, 19)],
    ['dlaplace', (0.8,)], #0.5
    ['geom',     (0.5,)],
    ['hypergeom',(30, 12, 6)],
    ['hypergeom',(21,3,12)],  #numpy.random (3,18,12) numpy ticket:921
    ['hypergeom',(21,18,11)],  #numpy.random (18,3,11) numpy ticket:921
    ['logser',   (0.6,)],  # reenabled, numpy ticket:921
    ['nbinom',   (5, 0.5)],
    ['nbinom',   (0.4, 0.4)], #from tickets: 583
    ['planck',   (0.51,)],   #4.1
    ['poisson',  (0.6,)],
    ['randint',  (7, 31)],
    ['zipf',     (4,)],
    ['zipf',     (8,)],
    ['zipf',     (3,)]]

distdiscrete_ = [
    ['randint',  (7, 31)]
    ]

for distname, args in distdiscrete:
    distfn = getattr(stats, distname)
    print '\n', distname, args
    m0 = expect_discrete(distfn, lambda(x): 1, args)
    m = expect_discrete(distfn, lambda(x): x, args)
    mnc2 = expect_discrete(distfn, lambda(x): x**2, args)
    v = mnc2 - m*m
    md, vd = distfn.stats(*args)

    print m0, 'm0', m0-1
    print m, v, 'm, v'
    print md[()], vd[()], 'md, vd'
    print m - md, v - vd, 'm - md, v - vd'
    if (np.abs(m-md) > distfn.moment_tol) or (np.abs(v-vd) > distfn.moment_tol):
        print '************ diff too large'
    loc = 1
    ml = expect_discrete(distfn, lambda(x): x, args, loc=loc)
    mnc2l = expect_discrete(distfn, lambda(x): x**2, args, loc=loc)
    vl = mnc2l - ml*ml
    print ml, vl, 'ml, vl'
    if (np.abs(ml-md-loc) > distfn.moment_tol) or (np.abs(vl-vd) > distfn.moment_tol):
        print '************ diff too large'

print 'check scale, limits and conditional'
print expect_discrete(stats.randint, lambda(x): x, (0,10)),
print expect_discrete(stats.randint, lambda(x): x-1, (1,11)),
print np.dot(np.arange(10),stats.randint.pmf(np.arange(10),0,10))

print expect_discrete(stats.randint, lambda(x): x, (0,5)),
print expect_discrete(stats.randint, lambda(x): x, (0,10), ub=4,
                      conditional=True)
print expect_discrete(stats.randint, lambda(x): x, (0,10), ub=4,
                      conditional=False),
print np.dot(np.arange(5),stats.randint.pmf(np.arange(5),0,10))
print expect_discrete(stats.randint, lambda(x): x, (0,10), lb=2, ub=6,
                      conditional=True),
print expect_discrete(stats.randint, lambda(x): x, (2,7))
#Note: ub is inclusive (ul<=k<=ub), max in randint is exclusive (min<=k<max)