Sampling a population

Fri Jun 2 15:09:54 EDT 2006

Brian Quinlan wrote:
> This is less a Python question and more a optimization/probability
> question. Imaging that you have a list of objects and there frequency in
> a population e.g.
>
> lst = [(a, 0.01), (b, 0.05), (c, 0.50), (d, 0.30), (e, 0.04), (f, 0.10)]
>
> and you want to drawn n items from that list (duplicates allowed), with
> that probability distribution.
>
> The fastest algorithm that I have been able to devise for doing so is:
> O(n * log(len(lst))). Can anyone think or a solution with a better time
> complexity? If not, is there an obviously better way to do this
> (assuming n is big and the list size is small).
>

Any way I tried to slice and dice it, I could not get any faster.
draw2 and draw 3 generate code on the fly. draw4 sneakily tries to
trade memory and accuracy for speed but is even slower!

First the times, then the code:

$ ./timeit.py 'from probDistribution import draw as draw; draw(10000,
[0.01, 0.05, 0.50, 0.30, 0.04, 0.10])'
100 loops, best of 3: 13.4 msec per loop

$ ./timeit.py 'from probDistribution import draw2 as draw; draw(10000,
[0.01, 0.05, 0.50, 0.30, 0.04, 0.10])'
100 loops, best of 3: 15.2 msec per loop

$ ./timeit.py 'from probDistribution import draw3 as draw; draw(10000,
[0.01, 0.05, 0.50, 0.30, 0.04, 0.10])'
100 loops, best of 3: 16.2 msec per loop

$ ./timeit.py 'from probDistribution import draw4 as draw; draw(10000,
[0.01, 0.05, 0.50, 0.30, 0.04, 0.10])'
10 loops, best of 3: 30.5 msec per loop

=== CODE probDistribution.py ===
from random import random, randrange
from bisect import bisect

def draw(n, lst):
     ps = []
     last = 0
     for p in lst:
         ps.append(last + p)
         last += p

     # ps = [0.01, 0.06, 0.56, 0.86, 0.90, 1.00]

     chosen = [0] * len(lst) # track frequency
     for i in range(n):
         r = random()

         chosen[bisect(ps, r)] += 1 # binary search and increment

     result = [] # rescale probability based on frequency
     for c in chosen:
         result.append(float(c) / n)
     return result

def draw2(n, lst):
    """ uses dynamicc code generation of this form:
chosen = [0] * 6
for i in xrange(10000):
    r = random()
    if r < 0.01: chosen[0]+=1
    elif r < 0.06: chosen[1] +=1
    ...
    elif r < 0.90: chosen[4] +=1
    else chosen[5]+=1
    """
    assert len(lst)>1, "Corner case NOT covered"

    codestr = 'chosen = [0] * %i\n' % (len(lst),)
    codestr += 'for i in xrange(%i):\n  r = random()\n' % (n,)

    last = 0.0
    lstmax = len(lst)-1
    for i,p in enumerate(lst):
        last += p
        if i==0:        codestr += '  if r < %g: chosen[%i] +=1\n' %
(last, i)
        elif i==lstmax: codestr += '  else: chosen[%i] +=1\n' % (i,)
        else:           codestr += '  elif r < %g: chosen[%i] +=1\n' %
(last, i)

    exec codestr

    result = [] # rescale probability based on frequency
    for c in chosen:
        result.append(float(c) / n)
    return result

def draw3(n, lst):
    """ uses dynamicc code generation of this form:
chosen = [0] * 6
for i in xrange(10000):
    r = random()
    chosen[-1+ (
      ((r<0.01) and 1)
      or ((r<0.06) and 2)
      ...
      or ((r<0.90) and 5)
      or 6
      )] +=1

    """
    assert len(lst)>1, "Corner case NOT covered"

    codestr = 'chosen = [0] * %i\n' % (len(lst),)
    codestr += 'for i in xrange(%i):\n  r = random()\n' % (n,)
    codestr += '  chosen[-1+ (\n'

    last = 0.0
    lstmax = len(lst)-1
    for i,p in enumerate(lst):
        last += p
        if i==0:        codestr += '    ((r<%g) and 1)\n' % (last)
        elif i==lstmax: codestr += '    or %i\n    )] +=1\n' % (i+1,)
        else:           codestr += '    or ((r<%g) and %i)\n' % (last,
i+1)
    #return codestr
    exec codestr

    result = [] # rescale probability based on frequency
    for c in chosen:
        result.append(float(c) / n)
    return result

def draw4(n, lst, precision = 0.01, maxbins=10000):
    """ Memory/speed tradeoff by coarse quantizing of frequency values.

    """
    assert len(lst)>1, "Corner case NOT covered"
    assert 0.0 < precision < 1.0 and (1.0/precision) < maxbins

    binmax = int(1.0/precision)
    chosenbin = [0] * binmax
    for i in xrange(n):
        chosenbin[randrange(binmax)] +=1

    left, right = 0, 0  # extract bin range for summation
    chosen = [0] * len(lst)
    last = 0.0
    for i,p in enumerate(lst):
        last += p
        right = int(last/precision)
        chosen[i] = sum( chosenbin[left:right] )
        left = right

    result = [] # rescale probability based on frequency
    for c in chosen:
        result.append(float(c) / n)
    return result

=== END CODE ===