Monte Carlo Method and pi

Mon Jul 12 23:26:32 EDT 2004

Tim Hochberg <tim.hochberg at ieee.org> writes:

> I can more than double the speed of this under psyco by replacing **2
> with x*x. I have inside information that pow is extra slow on psyco(*)

Thanks for pointing this out.  This change sped up all of the
methods I've tried.  It's too bad that Python does optimize this,
but I case pow could be redefined at runtime.

> (*) Psyco's support for floating point operations is considerable
> slower than its support for integer operations. The latter are
> optimized all the way to assembly when possible, but the latter are,
> at best, optimized into C function calls that perform the
> operation.

That's unfortunate.

> Thus there is always some function call overhead. Fixing
> this would require someone who groks both the guts of Psyco and the
> intracies of x86 floating point machine code to step forward. 

If someone volunteers, that would be great.  I don't know anything
about x86 floating point;  is it a real mess?

> If one takes into the accout the speed difference of the two CPUs this
> puts the both the numarray and psyco solutions within about 50% of the
> Fortran solution, which I'm impressed by. 

I've now run a bunch of timings of all the methods that have been
proposed (except that I used C instead of Fortran) on one machine,
a 2.66GHz P4.  Here are the results, in seconds:

                    Python 2.3.4     Python 2.4a1

        naive loop: 0.657765         0.589741
  naive loop psyco: 0.159085
  naive loop weave: 0.084898  *
          numarray: 0.115775  **
         imap lots: 1.359815         0.994505
           imap fn: 0.979589         0.758308
        custom gen: 0.841540         0.681277
          gen expr:                  0.694567

      naive loop C: 0.040     *

Only one of the tests used psyco.  I did run the others with psyco
too, but the times were about the same or slower.

For me, psyco was about four times slower than C, and numarray was
almost 3 times slower.  I'm surprised by how close psyco and numarray
were in your runs, and how slow Fortran was in beliavsky's test.
My C program measures user+sys cpu time for the loop itself, but
not start-up time for the program.  In any case, getting within
a factor of four of C, with a random number generator that is probably
better, is pretty good!

One really should compare the random number generators more carefully,
since they could take a significant fraction of the time.  The lines
with one * use C's rand().  The line with ** uses numarray's random
array function.  Does anyone know what random number generator is used
by numarray?

By the way, note how well Python 2.4 performs compared with 2.3.4.
(Psyco, weave, numarray not shown because I don't have them for 2.4.)

I'm still curious about whether it could be possible to get 
really fast loops in Python using iterators and expressions like 
sum(1 + it1 - 2 * it2), where it1 and it2 are iterators that produce 
numbers.  Could Python be clever enough to implement that as a 
for loop in C with just two or three C function calls in the loop?

Dan

Here's the code I used:

-----pi.py-----
from random import random
from itertools import *
from math import sqrt
from operator import pow, sub, add

from timeTest import *

try:
    import weave
    haveWeave = True
except:
    haveWeave = False

try:
    import numarray
    import numarray.random_array
    haveNumarray = True
except:
    haveNumarray = False

def v1(n = 500000):
    "naive loop"
    rand = random
    sqr  = sqrt
    sm = 0
    for i in range(n):
        r = rand()
        sm += sqr(1-r*r)
    return 4*sm/n

def v1weave(n = 500000):
    "naive loop weave"

    support = "#include <stdlib.h>"

    code = """
    double sm;
    float rnd;
    srand(1); // seed random number generator
    sm = 0.0;
    for(int i=0;i<n;++i) {
        rnd = rand()/(RAND_MAX+1.0);
        sm += sqrt(1.0-rnd*rnd);
    }
    return_val =  4.0*sm/n;"""
    return weave.inline(code,('n'),support_code=support)

if(haveNumarray):
    def v2(n = 500000):
        "numarray"
        a = numarray.random_array.uniform(0, 1, n)
        return 4 * numarray.sum(numarray.sqrt(1 - a*a)) / n

def v3(n = 500000):
    "imap lots"
    rand = random
    pw   = pow
    sqr  = sqrt
    sq   = lambda x: x*x
    sm  = sum(imap(sqr,
                   imap(sub, repeat(1),
                        imap(sq,
                             imap(lambda dummy: rand(), repeat(None, n))))))
    return 4*sm/n

def v4(n = 500000):
    "imap fn"
    rand = random
    pw   = pow
    sqr  = sqrt

    def fn(dummy):
        r = rand()
        return sqr(1-r*r)

    sm  = sum(imap(fn, repeat(None, n)))
    return 4*sm/n

def custom(n):
    rand = random
    for i in xrange(n):
        yield(sqrt(1-rand()**2))

def v5(n = 500000):
    "custom gen"
    sm = sum(custom(n))
    return 4*sm/n

# The commented out psyco runs don't show significant improvement
# (or are slower).

if haveWeave:
    v1weave()

timeTest(v1)
timeTestWithPsyco(v1)
if haveWeave:
    timeTest(v1weave)
if haveNumarray:
    timeTest(v2)
    # timeTestWithPsyco(v2)
timeTest(v3)
#timeTestWithPsyco(v3)
timeTest(v4)
#timeTestWithPsyco(v4)
timeTest(v5)
#psyco.bind(custom)
#timeTestWithPsyco(v5)

-----pi2.py-----   (only for Python2.4)
from random import random
from math import sqrt

from timeTest import *

# Even if this is not executed, it is parsed, so Python 2.3 complains.

def v6(n = 500000):
    "gen expr"
    rand = random
    sqr  = sqrt
    sm = sum(sqr(1-rand()**2) for i in xrange(n))
    return 4*sm/n

timeTest(v6)

-----timeTest.py-----
import time

try:
    import psyco
    havePsyco = True
except:
    havePsyco = False

def timeTest(f):
    s = time.time()
    f()
    print "%18s: %f" % (f.__doc__, time.time()-s)

def timeTestWithPsyco(f):
    if havePsyco:
        g = psyco.proxy(f)
        g.__doc__ = f.__doc__ + " psyco"
        g()
        timeTest(g)

-----pi.c-----
#include <stdlib.h>

#include "/home/spin/C/spin_time.h"

int main()
{
  double sm;
  float rnd;
  int i;
  int n = 500000;
  spin_time start, end;

  spin_time_set(start);
  srand(1); // seed random number generator
  sm = 0.0;

  for(i=0;i<n;++i) {
    rnd = rand()/(RAND_MAX+1.0);
    sm += sqrt(1.0-rnd*rnd);
  }
  spin_time_set(end);
  printf("%f\n", 4.0*sm/n);
  printf("%18s: %.3f\n", "naive loop C", spin_time_both(start, end));
}

-----spin_time.h-----
#include <time.h>
#include <sys/times.h>
#include <stdio.h>

#define FACTOR (1.0/100.0)

typedef struct { 
  clock_t real; 
  struct tms t;
} spin_time;

#define spin_time_set(st) st.real = times(&(st.t))

// floating point number of seconds:
#define spin_time_both(st_start, st_end) \
	((st_end.t.tms_utime-st_start.t.tms_utime)*FACTOR + \
	 (st_end.t.tms_stime-st_start.t.tms_stime)*FACTOR)