Real-world Python code 700 times slower than C

Sun Jan 6 03:02:00 EST 2002

Hey Brent,

> I like the approach that the Perl Inline module takes where you can
> put C code directly inline with your Perl code and the Inline module
> compiles and caches the C code automatically.  However the fact that
> it's C (with all of its safety and portability problems) and the fact
> that it relies on a C compiler to be properly installed and accessible
> make this approach unappealing for general use.

weave is a new package from the SciPy project that does this sort of thing.
It isn't ready for release quite yet, but is pretty close.  Comparing
weave.inline() using Numeric arrays instead of lists to timings for straight
C, I get:

    N_array = 10000 loops = 10000
    compiled numeric2 (seconds): 0.912000060081
    c version (seconds): 0.480000

So you're 1.9 times slower using Python+inline() for this case.  That's not so
bad for this problem.    And, for larger arrays, the discrepancy gets even
smaller:

    N_array = 100000 loops = 1000
    compiled numeric2 (seconds): 3.97500002384
    c version (seconds): 3.795000

Your comment about portability is an issue. To solve this, weave also
allows you to build extension modules for distribution instead of compiling
on the fly.  Unfortunately, I found that using this approach was about 3
times slower on this problem than calling inline(). : |  Don't quite get
this -- and that's why it ain't released yet.  If you're interested in taking
a peek though (and testing!!!), as Fernando mentioned, the web site is here:

    http://www.scipy.org/site_content/weave

Hopefully I'll get this thing out the door sometime next week.

see ya,
eric

ps:
I also changed the Ramp() C code a tad bit to get a little more performance
(both in the inline() and in the C version so the comparison is fair).

------------------------------

import time
import weave
from Numeric import *

def Ramp(result, size, start, end):
    """Fill result[0:size] in place with a linear ramp from start to end.

    Element i receives start + step*i, where
    step = (end - start) / (size - 1).  Nothing is returned.

    A size of 1 stores just start rather than raising ZeroDivisionError,
    and a size below 1 leaves result untouched.
    """
    if size < 1:
        return
    if size == 1:
        # A single-point ramp has no step to compute; avoid dividing by 0.
        result[0] = start
        return
    # float() keeps the step from being truncated by Python 2 integer
    # division when start and end are both given as ints.
    step = float(end - start) / (size - 1)
    for i in xrange(size):
        result[i] = start + step*i

def Ramp_numeric1(result,size,start,end):
    """Fill result with a linear ramp using C code run through weave.inline.

    Each element i is computed directly as start + step*i.  The names
    listed in the second argument to weave.inline are the Python
    variables handed to the C snippet.

    NOTE(review): result_data is presumably the raw double* that weave
    exposes for a Float64 Numeric array (main builds the array that way)
    -- confirm against the weave docs.

    For size == 1 the step is forced to 0.0: dividing by (size-1) == 0
    would give an infinite step, and inf*0 would store NaN instead of
    start.
    """
    code = """
           double step = (size > 1) ? (end-start)/(size-1) : 0.0;
           for (int i = 0; i < size; i++)
               *result_data++ = start + step*i;
           """
    weave.inline(code,['result','size','start','end'],compiler='gcc')

def Ramp_numeric2(result,size,start,end):
    """Fill result with a linear ramp using C code run through weave.inline.

    This variant keeps a running value and adds step to it on every
    iteration, storing through result_data[i], rather than recomputing
    start + step*i for each element.

    NOTE(review): result_data is presumably the raw double* that weave
    exposes for a Float64 Numeric array -- confirm against the weave docs.
    """
    # Python-side names that the C snippet below refers to.
    inline_args = ['result', 'size', 'start', 'end']
    c_source = """
           double step = (end-start)/(size-1);
           double val = start;
           for (int i = 0; i < size; i++)
           {
              result_data[i] = val;
              val += step;
           }
           """
    weave.inline(c_source, inline_args, compiler='gcc')

def main():
    N_py = 200
    N_c = 10000
    ratio = float(N_c) / N_py

    arr = [0]*10000
    t1 = time.time()
    for i in xrange(N_py):
        Ramp(arr, 10000, 0.0, 1.0)
    t2 = time.time()
    py_time = (t2 - t1) * ratio
    print 'python (seconds*ratio):', py_time
    print 'arr[500]:', arr[500]

    arr = array([0]*10000,Float64)
    # First call compiles function or loads from cache.
    # I'm not including this in the timing.
    Ramp_numeric1(arr, 10000, 0.0, 1.0)
    t1 = time.time()
    for i in xrange(N_c):
        Ramp_numeric1(arr, 10000, 0.0, 1.0)
    t2 = time.time()
    c_time = (t2 - t1)
    print 'compiled numeric1 (seconds, speed up):', c_time, py_time/ c_time
    print 'arr[500]:', arr[500]

    arr2 = array([0]*10000,Float64)
    # First call compiles function or loads from cache.
    # I'm not including this in the timing.
    Ramp_numeric2(arr, 10000, 0.0, 1.0)
    t1 = time.time()
    for i in xrange(N_c):
        Ramp_numeric2(arr, 10000, 0.0, 1.0)
    t2 = time.time()
    c_time = (t2 - t1)
    print 'compiled numeric2 (seconds, speed up):', c_time, py_time/ c_time
    print 'arr[500]:', arr[500]

# Run the benchmark only when this file is executed as a script.
if __name__ == '__main__':
    main()
--------------------------------
C:\home\ej\wrk\weave\examples>python ramp.py
python (seconds*ratio): 132.20000267
arr[500]: 0.0500050005001
compiled numeric1 (seconds, speed up): 1.44200003147 91.6782245389
arr[500]: 0.0500050005001
compiled numeric2 (seconds, speed up): 0.912000060081 144.956133729
arr[500]: 0.0500050005001

--------------------------------
And the C version:

#include <time.h>
#include <stdio.h>

/*
 * Fill result[0] .. result[size-1] with a linear ramp.
 *
 * The first element is start; every following element is the previous
 * one plus (end - start) / (size - 1).  Nothing is written when size is
 * less than 1.
 */
void Ramp(double* result, int size, double start, double end)
{
    const double increment = (end - start) / (size - 1);
    double current = start;
    int k = 0;

    while (k < size)
    {
        result[k] = current;
        current += increment;
        ++k;
    }
}

/*
 * Benchmark driver: calls Ramp 10000 times on a 10000-element array and
 * prints the CPU time measured with clock() together with array[500].
 *
 * Declared as int main(void) and returns 0, as standard C requires;
 * void main() is not a portable signature.
 */
int main(void)
{
    double array[10000];
    int i;
    clock_t t1, t2;
    float seconds;
    t1 = clock();
    for (i = 0; i < 10000; i++)
        Ramp(array, 10000, 0.0, 1.0);
    t2 = clock();
    /* clock() counts ticks; divide by CLOCKS_PER_SEC to get seconds. */
    seconds = (float)(t2-t1)/CLOCKS_PER_SEC;
    printf("c version (seconds): %f\n", seconds);
    printf("array[500]: %f\n", array[500]);
    return 0;
}
--------------------------------
C:\home\ej\wrk\weave\examples>ramp.exe
c version (seconds): 0.480000
array[500]: 0.050005