[Numpy-discussion] OpenBLAS on Mac

Sturla Molden sturla.molden at gmail.com
Sat Feb 22 17:39:11 EST 2014


On 22/02/14 22:15, Nathaniel Smith wrote:

>> $ make TARGET=SANDYBRIDGE USE_OPENMP=0 BINARY=64 NOFORTRAN=1
>
> You'll definitely want to disable the affinity support too, and
> probably memory warmup. And possibly increase the maximum thread
> count, unless you'll only use the library on the computer it was built
> on. And maybe other things. The OpenBLAS build process has so many
> ways to accidentally impale yourself, it's an object lesson in why
> building regulations are a good thing.

Thanks for the advice.

Right now I am just testing on my own computer.

cblas_dgemm is running roughly 50% faster with OpenBLAS than with MKL 11.1
update 2; sometimes OpenBLAS is even twice as fast as MKL.

WTF???

:-D

OK, next up is Accelerate. Let's see how it compares to OpenBLAS
and MKL on Mavericks.


Sturla


-------------- next part --------------
#include <mach/mach.h>
#include <mach/mach_time.h>
#include <stdlib.h>
#include "mkl.h"

/*
 * Convert the interval between two tick counts into nanoseconds.
 *
 * _t0: tick count at the start of the interval.
 * _t1: tick count at the end of the interval.
 *
 * The tick difference is scaled by the numer/denom ratio reported by
 * mach_timebase_info(). All arithmetic is carried out in long double and
 * only the final result is narrowed to double.
 */
double nanodiff(const uint64_t _t0, const uint64_t _t1)
{
    mach_timebase_info_data_t timebase;
    long double elapsed_ticks;
    long double elapsed_ns;

    mach_timebase_info(&timebase);

    /* Subtract in floating point so _t1 < _t0 yields a negative interval
       instead of wrapping around as unsigned arithmetic would. */
    elapsed_ticks = (long double)_t1 - (long double)_t0;

    /* Multiply before dividing, matching ticks * numer / denom. */
    elapsed_ns = elapsed_ticks * (long double)timebase.numer
                               / (long double)timebase.denom;

    return (double)elapsed_ns;
}

/*
 * Benchmark driver: times a single cblas_dgemm call computing
 * C = 1.0*A*B + 1.0*C on 512x512 row-major double matrices and prints the
 * elapsed time in nanoseconds.
 *
 * All three matrices are filled with finite values before the timed region.
 * dgemm reads A, B and (because beta is 1.0) C, so leaving the freshly
 * allocated buffers untouched would feed indeterminate data -- possibly
 * NaNs or denormals -- into the multiply and distort the measurement.
 *
 * Returns EXIT_SUCCESS, or EXIT_FAILURE if a matrix cannot be allocated.
 *
 * NOTE(review): printf needs <stdio.h>, which is not included directly
 * here -- confirm it arrives through another header or add the include.
 */
int main(int argc, char **argv)
{
    const int BOUNDARY = 64;   /* alignment argument passed to mkl_malloc */
    const int n = 512;
    const int m = n, k = n;
    const size_t count = (size_t)n * (size_t)n;
    double *A = (double*)mkl_malloc(count * sizeof(double), BOUNDARY);
    double *B = (double*)mkl_malloc(count * sizeof(double), BOUNDARY);
    double *C = (double*)mkl_malloc(count * sizeof(double), BOUNDARY);
    uint64_t t0, t1;
    double nanosec;
    size_t i;
    int status = EXIT_FAILURE;

    (void)argc;
    (void)argv;

    if (A == NULL || B == NULL || C == NULL) {
        printf("error: could not allocate %d x %d matrices\n", n, n);
        goto cleanup;
    }

    /* Deterministic, finite inputs; C starts at zero so the accumulated
       result stays well within double range. */
    for (i = 0; i < count; i++) {
        A[i] = (double)(i % 13) + 1.0;
        B[i] = (double)(i % 7) + 1.0;
        C[i] = 0.0;
    }

    t0 = mach_absolute_time();

    cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
           m, n, k, 1.0, A, k, B, n, 1.0, C, n);

    t1 = mach_absolute_time();

    nanosec = nanodiff(t0, t1);

    printf("elapsed time: %g ns\n", nanosec);

    status = EXIT_SUCCESS;

cleanup:
    /* Only release buffers that were actually obtained. */
    if (A != NULL) {
        mkl_free(A);
    }
    if (B != NULL) {
        mkl_free(B);
    }
    if (C != NULL) {
        mkl_free(C);
    }
    return status;
}


-------------- next part --------------
#include <mach/mach.h>
#include <mach/mach_time.h>
#include <stdlib.h>
#include <cblas.h>

/*
 * Return the time between two tick counts, expressed in nanoseconds.
 *
 * _t0: tick count taken first.
 * _t1: tick count taken second.
 *
 * The numer/denom pair obtained from mach_timebase_info() scales the tick
 * difference. The computation runs in long double; the result is narrowed
 * to double on return.
 */
double nanodiff(const uint64_t _t0, const uint64_t _t1)
{
    mach_timebase_info_data_t info;
    mach_timebase_info(&info);

    const long double start = (long double)_t0;
    const long double stop = (long double)_t1;
    const long double scale_num = (long double)info.numer;
    const long double scale_den = (long double)info.denom;

    /* Floating-point subtraction keeps the sign when stop < start;
       the multiplication is applied before the division. */
    const long double ns = (stop - start) * scale_num / scale_den;

    return (double)ns;
}

/*
 * Benchmark driver: times a single cblas_dgemm call computing
 * C = 1.0*A*B + 1.0*C on 512x512 row-major double matrices and prints the
 * elapsed time in nanoseconds.
 *
 * All three matrices are filled with finite values before the timed region.
 * dgemm reads A, B and (because beta is 1.0) C, so passing untouched malloc
 * memory would feed indeterminate data -- possibly NaNs or denormals --
 * into the multiply and distort the measurement.
 *
 * Returns EXIT_SUCCESS, or EXIT_FAILURE if a matrix cannot be allocated.
 *
 * NOTE(review): printf needs <stdio.h>, which is not included directly
 * here -- confirm it arrives through another header or add the include.
 */
int main(int argc, char **argv)
{
    const int n = 512;
    const int m = n, k = n;
    const size_t count = (size_t)n * (size_t)n;
    double *A = malloc(count * sizeof *A);
    double *B = malloc(count * sizeof *B);
    double *C = malloc(count * sizeof *C);
    uint64_t t0, t1;
    double nanosec;
    size_t i;
    int status = EXIT_FAILURE;

    (void)argc;
    (void)argv;

    if (A == NULL || B == NULL || C == NULL) {
        printf("error: could not allocate %d x %d matrices\n", n, n);
        goto cleanup;
    }

    /* Deterministic, finite inputs; C starts at zero so the accumulated
       result stays well within double range. */
    for (i = 0; i < count; i++) {
        A[i] = (double)(i % 13) + 1.0;
        B[i] = (double)(i % 7) + 1.0;
        C[i] = 0.0;
    }

    t0 = mach_absolute_time();

    cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
           m, n, k, 1.0, A, k, B, n, 1.0, C, n);

    t1 = mach_absolute_time();

    nanosec = nanodiff(t0, t1);

    printf("elapsed time: %g ns\n", nanosec);

    status = EXIT_SUCCESS;

cleanup:
    /* free(NULL) is a no-op, so no guards are needed on the failure path. */
    free(A);
    free(B);
    free(C);
    return status;
}




More information about the NumPy-Discussion mailing list