I am benchmarking matrix multiplication across different libraries because I am considering rewriting some Cython code in native C++. However, my simple tests seem to imply that NumPy is faster than BLAS or Eigen for plain matrix multiplication.
I have written the following files:
#!test_blas.cpp
#include <random>
#include <ctime>      // clock(), CLOCKS_PER_SEC
#include <iostream>
#include <cblas.h>

int main(int argc, char* argv[]) {
    // Random numbers
    std::mt19937_64 rnd;
    std::uniform_real_distribution<double> doubleDist(0, 1);

    // Create arrays that represent the matrices A, B, C
    const int n = 2000;
    double* A = new double[n * n];
    double* B = new double[n * n];
    double* C = new double[n * n];

    // Fill A and B with random numbers
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++) {
            A[i * n + j] = doubleDist(rnd);
            B[i * n + j] = doubleDist(rnd);
        }
    }

    // Calculate C = A * B
    clock_t start = clock();
    cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                n, n, n, 1.0, A, n, B, n, 0.0, C, n);
    clock_t end = clock();
    double time = double(end - start) / CLOCKS_PER_SEC;
    std::cout << "Time taken : " << time << std::endl;

    // Clean up
    delete[] A;
    delete[] B;
    delete[] C;
    return 0;
}
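One caveat about the timing itself: on Linux, clock() measures CPU time consumed by the whole process (all threads combined), whereas the Python script below measures wall-clock time with time.time(), so the two may not be directly comparable if the BLAS library is threaded. Below is a minimal wall-clock variant using std::chrono; the file name is my own and this is a sketch, not part of the original benchmark:

#!test_blas_wallclock.cpp
#include <chrono>
#include <iostream>
#include <cblas.h>

int main() {
    const int n = 2000;
    double* A = new double[n * n]();  // zero-initialised; fine for timing, same flop count
    double* B = new double[n * n]();
    double* C = new double[n * n];

    // steady_clock measures elapsed wall time, independent of thread count
    auto start = std::chrono::steady_clock::now();
    cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                n, n, n, 1.0, A, n, B, n, 0.0, C, n);
    auto end = std::chrono::steady_clock::now();

    std::chrono::duration<double> elapsed = end - start;
    std::cout << "Wall time : " << elapsed.count() << std::endl;

    delete[] A; delete[] B; delete[] C;
    return 0;
}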
#!test_eigen.cpp
#include <iostream>
#include <ctime>      // clock(), CLOCKS_PER_SEC
#include <Eigen/Dense>
using namespace Eigen;

int main()
{
    int n_a_rows = 2000;
    int n_a_cols = 2000;
    int n_b_rows = n_a_cols;
    int n_b_cols = 2000;

    MatrixXd a(n_a_rows, n_a_cols);
    for (int i = 0; i < n_a_rows; ++i)
        for (int j = 0; j < n_a_cols; ++j)
            a(i, j) = n_a_cols * i + j;

    MatrixXd b(n_b_rows, n_b_cols);
    for (int i = 0; i < n_b_rows; ++i)
        for (int j = 0; j < n_b_cols; ++j)
            b(i, j) = n_b_cols * i + j;

    MatrixXd d(n_a_rows, n_b_cols);
    clock_t begin = clock();
    d = a * b;
    clock_t end = clock();

    double elapsed_secs = double(end - begin) / CLOCKS_PER_SEC;
    std::cout << "Time taken : " << elapsed_secs << std::endl;
}
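For completeness: Eigen only parallelises its matrix product when built with OpenMP, and its thread count can be inspected at runtime. Here is a minimal sketch (the file name is mine; Eigen::nbThreads and Eigen::setNbThreads are part of Eigen's public API):

#!check_eigen_threads.cpp
#include <iostream>
#include <Eigen/Core>

int main() {
    // Reports 1 unless the program was compiled with OpenMP enabled
    std::cout << "Eigen threads: " << Eigen::nbThreads() << std::endl;

    // Optionally pin the thread count (only has an effect under OpenMP)
    Eigen::setNbThreads(4);
    std::cout << "Eigen threads: " << Eigen::nbThreads() << std::endl;
    return 0;
}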
#!test_numpy.py
import numpy as np
import time
N = 2000
a = np.random.rand(N, N)
b = np.random.rand(N, N)
start = time.time()
c = a.dot(b)
print(f"Time taken : {time.time() - start}")
Finally, I created the following test script:
#!test_matrix.sh
c++ -O2 -march=native -std=c++11 -I /usr/include/eigen3 test_eigen.cpp -o eigen
c++ -O2 -march=native -std=c++11 test_blas.cpp -o blas -lcblas
echo "testing BLAS"
./blas
echo "testing Eigen"
./eigen
echo "testing numpy"
python test_numpy.py
which yields the output
testing BLAS
Time taken : 1.63807
testing Eigen
Time taken : 0.795115
testing numpy
Time taken : 0.28397703170776367
Now my question is: how come numpy is the fastest in these tests? Am I missing something with regards to optimizations? One possibility is that numpy uses threading to compute the matrix product, but adding the compiler flag -fopenmp yields worse performance for Eigen and BLAS. (A sketch for checking the thread count follows below.)
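A quick way to rule threading in or out is to check what OpenMP actually sees at runtime. A minimal sketch using the standard OpenMP API (compile with -fopenmp; the file name is mine):

#!check_omp.cpp
#include <omp.h>
#include <iostream>

int main() {
    // Honors OMP_NUM_THREADS; defaults to the number of logical cores
    std::cout << "OpenMP max threads: " << omp_get_max_threads() << std::endl;
    return 0;
}

Running each benchmark as OMP_NUM_THREADS=1 ./blas (and likewise for the others) would also make a single-threaded comparison explicit.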
I am using g++ version 9.0.3-1. NumPy is version 1.18.1 on Python 3.8.2. Thanks in advance.