c++11: SIMD vs OMP in vector multiplication

mercredi 25 mai 2016

SIMD vs OMP in vector multiplication

In my project I have to do several vector multiplications, done on either double *a-vectors or float *a-vectors. In order to accelerate that, I wanted to use either SIMD-operations or omp. For getting the fastest result, I wrote a benchmark program:

#include <iostream>
#include <memory>
#include <vector>
#include <omp.h>
#include <immintrin.h>
#include <stdlib.h>
#include <chrono>


#define SIZE 32768
#define ROUNDS 1e5

void multiply_singular(float *a, float *b, float *d)
{
    for(int i = 0; i < SIZE; i++)
        d[i] = a[i]*b[i];
}

void multiply_omp(float *a, float *b, float *d)
{
#pragma omp parallel for
    for(int i = 0; i < SIZE; i++)
        d[i] = a[i]*b[i];
}

void multiply_avx(float *a, float *b, float *d)
{
    __m256 a_a, b_a, c_a;
    for(int i = 0; i < SIZE/8; i++)
    {
        a_a = _mm256_loadu_ps(a+8*i);
        b_a = _mm256_loadu_ps(b+8*i);
        c_a = _mm256_mul_ps(a_a, b_a);
        _mm256_storeu_ps (d+i*8, c_a);
    }
}

void multiply_avx_omp(float *a, float *b, float *d)
{
    __m256 a_a, b_a, c_a;
#pragma omp for
    for(int i = 0; i < SIZE/8; i++)
    {
        a_a = _mm256_loadu_ps(a+8*i);
        b_a = _mm256_loadu_ps(b+8*i);
        c_a = _mm256_mul_ps(a_a, b_a);
        _mm256_storeu_ps (d+i*8, c_a);
    }
}

void multiply_singular_double(double *a, double *b, double *d)
{
    for(int i = 0; i < SIZE; i++)
        d[i] = a[i]*b[i];
}

void multiply_omp_double(double *a, double *b, double *d)
{
#pragma omp parallel for
    for(int i = 0; i < SIZE; i++)
        d[i] = a[i]*b[i];
}

void multiply_avx_double(double *a, double *b, double *d)
{
    __m256d a_a, b_a, c_a;
    for(int i = 0; i < SIZE/4; i++)
    {
        a_a = _mm256_loadu_pd(a+4*i);
        b_a = _mm256_loadu_pd(b+4*i);
        c_a = _mm256_mul_pd(a_a, b_a);
        _mm256_storeu_pd (d+i*4, c_a);
    }
}

void multiply_avx_double_omp(double *a, double *b, double *d)
{
    __m256d a_a, b_a, c_a;
#pragma omp parallel for
    for(int i = 0; i < SIZE/4; i++)
    {
        a_a = _mm256_loadu_pd(a+4*i);
        b_a = _mm256_loadu_pd(b+4*i);
        c_a = _mm256_mul_pd(a_a, b_a);
        _mm256_storeu_pd (d+i*4, c_a);
    }
}


int main()
{
    float *a, *b, *c, *d, *e, *f;
    double *a_d, *b_d, *c_d, *d_d, *e_d, *f_d;
    a = new float[SIZE] {0};
    b = new float[SIZE] {0};
    c = new float[SIZE] {0};
    d = new float[SIZE] {0};
    e = new float[SIZE] {0};
    f = new float[SIZE] {0};
    a_d = new double[SIZE] {0};
    b_d = new double[SIZE] {0};
    c_d = new double[SIZE] {0};
    d_d = new double[SIZE] {0};
    e_d = new double[SIZE] {0};
    f_d = new double[SIZE] {0};
    for(int i = 0; i < SIZE; i++)
    {
        a[i] = i;
        b[i] = i;
        a_d[i] = i;
        b_d[i] = i;
    };
    std::cout << "Now doing the single float rounds!\n";
    std::chrono::high_resolution_clock::time_point t1 = std::chrono::high_resolution_clock::now();
    for(int i = 0; i < ROUNDS; i++)
    {
        multiply_singular(a, b, c);
    }
    std::chrono::high_resolution_clock::time_point t2 = std::chrono::high_resolution_clock::now();
    auto duration_ss = std::chrono::duration_cast<std::chrono::microseconds>(t2-t1).count();
    std::cout << "Now doing the omp float rounds!\n";
    t1 = std::chrono::high_resolution_clock::now();
    for(int i = 0; i < ROUNDS*10; i++)
    {
        multiply_omp(a, b, d);
    };
    t2 = std::chrono::high_resolution_clock::now();
    auto duration_so = std::chrono::duration_cast<std::chrono::microseconds>(t2-t1).count();
    std::cout << "Now doing the avx float rounds!\n";
    t1 = std::chrono::high_resolution_clock::now();
    for(int i = 0; i < ROUNDS*10; i++)
    {
        multiply_avx(a, b, e);
    };
    t2 = std::chrono::high_resolution_clock::now();
    auto duration_sa = std::chrono::duration_cast<std::chrono::microseconds>(t2-t1).count();
    std::cout << "Now doing the avx omp float rounds!\n";
    t1 = std::chrono::high_resolution_clock::now();
    for(int i = 0; i < ROUNDS*10; i++)
    {
        multiply_avx_omp(a, b, e);
    };
    t2 = std::chrono::high_resolution_clock::now();
    auto duration_sao = std::chrono::duration_cast<std::chrono::microseconds>(t2-t1).count();
    std::cout << "Now doing the single double rounds!\n";
    t1 = std::chrono::high_resolution_clock::now();
    for(int i = 0; i < ROUNDS; i++)
    {
        multiply_singular_double(a_d, b_d, c_d);
    };
    t2 = std::chrono::high_resolution_clock::now();
    auto duration_ds = std::chrono::duration_cast<std::chrono::microseconds>(t2-t1).count();
    std::cout << "Now doing the omp double rounds!\n";
    t1 = std::chrono::high_resolution_clock::now();
    for(int i = 0; i < ROUNDS*10; i++)
    {
        multiply_omp_double(a_d, b_d, d_d);
    };
    t2 = std::chrono::high_resolution_clock::now();
    auto duration_do = std::chrono::duration_cast<std::chrono::microseconds>(t2-t1).count();
    std::cout << "Now doing the avx double rounds!\n";
    t1 = std::chrono::high_resolution_clock::now();
    for(int i = 0; i < ROUNDS*10; i++)
    {
        multiply_avx_double(a_d, b_d, e_d);
    };
    t2 = std::chrono::high_resolution_clock::now();
    auto duration_da = std::chrono::duration_cast<std::chrono::microseconds>(t2-t1).count();
    std::cout << "Now doing the avx omp double rounds!\n";
    t1 = std::chrono::high_resolution_clock::now();
    for(int i = 0; i < ROUNDS*10; i++)
    {
        multiply_avx_double_omp(a_d, b_d, f_d);
    };
    t2 = std::chrono::high_resolution_clock::now();
    auto duration_dao = std::chrono::duration_cast<std::chrono::microseconds>(t2-t1).count();
    std::cout << "Finished\n";
    std::cout << "Elapsed time for functions:\n";
    std::cout << "Function\ttime[ms]\n";
    std::cout << "Singular float:\t" << duration_ss/ROUNDS << '\n';
    std::cout << "OMP float:\t" << duration_so/(ROUNDS*10) << '\n';
    std::cout << "AVX float avx:\t" << duration_sa/(ROUNDS*10) << '\n';
    std::cout << "OMP AVX float avx omp:\t" << duration_sao/(ROUNDS*10) << '\n';
    std::cout << "Singular double:\t" << duration_ds/ROUNDS << '\n';
    std::cout << "OMP double:\t" << duration_do/(ROUNDS*10) << '\n';
    std::cout << "AVX double:\t" << duration_da/(ROUNDS*10) << '\n';
    std::cout << "OMP AVX double:\t" << duration_dao/(ROUNDS*10) << '\n';
    delete[] a;
    delete[] b;
    delete[] c;
    delete[] d;
    delete[] e;
    delete[] f;
    delete[] a_d;
    delete[] b_d;
    delete[] c_d;
    delete[] d_d;
    delete[] e_d;
    delete[] f_d;
    return 0;
}

When compiling it with g++-5 -fopenmp -std=c++14 -march=native test_new.cpp -o test -lgomp, I get

Elapsed time for functions:
Function    time[ms]
Singular float: 117.979
OMP float:  40.5385
AVX float avx:  60.2964
OMP AVX float avx omp:  61.4206
Singular double:    129.59
OMP double: 200.745
AVX double: 136.715
OMP AVX double: 122.176

or in a second run

Elapsed time for functions:
Function    time[ms]
Singular float: 113.932
OMP float:  39.2581
AVX float avx:  58.3029
OMP AVX float avx omp:  60.0023
Singular double:    123.575
OMP double: 66.0327
AVX double: 124.293
OMP AVX double: 318.038

Here obviously the pure omp-function is faster than the other functions, even as the AVX function. When adding the -O3-switch to the compiling line, I get the following result:

Elapsed time for functions:
Function    time[ms]
Singular float: 12.7361
OMP float:  4.82436
AVX float avx:  14.7514
OMP AVX float avx omp:  14.7225
Singular double:    27.9976
OMP double: 8.50957
AVX double: 32.5175
OMP AVX double: 257.219

Here again omp is significantly faster than everything else, while AVX is slowest, even slower than the linear approach. Why is that? Is my AVX function implementation just crappy, or are there other problems?

Executed on Ubuntu 14.04.1, i7 Sandy Bridge, gcc version 5.3.0.

Edit: I found one mistake: I should move the declarations of the temporary variables in the avx-functions inside the for-loop, that gets me nearly to the omp-level (and delivers correct results).

Edit 2: When disabling the -O3-switch, the OMP-AVX-instructions are faster than the OMP-functions, with the switch they are nearly on par.

c++11

mercredi 25 mai 2016

SIMD vs OMP in vector multiplication

Aucun commentaire:

Enregistrer un commentaire