I am new to the field of SSE2 and AVX. I write the following code to test the performance of both SSE2 and AVX.
#include <cmath>
#include <iostream>
#include <chrono>
#include <emmintrin.h>
#include <immintrin.h>
void normal_res(float* __restrict__ a, float* __restrict__ b, float* __restrict__ c, unsigned long N) {
for (unsigned long n = 0; n < N; n++) {
c[n] = sqrt(a[n]) + sqrt(b[n]);
}
}
void normal(float* a, float* b, float* c, unsigned long N) {
for (unsigned long n = 0; n < N; n++) {
c[n] = sqrt(a[n]) + sqrt(b[n]);
}
}
void sse(float* a, float* b, float* c, unsigned long N) {
__m128* a_ptr = (__m128*)a;
__m128* b_ptr = (__m128*)b;
for (unsigned long n = 0; n < N; n+=4, a_ptr++, b_ptr++) {
__m128 asqrt = _mm_sqrt_ps(*a_ptr);
__m128 bsqrt = _mm_sqrt_ps(*b_ptr);
__m128 add_result = _mm_add_ps(asqrt, bsqrt);
_mm_store_ps(&c[n], add_result);
}
}
void avx(float* a, float* b, float* c, unsigned long N) {
__m256* a_ptr = (__m256*)a;
__m256* b_ptr = (__m256*)b;
for (unsigned long n = 0; n < N; n+=8, a_ptr++, b_ptr++) {
__m256 asqrt = _mm256_sqrt_ps(*a_ptr);
__m256 bsqrt = _mm256_sqrt_ps(*b_ptr);
__m256 add_result = _mm256_add_ps(asqrt, bsqrt);
_mm256_store_ps(&c[n], add_result);
}
}
int main(int argc, char** argv) {
unsigned long N = 1 << 30;
auto *a = static_cast<float*>(aligned_alloc(128, N*sizeof(float)));
auto *b = static_cast<float*>(aligned_alloc(128, N*sizeof(float)));
auto *c = static_cast<float*>(aligned_alloc(128, N*sizeof(float)));
std::chrono::time_point<std::chrono::system_clock> start, end;
for (unsigned long i = 0; i < N; ++i) {
a[i] = 3141592.65358;
b[i] = 1234567.65358;
}
start = std::chrono::system_clock::now();
for (int i = 0; i < 5; i++)
normal(a, b, c, N);
end = std::chrono::system_clock::now();
std::chrono::duration<double> elapsed_seconds = end - start;
std::cout << "normal elapsed time: " << elapsed_seconds.count() / 5 << std::endl;
start = std::chrono::system_clock::now();
for (int i = 0; i < 5; i++)
normal_res(a, b, c, N);
end = std::chrono::system_clock::now();
elapsed_seconds = end - start;
std::cout << "normal restrict elapsed time: " << elapsed_seconds.count() / 5 << std::endl;
start = std::chrono::system_clock::now();
for (int i = 0; i < 5; i++)
sse(a, b, c, N);
end = std::chrono::system_clock::now();
elapsed_seconds = end - start;
std::cout << "sse elapsed time: " << elapsed_seconds.count() / 5 << std::endl;
start = std::chrono::system_clock::now();
for (int i = 0; i < 5; i++)
avx(a, b, c, N);
end = std::chrono::system_clock::now();
elapsed_seconds = end - start;
std::cout << "avx elapsed time: " << elapsed_seconds.count() / 5 << std::endl;
return 0;
}
I compile my program by using g++ complier as the following.
g++ -msse -msse2 -mavx -mavx512f -O2
The results are as the following. It seems that there is no further improvement when I use more advanced 256 bits vectors.
normal elapsed time: 10.5311
normal restrict elapsed time: 8.00338
sse elapsed time: 0.995806
avx elapsed time: 0.973302
I have two questions.
- Why does not AVX give me further improvement? Is it because the memory bandwidth?
- According to my experiment, the SSE2 perform 10 times faster than the naive version. Why is that? I expect the SSE2 can only be 4 times faster based on its 128 bits vectors with respect to single precision floating points. Thanks a lot.
Aucun commentaire:
Enregistrer un commentaire