jeudi 26 novembre 2015

Speed up type conversion using intrinsics

I am working on an application that needs to convert data to float. The data can be unsigned char or unsigned short.

I am using both AVX2 and other SIMD intrinsics in this code. I wrote the conversion like this:

unsigned char -> float :

// AVX2 path: widen 32 unsigned 8-bit values read from src+j into four vectors
// of eight 32-bit integers, then convert masked copies of each to float.
// src, j, v16_avx, v32_avx, m_lt_avx and m_ge_avx are declared outside this snippet.
#ifdef __AVX2__

    // Unaligned 256-bit (32-byte) load.
    __m256i tmp_v =_mm256_lddqu_si256(reinterpret_cast<const __m256i*>(src+j));


     // Zero-extend each 128-bit half (16 x 8-bit) to 16 x 16-bit.
     v16_avx[0] = _mm256_cvtepu8_epi16(_mm256_extractf128_si256(tmp_v,0x0));
     v16_avx[1] = _mm256_cvtepu8_epi16(_mm256_extractf128_si256(tmp_v,0x1));

     // Widen each 128-bit half (8 x 16-bit) to 8 x 32-bit. _mm256_cvtepi16_epi32
     // sign-extends, but the inputs were zero-extended bytes (0..255), so the
     // sign bit is never set.
     v32_avx[0] = _mm256_cvtepi16_epi32(_mm256_extractf128_si256(v16_avx[0],0x0));
     v32_avx[1] = _mm256_cvtepi16_epi32(_mm256_extractf128_si256(v16_avx[0],0x1));
     v32_avx[2] = _mm256_cvtepi16_epi32(_mm256_extractf128_si256(v16_avx[1],0x0));
     v32_avx[3] = _mm256_cvtepi16_epi32(_mm256_extractf128_si256(v16_avx[1],0x1));




    for(int l=0;l<4;l++)
    {

                        // AND the integers with a per-vector mask, then convert to float.
                        // NOTE(review): presumably m_lt_avx / m_ge_avx are "less-than" /
                        // "greater-or-equal" comparison masks - confirm against their definition.
                        __m256 vc1_ps = _mm256_cvtepi32_ps(_mm256_and_si256(v32_avx[l],m_lt_avx[l]));
                        __m256 vc2_ps = _mm256_cvtepi32_ps(_mm256_and_si256(v32_avx[l],m_ge_avx[l]));

/*
      .
      .
      .
      some processing there.
*/
    }

#endif

// SSE path: widen 16 unsigned 8-bit values read from src+j into four vectors
// of four 32-bit integers, then convert masked copies of each to float.
// src, j, v16, v32, mask8, m_lt and m_ge are declared outside this snippet.
// NOTE(review): compilers targeting AVX2 normally define __SSE2__ as well, so
// this section would be compiled in addition to the AVX2 section rather than
// instead of it - confirm whether an #elif chain was intended.
#ifdef __SSE2__

// Unaligned 128-bit (16-byte) load; the SSE3 lddqu form is used when available.
#ifdef __SSE3__
 __m128i tmp_v = _mm_lddqu_si128(reinterpret_cast<const __m128i*>(src+j));
#else
 __m128i tmp_v = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src+j));
#endif


#ifdef __SSE4_1__

   // Zero-extend the low 8 bytes to 8 x 16-bit.
   v16[0] = _mm_cvtepu8_epi16(tmp_v);
   // NOTE(review): mask8 is defined elsewhere; presumably it moves the upper
   // 8 bytes into the lower half so they can be widened next - confirm.
   tmp_v = _mm_shuffle_epi8(tmp_v,mask8);
   v16[1] = _mm_cvtepu8_epi16(tmp_v);

   // Widen the low four 16-bit lanes to 32-bit, swap the two 64-bit halves
   // (shuffle control 0x4E), then widen the former high four lanes.
   v32[0] = _mm_cvtepi16_epi32(v16[0]);
   v16[0] = _mm_shuffle_epi32(v16[0],0x4E);
   v32[1] = _mm_cvtepi16_epi32(v16[0]);

   v32[2] = _mm_cvtepi16_epi32(v16[1]);
   v16[1] = _mm_shuffle_epi32(v16[1],0x4E);
   v32[3] = _mm_cvtepi16_epi32(v16[1]);

#else

   // Byte-shifting by 8 leaves zeros in the low half (tmp_v_l) or the high
   // half (tmp_v_r); those zero halves are what the unpacks below interleave with.
   __m128i tmp_v_l = _mm_slli_si128(tmp_v,8);
   __m128i tmp_v_r = _mm_srli_si128(tmp_v,8);


   // Interleaving data bytes with zero bytes zero-extends them to 16-bit.
   v16[0] = _mm_unpacklo_epi8(tmp_v,tmp_v_l);
   v16[1] = _mm_unpackhi_epi8(tmp_v,tmp_v_r);

   // Every 16-bit lane of v16[0] is at most 255, so shifting right by 8
   // (logical or arithmetic) yields all-zero vectors.
   tmp_v_l = _mm_srli_epi16(v16[0],8);
   tmp_v_r = _mm_srai_epi16(v16[0],8);

   // Interleaving 16-bit lanes with zeros zero-extends them to 32-bit.
   v32[0] = _mm_unpacklo_epi16(v16[0],tmp_v_l);
   v32[1] = _mm_unpackhi_epi16(v16[0],tmp_v_r);

   // NOTE(review): tmp_v_l / tmp_v_r are all-zero here, so this recomputes the
   // same v16[0] / v16[1] as above; it looks redundant - confirm before removing.
   v16[0] = _mm_unpacklo_epi8(tmp_v,tmp_v_l);
   v16[1] = _mm_unpackhi_epi8(tmp_v,tmp_v_r);

   tmp_v_l = _mm_srli_epi16(v16[1],8);
   tmp_v_r = _mm_srai_epi16(v16[1],8);

   v32[2] = _mm_unpacklo_epi16(v16[1],tmp_v_l);
   v32[3] = _mm_unpackhi_epi16(v16[1],tmp_v_r);

#endif


   for(int l=0;l<4;l++)
      {

       // AND the integers with a per-vector mask, then convert to float.
       __m128 vc1_ps = _mm_cvtepi32_ps(_mm_and_si128(v32[l],m_lt[l]));
       __m128 vc2_ps = _mm_cvtepi32_ps(_mm_and_si128(v32[l],m_ge[l]));

/*
      .
      .
      .
      some processing there.
*/
      }
#endif

unsigned short -> float

// AVX2 path: widen 16 unsigned 16-bit values held in tmp_v into two vectors of
// eight 32-bit integers, then convert masked copies of each to float.
// NOTE(review): tmp_v is not loaded inside this snippet; presumably it is a
// 256-bit load from src+j as in the unsigned char version - confirm.
#ifdef __AVX2__

                // Zero-extend each 128-bit half (8 x 16-bit) to 8 x 32-bit.
                v32_avx[0] = _mm256_cvtepu16_epi32(_mm256_extractf128_si256(tmp_v,0x0));
                v32_avx[1] = _mm256_cvtepu16_epi32(_mm256_extractf128_si256(tmp_v,0x1));


                for(int l=0;l<2;l++)
                {

                    // AND the integers with a per-vector mask, then convert to float.
                    __m256 vc1_ps = _mm256_cvtepi32_ps(_mm256_and_si256(v32_avx[l],m_lt_avx[l]));
                    __m256 vc2_ps = _mm256_cvtepi32_ps(_mm256_and_si256(v32_avx[l],m_ge_avx[l]));

    /*
          .
          .
          .
          some processing there.
    */

                 }
#endif

// SSE path: widen 8 unsigned 16-bit values read from src+j into two vectors of
// four 32-bit integers, then convert masked copies of each to float.
// src, j, v32, m_lt and m_ge are declared outside this snippet.
#ifdef __SSE2__

// Unaligned 128-bit (16-byte) load; the SSE3 lddqu form is used when available.
#ifdef __SSE3__
                __m128i tmp_v = _mm_lddqu_si128(reinterpret_cast<const __m128i*>(src+j));
#else
                __m128i tmp_v = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src+j));
#endif


#ifdef __SSE4_1__
                // Zero-extend the low four 16-bit lanes, swap the two 64-bit
                // halves (shuffle control 0x4E), then zero-extend the former high four.
                v32[0] = _mm_cvtepu16_epi32(tmp_v);
                tmp_v = _mm_shuffle_epi32(tmp_v,0x4E);
                v32[1] = _mm_cvtepu16_epi32(tmp_v);
#else
                // Byte-shifting by 8 leaves zeros in the low half (tmp_v_l) or
                // the high half (tmp_v_r) of the register.
                __m128i tmp_v_l = _mm_slli_si128(tmp_v,8);
                __m128i tmp_v_r = _mm_srli_si128(tmp_v,8);

                // Interleaving 16-bit lanes with those zero halves zero-extends
                // them to 32-bit.
                v32[0] = _mm_unpacklo_epi16(tmp_v,tmp_v_l);
                v32[1] = _mm_unpackhi_epi16(tmp_v,tmp_v_r);
#endif


                for(int l=0;l<2;l++)
                {


                    // AND the integers with a per-vector mask, then convert to float.
                    __m128 vc1_ps = _mm_cvtepi32_ps(_mm_and_si128(v32[l],m_lt[l]));
                    __m128 vc2_ps = _mm_cvtepi32_ps(_mm_and_si128(v32[l],m_ge[l]));

    /*
          .
          .
          .
          some processing there.
    */

                 }

#endif

The processing in the comment has nothing to do with the conversion step.

I would like to speed up these conversions.

I read here :

SSE: convert short integer to float

and there :

Converting Int to Float/Float to Int using Bitwise

It's possible to do this using bitwise operations. Are those approaches really faster?

I experimented with the implementation from the first link; there is almost no gain in processing time. It works fine for signed short, and also for unsigned short as long as the value is between 0 and SHRT_MAX (32767):

#include <immintrin.h>
#include <iterator>
#include <iostream>
#include <chrono>

/**
 * @brief Widens unsigned 16-bit values to 32-bit integers using SSE2.
 *
 * Values are zero-extended, so the whole unsigned range 0..65535 is preserved.
 * Four elements are converted per vector iteration; any remaining elements
 * (when @p len is not a multiple of 4) are converted one by one, so nothing
 * is read or written beyond @p len elements.
 *
 * @param source       Input array holding at least @p len elements.
 * @param len          Number of elements to convert; nothing is done if <= 0.
 * @param destination  Output array with room for at least @p len ints.
 */
void convert_sse_intrinsic(const ushort *source,const int len, int *destination)
{
    const __m128i zero = _mm_setzero_si128();

    int i = 0;
    for (; len - i >= 4; i += 4)
    {
        // 64-bit load of four 16-bit values. _mm_loadl_epi64 has no alignment
        // requirement and avoids dereferencing the data through a long long*.
        const __m128i packed = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(source + i));

        // Interleaving with zero words is already a zero extension to 32-bit;
        // no shift pair is applied afterwards because that would sign-extend
        // and turn values above 32767 negative.
        const __m128i widened = _mm_unpacklo_epi16(packed, zero);

        _mm_storeu_si128(reinterpret_cast<__m128i*>(destination + i), widened);
    }

    // Scalar tail for the last len % 4 elements.
    for (; i < len; ++i)
    {
        destination[i] = static_cast<int>(source[i]);
    }
}

/**
 * @brief Widens unsigned 16-bit values to 32-bit integers, eight per iteration.
 *
 * Uses the SSE4.1 zero-extending conversion when the compiler targets SSE4.1
 * and falls back to an SSE2 interleave-with-zero otherwise; both produce the
 * same zero-extended result. Elements left over when @p len is not a multiple
 * of 8 are converted one by one, so nothing is read or written beyond @p len
 * elements.
 *
 * @param source       Input array holding at least @p len elements.
 * @param len          Number of elements to convert; nothing is done if <= 0.
 * @param destination  Output array with room for at least @p len ints.
 */
void convert_sse_intrinsic2(const ushort *source,const int len, int *destination)
{
    int i = 0;
    for (; len - i >= 8; i += 8)
    {
        const __m128i packed = _mm_loadu_si128(reinterpret_cast<const __m128i*>(source + i));

#ifdef __SSE4_1__
        // Zero-extend the low four lanes, then move the upper 64 bits down
        // and zero-extend those.
        const __m128i low  = _mm_cvtepu16_epi32(packed);
        const __m128i high = _mm_cvtepu16_epi32(_mm_srli_si128(packed, 8));
#else
        // SSE2 only: interleaving with zero words zero-extends to 32-bit.
        const __m128i zero = _mm_setzero_si128();
        const __m128i low  = _mm_unpacklo_epi16(packed, zero);
        const __m128i high = _mm_unpackhi_epi16(packed, zero);
#endif

        _mm_storeu_si128(reinterpret_cast<__m128i*>(destination + i), low);
        _mm_storeu_si128(reinterpret_cast<__m128i*>(destination + i + 4), high);
    }

    // Scalar tail for the last len % 8 elements.
    for (; i < len; ++i)
    {
        destination[i] = static_cast<int>(source[i]);
    }
}


/**
 * Runs both converters once on the same 16 sample values and prints, for each,
 * the elapsed time in nanoseconds followed by the converted values.
 *
 * NOTE(review): each timing covers a single call on 16 elements, so the
 * printed durations are dominated by clock and call overhead.
 */
int main(int argc, char *argv[])
{

    // alignas(32) is the standard way to request 32-byte alignment; it replaces
    // CV_DECL_ALIGNED, which none of the headers included by this file define.
    alignas(32) ushort toto[16] =
                        {0,500,1000,5000,
                       10000,15000,20000,25000,
                       30000,35000,40000,45000,
                       50000,55000,60000,65000};

    alignas(32) int tutu[16] = {0};

    std::chrono::steady_clock::time_point start = std::chrono::steady_clock::now();
    convert_sse_intrinsic(toto,16,tutu);
    std::chrono::steady_clock::time_point stop = std::chrono::steady_clock::now();

    std::cout<<"processing time 1st method : "<<std::chrono::duration_cast<std::chrono::nanoseconds>(stop-start).count()<<" : ns"<<std::endl;

    std::copy(tutu,tutu+16,std::ostream_iterator<int>(std::cout," "));
    std::cout<<std::endl;


    // Same input, second implementation; tutu is overwritten with its output.
    start = std::chrono::steady_clock::now();
    convert_sse_intrinsic2(toto,16,tutu);
    stop = std::chrono::steady_clock::now();

    std::cout<<"processing time 2nd method : "<<std::chrono::duration_cast<std::chrono::nanoseconds>(stop-start).count()<<" : ns"<<std::endl;

    std::copy(tutu,tutu+16,std::ostream_iterator<int>(std::cout," "));
    std::cout<<std::endl;


  return 0;
}

Thanks in advance for any help.

Aucun commentaire:

Enregistrer un commentaire