c++11: SIMD Structure processing

jeudi 23 février 2017

SIMD Structure processing

I have an uint32x4_t on which I want to perform a count of leading zeros (clz) and a table lookup withusing intrinsics.

Thus table lookup is only supported through uint8x8xN on ARM and the clz is only possible in the way I want it with uint32x4_t I ran into the following problem:

I load the structure from my input data (uint8_t*). Then I want to perform the clz by "casting" the uint8x8x2_t into an uint32x4_t. This cast ist just a combination of vector combine (vcombine) and two reinterpret_casts (vreinterpret).

As far as I understand the output of my little sample code, the call to {uint8x8x2}.val[0] fails somehow. That is why the output is far away from the expected result.

This is my code so far:

main.cpp:

#include <iostream>
#include <iomanip>
#include <arm_neon.h>

void printu8(uint8_t* data, uint8_t* tmpData, uint8_t* tmpDataHigh, uint8_t* tmpDataLow) {
   std::cout << std::endl;
   std::cout << "Load/Store Results:" << std::endl;
   std::cout << "  #   Orig  --> Struct --> High[0..7]/Low[8..15]" << std::endl;
   for(int i = 0; i < 16; ++i) {
      std::cout << "[" << std::setw(2) << i << "]   "
                << std::setw(2) << (unsigned)data[i]
                << "    -->   "
                << std::setw(2) << (unsigned)tmpData[i]
                << "  -->   ";
      if(i < 8) {
         std::cout << std::setw(2) << (unsigned)tmpDataHigh[i];
      }else{
         std::cout << std::setw(2) << (unsigned)tmpDataLow[i];
      }
      std::cout<< std::endl;
   }
}

void printClzResult(uint32_t* clzData) {
   std::cout << "CLZ Results:" << std::endl;
   std::cout << "  #  Expected --> Result" << std::endl;
   std::cout << "[ 0]    16    --> "
             << std::setw(2) << (unsigned)clzData[0]
             << std::endl;
   std::cout << "[ 1]     8    --> "
             << std::setw(2) << (unsigned)clzData[1]
             << std::endl;
   std::cout << "[ 2]     0    --> "
             << std::setw(2) << (unsigned)clzData[2]
             << std::endl;
   std::cout << "[ 3]    24    --> "
             << std::setw(2) << (unsigned)clzData[3]
             << std::endl;
}

uint32_t* clz(uint8x8x2_t vec) {

   uint32x4_t clz =
      vclzq_u32(
         vcombine_u32(
            vreinterpret_u32_u8(vec.val[0]),
            vreinterpret_u32_u8(vec.val[1])
         )
      );
   uint32_t* result = new uint32_t[4];
   vst1q_u32(result, clz);
   return result;
}

int main() {
   uint8_t* data = new uint8_t[16];
   uint8_t* tmpData = new uint8_t[16];
   uint8_t* tmpDataHigh = new uint8_t[8];
   uint8_t* tmpDataLow = new uint8_t[8];
   uint32_t* clzData;

   //init data
   //uint32_t:
   //[0] = 2570
   //[1] = 65537
   //[2] = 168430090
   //[3] = 10
   data[0]  = 0x00;
   data[1]  = 0x00;
   data[2]  = 0x0A;
   data[3]  = 0x0A;
   data[4]  = 0x00;
   data[5]  = 0x0A;
   data[6]  = 0x00;
   data[7]  = 0x0A;
   data[8]  = 0x0A;
   data[9]  = 0x0A;
   data[10] = 0x0A;
   data[11] = 0x0A;
   data[12] = 0x00;
   data[13] = 0x00;
   data[14] = 0x00;
   data[15] = 0x0A;

   //load structure
   uint8x8x2_t dataVec = vld2_u8(data);
   //store structure
   vst2_u8(tmpData, dataVec);
   //store high 8 byte
   vst1_u8(tmpDataHigh, dataVec.val[0]);
   //store low 8 byte
   vst1_u8(tmpDataHigh, dataVec.val[1]);
      printu8(data, tmpData, tmpDataHigh, tmpDataLow);
   //count leading zeroes 
   clzData = clz(dataVec);
      printClzResult(clzData);
   //free everything
   delete[] clzData;
   delete[] tmpDataLow;
   delete[] tmpDataHigh;
   delete[] tmpData;
   delete[] data;

   return 0;
}

If one compile this code

g++ -march=native -mfpu=neon -std=c++14 main.cpp

and launch it, the following output is generated:

Load/Store Results:
  #   Orig  --> Struct --> High[0..7]/Low[8..15]
[ 0]    0    -->    0  -->    0
[ 1]    0    -->    0  -->   10
[ 2]   10    -->   10  -->   10
[ 3]   10    -->   10  -->   10
[ 4]    0    -->    0  -->   10
[ 5]   10    -->   10  -->   10
[ 6]    0    -->    0  -->    0
[ 7]   10    -->   10  -->   10
[ 8]   10    -->   10  -->    0
[ 9]   10    -->   10  -->    0
[10]   10    -->   10  -->    0
[11]   10    -->   10  -->    0
[12]    0    -->    0  -->    9
[13]    0    -->    0  -->    4
[14]    0    -->    0  -->    0
[15]   10    -->   10  -->    0
CLZ Results:
  #  Expected --> Result
[ 0]    16    --> 20
[ 1]     8    --> 20
[ 2]     0    -->  4
[ 3]    24    -->  4

Is there something I missunderstood? Or do I something wrong? Any help or hints are appreciated.

c++11

jeudi 23 février 2017

SIMD Structure processing

Aucun commentaire:

Enregistrer un commentaire