I have a uint32x4_t on which I want to perform a count of leading zeros (clz) and a table lookup using intrinsics.
Since table lookup is only supported through uint8x8xN_t on ARM, and the clz I want is only possible with uint32x4_t, I ran into the following problem:
I load the structure from my input data (uint8_t*). Then I want to perform the clz by "casting" the uint8x8x2_t into a uint32x4_t. This cast is just a combination of a vector combine (vcombine) and two reinterpret casts (vreinterpret).
As far as I understand the output of my little sample code, the call to {uint8x8x2}.val[0] fails somehow. That is why the output is far away from the expected result.
This is my code so far:
main.cpp:
#include <iostream>
#include <iomanip>
#include <arm_neon.h>
/// Prints a 16-row table comparing the original bytes with the bytes that
/// were stored back from the NEON registers.
///
/// @param data        16 original input bytes.
/// @param tmpData     16 bytes stored back from the two-register structure.
/// @param tmpDataHigh 8 bytes shown in rows 0..7.
/// @param tmpDataLow  8 bytes shown in rows 8..15.
void printu8(uint8_t* data, uint8_t* tmpData, uint8_t* tmpDataHigh, uint8_t* tmpDataLow) {
    std::cout << std::endl;
    std::cout << "Load/Store Results:" << std::endl;
    std::cout << " # Orig --> Struct --> High[0..7]/Low[8..15]" << std::endl;
    for (int i = 0; i < 16; ++i) {
        std::cout << "[" << std::setw(2) << i << "] "
                  << std::setw(2) << static_cast<unsigned>(data[i])
                  << " --> "
                  << std::setw(2) << static_cast<unsigned>(tmpData[i])
                  << " --> ";
        if (i < 8) {
            std::cout << std::setw(2) << static_cast<unsigned>(tmpDataHigh[i]);
        } else {
            // tmpDataLow only holds 8 elements, so rows 8..15 map to
            // indices 0..7; indexing with i directly reads past the buffer.
            std::cout << std::setw(2) << static_cast<unsigned>(tmpDataLow[i - 8]);
        }
        std::cout << std::endl;
    }
}
/// Prints the four CLZ results next to the hard-coded "Expected" column.
///
/// @param clzData Pointer to four 32-bit leading-zero counts.
void printClzResult(uint32_t* clzData) {
    // Each prefix carries the lane index and its hard-coded expected value.
    static const char* const rowPrefix[4] = {
        "[ 0] 16 --> ",
        "[ 1] 8 --> ",
        "[ 2] 0 --> ",
        "[ 3] 24 --> ",
    };

    std::cout << "CLZ Results:" << std::endl;
    std::cout << " # Expected --> Result" << std::endl;
    for (int lane = 0; lane < 4; ++lane) {
        std::cout << rowPrefix[lane]
                  << std::setw(2) << static_cast<unsigned>(clzData[lane])
                  << std::endl;
    }
}
/// Counts leading zero bits in each of the four 32-bit lanes formed by the
/// two 8-byte registers of `vec`.
///
/// The bytes are reinterpreted, not converted: val[0] supplies lanes 0..1 and
/// val[1] supplies lanes 2..3 of the combined 128-bit vector.
///
/// @param vec Two 8-byte NEON registers.
/// @return Heap array of four counts; the caller must release it with delete[].
uint32_t* clz(uint8x8x2_t vec) {
    const uint32x2_t lowerWords = vreinterpret_u32_u8(vec.val[0]);
    const uint32x2_t upperWords = vreinterpret_u32_u8(vec.val[1]);
    const uint32x4_t leadingZeros = vclzq_u32(vcombine_u32(lowerWords, upperWords));

    uint32_t* counts = new uint32_t[4];
    vst1q_u32(counts, leadingZeros);
    return counts;
}
int main() {
uint8_t* data = new uint8_t[16];
uint8_t* tmpData = new uint8_t[16];
uint8_t* tmpDataHigh = new uint8_t[8];
uint8_t* tmpDataLow = new uint8_t[8];
uint32_t* clzData;
//init data
//uint32_t:
//[0] = 2570
//[1] = 65537
//[2] = 168430090
//[3] = 10
data[0] = 0x00;
data[1] = 0x00;
data[2] = 0x0A;
data[3] = 0x0A;
data[4] = 0x00;
data[5] = 0x0A;
data[6] = 0x00;
data[7] = 0x0A;
data[8] = 0x0A;
data[9] = 0x0A;
data[10] = 0x0A;
data[11] = 0x0A;
data[12] = 0x00;
data[13] = 0x00;
data[14] = 0x00;
data[15] = 0x0A;
//load structure
uint8x8x2_t dataVec = vld2_u8(data);
//store structure
vst2_u8(tmpData, dataVec);
//store high 8 byte
vst1_u8(tmpDataHigh, dataVec.val[0]);
//store low 8 byte
vst1_u8(tmpDataHigh, dataVec.val[1]);
printu8(data, tmpData, tmpDataHigh, tmpDataLow);
//count leading zeroes
clzData = clz(dataVec);
printClzResult(clzData);
//free everything
delete[] clzData;
delete[] tmpDataLow;
delete[] tmpDataHigh;
delete[] tmpData;
delete[] data;
return 0;
}
If one compiles this code with
g++ -march=native -mfpu=neon -std=c++14 main.cpp
and launch it, the following output is generated:
Load/Store Results:
# Orig --> Struct --> High[0..7]/Low[8..15]
[ 0] 0 --> 0 --> 0
[ 1] 0 --> 0 --> 10
[ 2] 10 --> 10 --> 10
[ 3] 10 --> 10 --> 10
[ 4] 0 --> 0 --> 10
[ 5] 10 --> 10 --> 10
[ 6] 0 --> 0 --> 0
[ 7] 10 --> 10 --> 10
[ 8] 10 --> 10 --> 0
[ 9] 10 --> 10 --> 0
[10] 10 --> 10 --> 0
[11] 10 --> 10 --> 0
[12] 0 --> 0 --> 9
[13] 0 --> 0 --> 4
[14] 0 --> 0 --> 0
[15] 10 --> 10 --> 0
CLZ Results:
# Expected --> Result
[ 0] 16 --> 20
[ 1] 8 --> 20
[ 2] 0 --> 4
[ 3] 24 --> 4
Is there something I misunderstood? Or am I doing something wrong? Any help or hints are appreciated.
Aucun commentaire:
Enregistrer un commentaire