In the following code I perform an XOR of two arrays, result and DB; result is accessed starting at a rotated offset, called rotate1_1 below. As you can see, I am already using AVX2, loop unrolling, and prefetching, and I am wondering whether I am missing anything that could explain the slow speed. The else branch below is taken only once per call of the function. Profiling shows that about 50 percent of the time is spent on the XOR, about 40 percent on the stores, and the remainder on the loads.
void perform_array_xor(uint32_t partindex, uint32_t offset, uint64_t *result, uint32_t EntrySize, uint32_t PartSize)
{
auto B = 1;
assert(EntrySize/8==B);
// Ensure that PartSize is a multiple of 32 for this example
if (PartSize % 8 != 0)
{
// Handle this case
return;
}
__m256i a,b,r;
unsigned int rotate1_1;
int k;
for (int i = 0; i < PartSize; i += 8)
{
rotate1_1 = (i + offset) & (PartSize - 1);
_mm_prefetch(result + rotate1_1, _MM_HINT_T2);
k = 0;
if(rotate1_1 + 7 < PartSize){
a = _mm256_loadu_si256((__m256i*)(result + rotate1_1));
b = _mm256_loadu_si256((__m256i*)(DB + partindex + i));
r = _mm256_xor_si256(a, b);
_mm256_storeu_si256((__m256i*)(result + rotate1_1), r);
//std::memcpy(result + rotate1_1, &r, sizeof(__m256i));
k = 4 ;
a = _mm256_loadu_si256((__m256i*)(result + rotate1_1 + k));
b = _mm256_loadu_si256((__m256i*)(DB + partindex + i + k));
r = _mm256_xor_si256(a, b);
_mm256_storeu_si256((__m256i*)(result + rotate1_1 + k), r);
//std::memcpy(result + rotate1_1 + k, &r, sizeof(__m256i));
}
else{
result[(rotate1_1 + 0) & (PartSize - 1)] ^= DB[partindex + (i + 0)];
result[(rotate1_1 + 1) & (PartSize - 1)] ^= DB[partindex + (i + 1)];
result[(rotate1_1 + 2) & (PartSize - 1)] ^= DB[partindex + (i + 2)];
result[(rotate1_1 + 3) & (PartSize - 1)] ^= DB[partindex + (i + 3)];
result[(rotate1_1 + 4) & (PartSize - 1)] ^= DB[partindex + (i + 4)];
result[(rotate1_1 + 5) & (PartSize - 1)] ^= DB[partindex + (i + 5)];
result[(rotate1_1 + 6) & (PartSize - 1)] ^= DB[partindex + (i + 6)];
result[(rotate1_1 + 7) & (PartSize - 1)] ^= DB[partindex + (i + 7)];
}
}
}
Aucun commentaire:
Enregistrer un commentaire