lundi 18 septembre 2023

How can I further optimize this code regarding array operations?

In the following code I perform an XOR of two arrays, `result` and `DB`. `result` is accessed at a rotated offset (called `rotate1_1` in the code below). As you can see, I am already using AVX2, loop unrolling, and prefetching. I am wondering whether I am missing anything that might be slowing it down. In the code below, the else branch is taken only once each time the function is called. I have noticed that 50 percent of the time is spent on the XOR and 40 percent on the data stores; the remainder is spent on loads.

void perform_array_xor(uint32_t partindex, uint32_t offset, uint64_t *result, uint32_t EntrySize, uint32_t PartSize)
{
    auto B = 1;

    assert(EntrySize/8==B);

    // Ensure that PartSize is a multiple of 32 for this example
    if (PartSize % 8 != 0)
    {
        // Handle this case
        return;
    }

     __m256i a,b,r;

     unsigned int rotate1_1;
     int k;
    
    for (int i = 0; i < PartSize; i += 8)
    {
        rotate1_1 = (i + offset) & (PartSize - 1);


        _mm_prefetch(result + rotate1_1, _MM_HINT_T2);
        k = 0;
        if(rotate1_1 + 7 < PartSize){
            a = _mm256_loadu_si256((__m256i*)(result + rotate1_1));
            b = _mm256_loadu_si256((__m256i*)(DB + partindex + i));
            r = _mm256_xor_si256(a, b);
            _mm256_storeu_si256((__m256i*)(result + rotate1_1), r);
            //std::memcpy(result + rotate1_1, &r, sizeof(__m256i));
            
            k = 4 ;
            a = _mm256_loadu_si256((__m256i*)(result + rotate1_1 + k));
            b = _mm256_loadu_si256((__m256i*)(DB + partindex + i + k));
            r = _mm256_xor_si256(a, b);
            _mm256_storeu_si256((__m256i*)(result + rotate1_1 + k), r);
            //std::memcpy(result + rotate1_1 + k, &r, sizeof(__m256i));
            
        }
        else{
        result[(rotate1_1 + 0) & (PartSize - 1)] ^= DB[partindex + (i + 0)];
        result[(rotate1_1 + 1) & (PartSize - 1)] ^= DB[partindex + (i + 1)];
        result[(rotate1_1 + 2) & (PartSize - 1)] ^= DB[partindex + (i + 2)];
        result[(rotate1_1 + 3) & (PartSize - 1)] ^= DB[partindex + (i + 3)];
        result[(rotate1_1 + 4) & (PartSize - 1)] ^= DB[partindex + (i + 4)];
        result[(rotate1_1 + 5) & (PartSize - 1)] ^= DB[partindex + (i + 5)];
        result[(rotate1_1 + 6) & (PartSize - 1)] ^= DB[partindex + (i + 6)];
        result[(rotate1_1 + 7) & (PartSize - 1)] ^= DB[partindex + (i + 7)];
        }
        
    }
}

Aucun commentaire:

Enregistrer un commentaire