dimanche 21 mars 2021

Significant performance discrepancy between two pieces of C++ code

I am having the trouble to understand why these two following routines do not have the expected difference in term of performance.

void matmul1(const float *matrix, const float *vector, float *output, uint32_t input_height, uint32_t input_width) {
    for (uint32_t y = 0; y < input_height; y++) {
        for (uint32_t x = 0; x < input_width; x++) {
            output[y] += matrix[y * input_width + x] * vector[x];
        }
    }
}

void matmul2(const float *matrix, const float *vector, float *output, uint32_t input_height, uint32_t input_width) {
    for (uint32_t y = 0; y < input_height; y++) {
        for (uint32_t x = 0; x < input_width; x++) {
            output[y] += *matrix++ * vector[x];
        }
    }
}

I repeat the executions of two functions on random data for 100 times on the same machines. The function matmul1 has a mean running time of 21298μs and the function matmul2 has a meaning running time of 24034μs. The standard deviation of the samples are 198 and 171.

Disassembly https://godbolt.org/z/of3zM4 gives me this (I am not fit enough with assembly to interpret the results properly)

matmul1(float const*, float const*, float*, unsigned int, unsigned int):
  mov w8, 0
  mov x7, 0
  cbz w3, .L1
.L2:
  cbz w4, .L5
  ldr s0, [x2, x7, lsl 2]
  mov x5, 0
.L6:
  add w6, w8, w5
  ldr s1, [x1, x5, lsl 2]
  add x5, x5, 1
  cmp w4, w5
  ldr s2, [x0, x6, lsl 2]
  fmadd s0, s2, s1, s0
  str s0, [x2, x7, lsl 2]
  bhi .L6
.L5:
  add x7, x7, 1
  add w8, w8, w4
  cmp w3, w7
  bhi .L2
.L1:
  ret
matmul2(float const*, float const*, float*, unsigned int, unsigned int):
  cbz w3, .L10
  sub w7, w4, #1
  mov x6, 0
  add x7, x7, 1
  lsl x7, x7, 2
.L15:
  cbz w4, .L12
  ldr s0, [x2, x6, lsl 2]
  mov x5, 0
.L13:
  ldr s2, [x0, x5, lsl 2]
  ldr s1, [x1, x5, lsl 2]
  add x5, x5, 1
  cmp w4, w5
  fmadd s0, s2, s1, s0
  str s0, [x2, x6, lsl 2]
  bhi .L13
  add x0, x0, x7
.L12:
  add x6, x6, 1
  cmp w3, w6
  bhi .L15
.L10:
  ret

I also ran the code on different optimization levels, different input sizes. Every time the first function would beat the second function. Why is that so? I expect the first function to be slower than the second, since it has one multiplication more in the inner loop, which is not the case.

I run the code on Raspberry Pi 4 with g++ 8.3.0

Aucun commentaire:

Enregistrer un commentaire