I am having the trouble to understand why these two following routines do not have the expected difference in term of performance.
void matmul1(const float *matrix, const float *vector, float *output, uint32_t input_height, uint32_t input_width) {
for (uint32_t y = 0; y < input_height; y++) {
for (uint32_t x = 0; x < input_width; x++) {
output[y] += matrix[y * input_width + x] * vector[x];
}
}
}
void matmul2(const float *matrix, const float *vector, float *output, uint32_t input_height, uint32_t input_width) {
for (uint32_t y = 0; y < input_height; y++) {
for (uint32_t x = 0; x < input_width; x++) {
output[y] += *matrix++ * vector[x];
}
}
}
I repeat the executions of two functions on random data for 100 times on the same machines. The function matmul1
has a mean running time of 21298μs
and the function matmul2
has a meaning running time of 24034μs
. The standard deviation of the samples are 198
and 171
.
Disassembly https://godbolt.org/z/of3zM4 gives me this (I am not fit enough with assembly to interpret the results properly)
matmul1(float const*, float const*, float*, unsigned int, unsigned int):
mov w8, 0
mov x7, 0
cbz w3, .L1
.L2:
cbz w4, .L5
ldr s0, [x2, x7, lsl 2]
mov x5, 0
.L6:
add w6, w8, w5
ldr s1, [x1, x5, lsl 2]
add x5, x5, 1
cmp w4, w5
ldr s2, [x0, x6, lsl 2]
fmadd s0, s2, s1, s0
str s0, [x2, x7, lsl 2]
bhi .L6
.L5:
add x7, x7, 1
add w8, w8, w4
cmp w3, w7
bhi .L2
.L1:
ret
matmul2(float const*, float const*, float*, unsigned int, unsigned int):
cbz w3, .L10
sub w7, w4, #1
mov x6, 0
add x7, x7, 1
lsl x7, x7, 2
.L15:
cbz w4, .L12
ldr s0, [x2, x6, lsl 2]
mov x5, 0
.L13:
ldr s2, [x0, x5, lsl 2]
ldr s1, [x1, x5, lsl 2]
add x5, x5, 1
cmp w4, w5
fmadd s0, s2, s1, s0
str s0, [x2, x6, lsl 2]
bhi .L13
add x0, x0, x7
.L12:
add x6, x6, 1
cmp w3, w6
bhi .L15
.L10:
ret
I also ran the code on different optimization levels, different input sizes. Every time the first function would beat the second function. Why is that so? I expect the first function to be slower than the second, since it has one multiplication more in the inner loop, which is not the case.
I run the code on Raspberry Pi 4 with g++ 8.3.0
Aucun commentaire:
Enregistrer un commentaire