This is a very reduced test-case for something I've been having trouble with, where using a numerical wrapper class (which I thought would be inlined away) made my program 10x slower.
This is independent of optimisation level.
Am I missing some detail in my wrapper class?
C++
I have the following program, in which I define a class that wraps a double and provides the + operator:
#include <cstdio>
#include <cstdlib>
#define INLINE __attribute__((always_inline)) inline
struct alignas(8) WrappedDouble {
    double value;

    INLINE friend const WrappedDouble operator+(const WrappedDouble& left, const WrappedDouble& right) {
        return {left.value + right.value};
    }
};
#define doubleType WrappedDouble // either "double" or "WrappedDouble"

int main() {
    int N = 100000000;
    doubleType* arr = (doubleType*)malloc(sizeof(doubleType)*N);
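    // (arr is never initialised; only the loop's speed matters for this test)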
    for (int i = 1; i < N; i++) {
        arr[i] = arr[i - 1] + arr[i];
    }

    free(arr);
    printf("done\n");
    return 0;
}
I thought that this would compile to the same thing - it's doing the same calculations, and everything is inlined.
However, it's not - it produces a larger and slower result, regardless of optimisation level.
(This particular result is not significantly slower, but my actual use-case includes more arithmetic.)
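In other words, I expected always_inline to leave behind exactly the plain loop body, just spelled through the value member:

// What I expected the wrapped loop body to reduce to after inlining:
arr[i].value = arr[i - 1].value + arr[i].value;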
Assembly
I'm using Xcode's gcc on my Mac (which, as I understand it, is actually Clang), on a 64-bit system. The result is the same, aside from the body of the for-loop.
Here's what it produces for the body of the loop if doubleType is double:
movq -16(%rbp), %rax
movl -20(%rbp), %ecx
subl $1, %ecx
movslq %ecx, %rdx
movsd (%rax,%rdx,8), %xmm0 ## xmm0 = mem[0],zero
movq -16(%rbp), %rax
movslq -20(%rbp), %rdx
addsd (%rax,%rdx,8), %xmm0
movq -16(%rbp), %rax
movslq -20(%rbp), %rdx
movsd %xmm0, (%rax,%rdx,8)
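If I'm reading it right, that's a direct translation of the loop body - arr lives at -16(%rbp), i at -20(%rbp), and the three memory operations are the two loads and the store:

double tmp = arr[i - 1];  // movsd (%rax,%rdx,8), %xmm0
tmp += arr[i];            // addsd (%rax,%rdx,8), %xmm0
arr[i] = tmp;             // movsd %xmm0, (%rax,%rdx,8)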
The WrappedDouble version is much longer:
movq -40(%rbp), %rax
movl -44(%rbp), %ecx
subl $1, %ecx
movslq %ecx, %rdx
shlq $3, %rdx
addq %rdx, %rax
movq -40(%rbp), %rdx
movslq -44(%rbp), %rsi
shlq $3, %rsi
addq %rsi, %rdx
movq %rax, -16(%rbp)
movq %rdx, -24(%rbp)
movq -16(%rbp), %rax
movsd (%rax), %xmm0 ## xmm0 = mem[0],zero
movq -24(%rbp), %rax
addsd (%rax), %xmm0
movsd %xmm0, -8(%rbp)
movsd -8(%rbp), %xmm0 ## xmm0 = mem[0],zero
movsd %xmm0, -56(%rbp)
movq -40(%rbp), %rax
movslq -44(%rbp), %rdx
movq -56(%rbp), %rsi
movq %rsi, (%rax,%rdx,8)
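As far as I can tell, the extra work comes from the operands and the result being given real stack slots: the shlq/addq pairs compute &arr[i - 1] and &arr[i] as actual pointers (spilled to -16(%rbp) and -24(%rbp)), and the return value of operator+ gets its own slot at -8(%rbp) before being copied out through -56(%rbp). My reading of it, roughly, in source terms:

// Rough source-level equivalent of the unoptimised WrappedDouble loop body:
const WrappedDouble* left  = &arr[i - 1];             // shlq/addq, stored at -16(%rbp)
const WrappedDouble* right = &arr[i];                 // shlq/addq, stored at -24(%rbp)
WrappedDouble result = {left->value + right->value};  // addsd, stored at -8(%rbp)
arr[i] = result;                                      // copied out via -56(%rbp)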