lundi 7 janvier 2019

Why is this C++ wrapper class not being inlined away?

This is a very reduced test-case for something I've been having trouble with, where using a numerical wrapper class (which I thought would be inlined away) made my program 10x slower.

This is independent of optimisation level.

Am I missing some detail in my wrapper class?


C++

I have the following program, in which I define a class which wraps a double and provides the + operator:

#include <cstdio>
#include <cstdlib>

#define INLINE __attribute__((always_inline)) inline

struct alignas(8) WrappedDouble {
    double value;

    INLINE friend const WrappedDouble operator+(const WrappedDouble& left, const WrappedDouble& right) {
        return {left.value + right.value};
    };
};

#define doubleType WrappedDouble // either "double" or "WrappedDouble"

int main() {
    int N = 100000000;
    doubleType* arr = (doubleType*)malloc(sizeof(doubleType)*N);
    for (int i = 1; i < N; i++) {
        arr[i] = arr[i - 1] + arr[i];
    }

    free(arr);
    printf("done\n");

    return 0;
}

I thought that this would compile to the same thing - it's doing the same calculations, and everything is inlined.

However, it's not - it produces a larger and slower result, regardless of optimisation level.

(This particular result is not significantly slower, but my actual use-case includes more arithmetic.)


Assembly

I'm using XCode's gcc on my Mac, on a 64-bit system. The result is the same, aside from the body of the for-loop.

Here's what it produces for the body of the loop if doubleType is double:

movq    -16(%rbp), %rax
movl    -20(%rbp), %ecx
subl    $1, %ecx
movslq  %ecx, %rdx
movsd   (%rax,%rdx,8), %xmm0    ## xmm0 = mem[0],zero
movq    -16(%rbp), %rax
movslq  -20(%rbp), %rdx
addsd   (%rax,%rdx,8), %xmm0
movq    -16(%rbp), %rax
movslq  -20(%rbp), %rdx
movsd   %xmm0, (%rax,%rdx,8)

The WrappedDouble version is much longer:

movq    -40(%rbp), %rax
movl    -44(%rbp), %ecx
subl    $1, %ecx
movslq  %ecx, %rdx
shlq    $3, %rdx
addq    %rdx, %rax
movq    -40(%rbp), %rdx
movslq  -44(%rbp), %rsi
shlq    $3, %rsi
addq    %rsi, %rdx
movq    %rax, -16(%rbp)
movq    %rdx, -24(%rbp)
movq    -16(%rbp), %rax
movsd   (%rax), %xmm0           ## xmm0 = mem[0],zero
movq    -24(%rbp), %rax
addsd   (%rax), %xmm0
movsd   %xmm0, -8(%rbp)
movsd   -8(%rbp), %xmm0         ## xmm0 = mem[0],zero
movsd   %xmm0, -56(%rbp)
movq    -40(%rbp), %rax
movslq  -44(%rbp), %rdx
movq    -56(%rbp), %rsi
movq    %rsi, (%rax,%rdx,8)

Aucun commentaire:

Enregistrer un commentaire