samedi 4 janvier 2020

Why do the following two code snippets produce different compiled output with MSVC, but the same output with gcc?

This question is about why MSVC seems to miss a very basic optimization that gcc handles without difficulty. I'm wondering whether I missed something here and what the reason could be.

The following code snippet:

#include <vector>

// Element type stored in the std::vector that for_1 and for_2 iterate over.
struct X {
    int x;     // the only member the two loops write to
    double y;  // not touched by either loop
};

// Sets the `x` member of every element of the vector to 1,
// iterating with a range-based for loop.
void for_1(std::vector<X>& x) {
    for (auto& y : x) {  // `y` is a reference to the current element, so the write lands in the vector
        y.x = 1;
    }
}

// Sets the `x` member of every element of the vector to 1,
// iterating with an explicit iterator loop.
void for_2(std::vector<X>& x) {
    // Unlike for_1, x.end() appears in the loop condition itself, so as written
    // it is evaluated before every iteration.
    for (auto it = x.begin(); it != x.end(); ++ it) {
        it->x = 1;
    }
}

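When compiled with MSVC with optimizations enabled (-O2), the two functions compile to different assembly (note the additional mov at the top of for_1, which loads the end pointer into a register once, whereas for_2 re-reads it from memory in the comparison on every iteration; for the full output, see godbolt):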

x$ = 8
void for_2(std::vector<X,std::allocator<X> > &) PROC ; for_2, COMDAT
        mov     rax, QWORD PTR [rcx]
        cmp     rax, QWORD PTR [rcx+8]
        je      SHORT $LN3@for_2
        npad    7
$LL4@for_2:
        mov     DWORD PTR [rax], 1
        add     rax, 16
        cmp     rax, QWORD PTR [rcx+8]
        jne     SHORT $LL4@for_2
$LN3@for_2:
        ret     0
void for_2(std::vector<X,std::allocator<X> > &) ENDP ; for_2

x$ = 8
void for_1(std::vector<X,std::allocator<X> > &) PROC ; for_1, COMDAT
        mov     rdx, QWORD PTR [rcx+8]
        mov     rax, QWORD PTR [rcx]
        cmp     rax, rdx
        je      SHORT $LN3@for_1
        npad    4
$LL4@for_1:
        mov     DWORD PTR [rax], 1
        add     rax, 16
        cmp     rax, rdx
        jne     SHORT $LL4@for_1
$LN3@for_1:
        ret     0
void for_1(std::vector<X,std::allocator<X> > &) ENDP ; for_1
...

But with gcc 9.2 (-O3), the output is the same for both functions (apart from the order of the two initial loads and the label names):

for_1(std::vector<X, std::allocator<X> >&):
        mov     rax, QWORD PTR [rdi]
        mov     rdx, QWORD PTR [rdi+8]
        cmp     rax, rdx
        je      .L1
.L3:
        mov     DWORD PTR [rax], 1
        add     rax, 16
        cmp     rdx, rax
        jne     .L3
.L1:
        ret
for_2(std::vector<X, std::allocator<X> >&):
        mov     rdx, QWORD PTR [rdi+8]
        mov     rax, QWORD PTR [rdi]
        cmp     rax, rdx
        je      .L6
.L8:
        mov     DWORD PTR [rax], 1
        add     rax, 16
        cmp     rdx, rax
        jne     .L8
.L6:
        ret

To me, optimizing both functions to the same assembly seems pretty rudimentary, but MSVC for some reason fails to do so here. Why is this? Am I missing something?

Aucun commentaire:

Enregistrer un commentaire