I have some code that is morally like this:
#include <bitset>
enum Flags { A = 1, B = 2, C = 3, D = 5,
E = 8, F = 13, G = 21, H,
I, J, K, L, M, N, O };
void apply_known_mask(std::bitset<64> &bits) {
const Flags important_bits[] = { B, D, E, H, K, M, L, O };
std::remove_reference<decltype(bits)>::type mask{};
for (const auto& bit : important_bits) {
mask.set(bit);
}
bits &= mask;
}
Clang >= 3.6 does the smart thing and compiles this to a single and
instruction (which then gets inlined everywhere else):
apply_known_mask(std::bitset<64ul>&): # @apply_known_mask(std::bitset<64ul>&)
and qword ptr [rdi], 775946532
ret
But every version of GCC I've tried compiles this to an enormous mess that includes error handling that should be statically DCE'd. In other code, it will even place the important_bits
equivalent as data in line with the code!
.LC0:
.string "bitset::set"
.LC1:
.string "%s: __position (which is %zu) >= _Nb (which is %zu)"
apply_known_mask(std::bitset<64ul>&):
sub rsp, 40
xor esi, esi
mov ecx, 2
movabs rax, 21474836482
mov QWORD PTR [rsp], rax
mov r8d, 1
movabs rax, 94489280520
mov QWORD PTR [rsp+8], rax
movabs rax, 115964117017
mov QWORD PTR [rsp+16], rax
movabs rax, 124554051610
mov QWORD PTR [rsp+24], rax
mov rax, rsp
jmp .L2
.L3:
mov edx, DWORD PTR [rax]
mov rcx, rdx
cmp edx, 63
ja .L7
.L2:
mov rdx, r8
add rax, 4
sal rdx, cl
lea rcx, [rsp+32]
or rsi, rdx
cmp rax, rcx
jne .L3
and QWORD PTR [rdi], rsi
add rsp, 40
ret
.L7:
mov ecx, 64
mov esi, OFFSET FLAT:.LC0
mov edi, OFFSET FLAT:.LC1
xor eax, eax
call std::__throw_out_of_range_fmt(char const*, ...)
My question is: how should I write this code so that both compilers can do the right thing? Failing that, how should I write this so that it remains clear, fast, and maintainable?
Aucun commentaire:
Enregistrer un commentaire