For the following code, I see a strange inlining/optimisation artefact, and I'm curious whether the extra instructions are "necessary" in some hypothetical scenario I'm not appreciating.
Godbolt: https://godbolt.org/z/M8PW1obE7
#include <cstdint>
#include <stdio.h>

struct ThreadStateLogger
{
    static thread_local struct Instance
    {
        char TLS_byteLoc[3] {' ', 0, 0};
        uint8_t TLS_numBytes {1};
        // 4 wasted bytes here...
        char* TLS_byteLocPtr {TLS_byteLoc};

        void Log(char v)
        {
            TLS_byteLocPtr[0] = v;
        }
        void Log(char v1, char v2)
        {
            TLS_byteLocPtr[0] = v1;
            if (TLS_numBytes > 1)
                TLS_byteLocPtr[1] = v2;
        }
    } instance;

    static void Log(char v1, char v2)
    {
        instance.Log(v1, v2);
    }
    // static void Log(char v1, char v2)
    // {
    //     instance.TLS_byteLocPtr[0] = v1;
    //     if (instance.TLS_numBytes > 1)
    //         instance.TLS_byteLocPtr[1] = v2;
    // }
};

extern ThreadStateLogger theThreadStateLogger;

int main()
{
    ThreadStateLogger::Log('a', 'b');
    // printf("Hello world");
    ThreadStateLogger::Log('c', 'd');
    return 0;
}
The whole primary implementation gets inlined with -O3, which is what I want :-)
It appears that the first Log() call correctly checks whether this thread's TLS instance needs initialising, and then obtains its address via __tls_get_addr@PLT, which is all good.
The second Log() call apparently checks again whether the object needs initialising, but then reuses the address cached from the first call (in rbx)! So if that second check did trigger initialisation, could the cached address be wrong?
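To restate what I think is going on (an assumption on my part, not verified against the ABI documents), each use of the thread_local seems to lower to roughly the sketch below. The names tls_init_for_instance, resolve_tls_address and two_logs are made up for illustration: they stand in for the weak TLS-init symbol, the __tls_get_addr call, and the inlined body of main respectively.

#include <cstdint>

// Minimal standalone sketch of the lowering I believe I'm seeing.
struct Instance
{
    char TLS_byteLoc[3];
    uint8_t TLS_numBytes;
    char* TLS_byteLocPtr;
};

extern void (*tls_init_for_instance)();  // null when no dynamic init is required
Instance* resolve_tls_address();         // models the __tls_get_addr@PLT call

void two_logs()
{
    // First Log('a','b'): guard test, possible init call, address lookup.
    if (tls_init_for_instance)           // cmp ... _ZTH...@GOTPCREL, 0
        tls_init_for_instance();         // call TLS init function ...@PLT
    Instance* p = resolve_tls_address(); // call __tls_get_addr@PLT (kept in rbx)
    p->TLS_byteLocPtr[0] = 'a';
    if (p->TLS_numBytes > 1)
        p->TLS_byteLocPtr[1] = 'b';

    // Second Log('c','d'): the guard is tested again, but the cached p is reused.
    if (tls_init_for_instance)
        tls_init_for_instance();
    p->TLS_byteLocPtr[0] = 'c';
    if (p->TLS_numBytes > 1)
        p->TLS_byteLocPtr[1] = 'd';
}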
Below is the output from clang 16 on Godbolt, which is comparable to gcc's - the same re-initialisation test and cached address - but somewhat better than my current clang 10 with -fPIC: https://godbolt.org/z/M8PW1obE7
main:                                   # @main
        push    rbx
        cmp     qword ptr [rip + _ZTHN17ThreadStateLogger8instanceE@GOTPCREL], 0
        je      .LBB0_2
        call    TLS init function for ThreadStateLogger::instance@PLT
.LBB0_2:
        data16
        lea     rdi, [rip + ThreadStateLogger::instance@TLSGD]
        data16
        data16
        rex64
        call    __tls_get_addr@PLT
        mov     rbx, rax
        mov     rax, qword ptr [rax + 8]
        mov     byte ptr [rax], 97
        cmp     byte ptr [rbx + 3], 2
        jae     .LBB0_3
        cmp     qword ptr [rip + _ZTHN17ThreadStateLogger8instanceE@GOTPCREL], 0
        jne     .LBB0_5
.LBB0_6:
        mov     rax, qword ptr [rbx + 8]
        mov     byte ptr [rax], 99
        cmp     byte ptr [rbx + 3], 2
        jae     .LBB0_7
.LBB0_8:
        xor     eax, eax
        pop     rbx
        ret
.LBB0_3:
        mov     rax, qword ptr [rbx + 8]
        mov     byte ptr [rax + 1], 98
        cmp     qword ptr [rip + _ZTHN17ThreadStateLogger8instanceE@GOTPCREL], 0
        je      .LBB0_6
.LBB0_5:
        call    TLS init function for ThreadStateLogger::instance@PLT
        mov     rax, qword ptr [rbx + 8]
        mov     byte ptr [rax], 99
        cmp     byte ptr [rbx + 3], 2
        jb      .LBB0_8
.LBB0_7:
        mov     rax, qword ptr [rbx + 8]
        mov     byte ptr [rax + 1], 100
        xor     eax, eax
        pop     rbx
        ret
Edits:
- Removed a printf - more init checks have been added, but the __tls_get_addr result is still cached in rbx.
- I will have another "play" to understand what the codepaths are generating now; I may have to introduce some volatiles (see the sketch after this list).
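For the record, by "introduce some volatiles" I mean something along these lines - a debugging variant only, to stop the optimiser folding the TLS_numBytes branch away; InstanceDebug is just a made-up name, not part of the real logger:

#include <cstdint>

struct InstanceDebug
{
    char TLS_byteLoc[3] {' ', 0, 0};
    uint8_t TLS_numBytes {1};
    char* TLS_byteLocPtr {TLS_byteLoc};

    void Log(char v1, char v2)
    {
        TLS_byteLocPtr[0] = v1;
        // Volatile read: the compiler can no longer assume the value that
        // TLS_numBytes was initialised with, so both codepaths stay visible
        // in the generated assembly.
        volatile uint8_t& n = TLS_numBytes;
        if (n > 1)
            TLS_byteLocPtr[1] = v2;
    }
};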
TL;DR
So, a reminder of the question: why is the init check/call repeated? If it is necessary, why does the address not need to be regenerated? Or is this just an optimisation nobody has thought of doing? Is there any way to get better code by juggling the source pattern? I have had a few goes to get to here, as you can see.
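For example, one more variation I intend to try is binding a reference to the thread_local once in the caller, so there is only a single odr-use per scope; my (unverified) assumption is that this should leave a single init check and a single __tls_get_addr, however many Log calls follow. Using the ThreadStateLogger definition from above:

int main()
{
    // Single odr-use of ThreadStateLogger::instance: the init guard and
    // the address lookup should (I assume) be emitted once, here.
    auto& inst = ThreadStateLogger::instance;
    inst.Log('a', 'b');
    inst.Log('c', 'd');
    return 0;
}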