I was trying to use std::async in a heavy workload application to improve the performance but I encountered deadlock from time to time. I debugged for a very long time and I am almost certain that my code was fine and it seemed something wrong with std library.
So I wrote a simple test program to testify:
#include <iostream>
#include <vector>
#include <algorithm>
#include <numeric>
#include <future>
#include <string>
#include <mutex>
#include <unistd.h>
#include <atomic>
#include <iomanip>
std::atomic_long numbers[6];
void add(std::atomic_long& n)
{
++n;
}
void func2(std::atomic_long& n)
{
for (auto i = 0L; i < 1000000000000L; ++i)
{
std::async(std::launch::async, [&] {add(n);}); // Small task, I want to run them simultaneously
}
}
int main()
{
std::vector<std::future<void>> results;
for (int i = 0; i < 6; ++i)
{
auto& n = numbers[i];
results.push_back(std::async(std::launch::async, [&n] {func2(n);}));
}
while (true)
{
sleep(1);
for (int i = 0; i < 6; ++i)
std::cout << std::setw(20) << numbers[i] << " ";
std::cout << std::endl;
}
for (auto& r : results)
{
r.wait();
}
return 0;
}
This program will produce output like this:
763700 779819 754005 763287 767713 748994
768822 785172 759678 769393 772956 754469
773529 789382 763524 772704 776398 757864
778560 794419 768580 777507 781542 762991
782056 795578 771704 780554 784865 766162
801633 812610 788111 802617 803661 784894
After a time (minutes or hours), if there was a deadlock, the output will be like this:
4435337 4452421 4507907 4501378 2549550 4462899
4441213 4457648 4514424 4506626 2549550 4468019
4446301 4462675 4519272 4511889 2549550 4473266
4453940 4470304 4526382 4519513 2549550 4480872
4461095 4477708 4533272 4526901 2549550 4488313
4470974 4488287 4543442 4537286 2549550 4498733
The fifth column was frozen.
After one day, it became this:
23934912 23967635 24007250 23931203 2549550 3249788689
23934912 23967635 24007250 23931203 2549550 3249816818
23934912 23967635 24007250 23931203 2549550 3249835009
23934912 23967635 24007250 23931203 2549550 3249860262
23934912 23967635 24007250 23931203 2549550 3249894331
Almost all columns froze except last column. It look really odd.
I ran it on Linux, macOS, FreeBSD, and the result was:
- macOS:10.15.2, Clang:11.0.0, no deadlock
- FreeBSD:12.0, Clang:6.0.1, deadlock
- Linux: ubuntu 5.0.0-37, g++:7.4.0, no deadlock
- Linux: ubuntu 4.4.0-21, Clang:3.8.0, deadlock
In the gdb, the call stack was:
(gdb) thread apply all bt
Thread 10 (LWP 100467 of process 37763):
#0 0x000000080025c630 in ?? () from /lib/libthr.so.3
#1 0x0000000000000000 in ?? ()
Backtrace stopped: Cannot access memory at address 0x7fff4ad57000
Thread 9 (LWP 100464 of process 37763):
#0 0x000000080046fafa in _umtx_op () from /lib/libc.so.7
#1 0x0000000800264912 in ?? () from /lib/libthr.so.3
#2 0x000000080031f9f9 in std::__1::mutex::unlock() () from /usr/lib/libc++.so.1
#3 0x00000008002e8f55 in std::__1::__assoc_sub_state::set_value() () from /usr/lib/libc++.so.1
#4 0x00000000002053e1 in std::__1::__async_assoc_state<void, std::__1::__async_func<func2(std::__1::atomic<long>&)::$_0> >::__execute() ()
#5 0x0000000000205763 in void* std::__1::__thread_proxy<std::__1::tuple<std::__1::unique_ptr<std::__1::__thread_struct, std::__1::default_delete<std::__1::__thread_struct> >, void (std::__1::__async_assoc_state<void, std::__1::__async_func<func2(std::__1::atomic<long>&)::$_0> >::*)(), std::__1::__async_assoc_state<void, std::__1::__async_func<func2(std::__1::atomic<long>&)::$_0> >*> >(void*) ()
#6 0x000000080025c776 in ?? () from /lib/libthr.so.3
#7 0x0000000000000000 in ?? ()
Backtrace stopped: Cannot access memory at address 0x7fff6944a000
Thread 8 (LWP 100431 of process 37763):
#0 0x000000080046fafa in _umtx_op () from /lib/libc.so.7
#1 0x0000000800264912 in ?? () from /lib/libthr.so.3
#2 0x000000080031f9f9 in std::__1::mutex::unlock() () from /usr/lib/libc++.so.1
#3 0x00000008002e8f55 in std::__1::__assoc_sub_state::set_value() () from /usr/lib/libc++.so.1
#4 0x00000000002053e1 in std::__1::__async_assoc_state<void, std::__1::__async_func<func2(std::__1::atomic<long>&)::$_0> >::__execute() ()
#5 0x0000000000205763 in void* std::__1::__thread_proxy<std::__1::tuple<std::__1::unique_ptr<std::__1::__thread_struct, std::__1::default_delete<std::__1::__thread_struct> >, void (std::__1::__async_assoc_state<void, std::__1::__async_func<func2(std::__1::atomic<long>&)::$_0> >::*)(), std::__1::__async_assoc_state<void, std::__1::__async_func<func2(std::__1::atomic<long>&)::$_0> >*> >(void*) ()
#6 0x000000080025c776 in ?? () from /lib/libthr.so.3
#7 0x0000000000000000 in ?? ()
Backtrace stopped: Cannot access memory at address 0x7fffc371a000
Thread 7 (LWP 100657 of process 37763):
#0 0x000000080026a66c in ?? () from /lib/libthr.so.3
#1 0x000000080025e731 in ?? () from /lib/libthr.so.3
#2 0x0000000800268388 in ?? () from /lib/libthr.so.3
#3 0x000000080032de72 in std::__1::condition_variable::wait(std::__1::unique_lock<std::__1::mutex>&) () from /usr/lib/libc++.so.1
#4 0x00000008002e971b in std::__1::__assoc_sub_state::wait() () from /usr/lib/libc++.so.1
#5 0x0000000000205389 in std::__1::__async_assoc_state<void, std::__1::__async_func<func2(std::__1::atomic<long>&)::$_0> >::__on_zero_shared() ()
#6 0x000000000020346b in func2(std::__1::atomic<long>&) ()
#7 0x0000000000206f18 in main::$_1::operator()() const ()
#8 0x0000000000206eed in void std::__1::__async_func<main::$_1>::__execute<>(std::__1::__tuple_indices<>) ()
#9 0x0000000000206ea5 in std::__1::__async_func<main::$_1>::operator()() ()
#10 0x0000000000206df3 in std::__1::__async_assoc_state<void, std::__1::__async_func<main::$_1> >::__execute() ()
#11 0x0000000000207183 in void* std::__1::__thread_proxy<std::__1::tuple<std::__1::unique_ptr<std::__1::__thread_struct, std::__1::default_delete<std::__1::__thread_struct> >, void (std::__1::__async_assoc_state<void, std::__1::__async_func<main::$_1> >::*)(), std::__1::__async_assoc_state<void, std::__1::__async_func<main::$_1> >*> >(void*) ()
#12 0x000000080025c776 in ?? () from /lib/libthr.so.3
#13 0x0000000000000000 in ?? ()
Backtrace stopped: Cannot access memory at address 0x7fffdf5f9000
Thread 6 (LWP 100656 of process 37763):
#0 0x000000080026a66c in ?? () from /lib/libthr.so.3
#1 0x000000080025e731 in ?? () from /lib/libthr.so.3
#2 0x0000000800268388 in ?? () from /lib/libthr.so.3
#3 0x000000080032de72 in std::__1::condition_variable::wait(std::__1::unique_lock<std::__1::mutex>&) () from /usr/lib/libc++.so.1
#4 0x00000008002e971b in std::__1::__assoc_sub_state::wait() () from /usr/lib/libc++.so.1
#5 0x0000000000205389 in std::__1::__async_assoc_state<void, std::__1::__async_func<func2(std::__1::atomic<long>&)::$_0> >::__on_zero_shared() ()
#6 0x0000000000207a22 in std::__1::__release_shared_count::operator()(std::__1::__shared_count*) ()
#7 0x00000000002044f4 in std::__1::future<void> std::__1::__make_async_assoc_state<void, std::__1::__async_func<func2(std::__1::atomic<long>&)::$_0> >(std::__1::__async_func<func2(std::__1::atomic<long>&)::$_0>&&) ()
#8 0x00000000002035ea in std::__1::future<std::__1::__invoke_of<std::__1::decay<func2(std::__1::atomic<long>&)::$_0>::type>::type> std::__1::async<func2(std::__1::atomic<long>&)::$_0>(std::__1::launch, func2(std::__1::atomic<long>&)::$_0&&) ()
#9 0x0000000000203462 in func2(std::__1::atomic<long>&) ()
#10 0x0000000000206f18 in main::$_1::operator()() const ()
#11 0x0000000000206eed in void std::__1::__async_func<main::$_1>::__execute<>(std::__1::__tuple_indices<>) ()
#12 0x0000000000206ea5 in std::__1::__async_func<main::$_1>::operator()() ()
#13 0x0000000000206df3 in std::__1::__async_assoc_state<void, std::__1::__async_func<main::$_1> >::__execute() ()
#14 0x0000000000207183 in void* std::__1::__thread_proxy<std::__1::tuple<std::__1::unique_ptr<std::__1::__thread_struct, std::__1::default_delete<std::__1::__thread_struct> >, void (std::__1::__async_assoc_state<void, std::__1::__async_func<main::$_1> >::*)(), std::__1::__async_assoc_state<void, std::__1::__async_func<main::$_1> >*> >(void*) ()
#15 0x000000080025c776 in ?? () from /lib/libthr.so.3
#16 0x0000000000000000 in ?? ()
Backtrace stopped: Cannot access memory at address 0x7fffdf7fa000
Thread 5 (LWP 100655 of process 37763):
#0 0x000000080026a66c in ?? () from /lib/libthr.so.3
#1 0x000000080025e731 in ?? () from /lib/libthr.so.3
#2 0x0000000800268388 in ?? () from /lib/libthr.so.3
#3 0x000000080032de72 in std::__1::condition_variable::wait(std::__1::unique_lock<std::__1::mutex>&) () from /usr/lib/libc++.so.1
#4 0x00000008002e971b in std::__1::__assoc_sub_state::wait() () from /usr/lib/libc++.so.1
#5 0x0000000000205389 in std::__1::__async_assoc_state<void, std::__1::__async_func<func2(std::__1::atomic<long>&)::$_0> >::__on_zero_shared() ()
#6 0x0000000000207a22 in std::__1::__release_shared_count::operator()(std::__1::__shared_count*) ()
#7 0x00000000002044f4 in std::__1::future<void> std::__1::__make_async_assoc_state<void, std::__1::__async_func<func2(std::__1::atomic<long>&)::$_0> >(std::__1::__async_func<func2(std::__1::atomic<long>&)::$_0>&&) ()
#8 0x00000000002035ea in std::__1::future<std::__1::__invoke_of<std::__1::decay<func2(std::__1::atomic<long>&)::$_0>::type>::type> std::__1::async<func2(std::__1::atomic<long>&)::$_0>(std::__1::launch, func2(std::__1::atomic<long>&)::$_0&&) ()
#9 0x0000000000203462 in func2(std::__1::atomic<long>&) ()
#10 0x0000000000206f18 in main::$_1::operator()() const ()
#11 0x0000000000206eed in void std::__1::__async_func<main::$_1>::__execute<>(std::__1::__tuple_indices<>) ()
#12 0x0000000000206ea5 in std::__1::__async_func<main::$_1>::operator()() ()
#13 0x0000000000206df3 in std::__1::__async_assoc_state<void, std::__1::__async_func<main::$_1> >::__execute() ()
#14 0x0000000000207183 in void* std::__1::__thread_proxy<std::__1::tuple<std::__1::unique_ptr<std::__1::__thread_struct, std::__1::default_delete<std::__1::__thread_struct> >, void (std::__1::__async_assoc_state<void, std::__1::__async_func<main::$_1> >::*)(), std::__1::__async_assoc_state<void, std::__1::__async_func<main::$_1> >*> >(void*) ()
#15 0x000000080025c776 in ?? () from /lib/libthr.so.3
#16 0x0000000000000000 in ?? ()
Backtrace stopped: Cannot access memory at address 0x7fffdf9fb000
Thread 4 (LWP 100654 of process 37763):
#0 0x000000080026a66c in ?? () from /lib/libthr.so.3
#1 0x000000080025e731 in ?? () from /lib/libthr.so.3
#2 0x0000000800268388 in ?? () from /lib/libthr.so.3
#3 0x000000080032de72 in std::__1::condition_variable::wait(std::__1::unique_lock<std::__1::mutex>&) () from /usr/lib/libc++.so.1
#4 0x00000008002e971b in std::__1::__assoc_sub_state::wait() () from /usr/lib/libc++.so.1
#5 0x0000000000205389 in std::__1::__async_assoc_state<void, std::__1::__async_func<func2(std::__1::atomic<long>&)::$_0> >::__on_zero_shared() ()
#6 0x0000000000207a22 in std::__1::__release_shared_count::operator()(std::__1::__shared_count*) ()
#7 0x00000000002044f4 in std::__1::future<void> std::__1::__make_async_assoc_state<void, std::__1::__async_func<func2(std::__1::atomic<long>&)::$_0> >(std::__1::__async_func<func2(std::__1::atomic<long>&)::$_0>&&) ()
#8 0x00000000002035ea in std::__1::future<std::__1::__invoke_of<std::__1::decay<func2(std::__1::atomic<long>&)::$_0>::type>::type> std::__1::async<func2(std::__1::atomic<long>&)::$_0>(std::__1::launch, func2(std::__1::atomic<long>&)::$_0&&) ()
#9 0x0000000000203462 in func2(std::__1::atomic<long>&) ()
#10 0x0000000000206f18 in main::$_1::operator()() const ()
#11 0x0000000000206eed in void std::__1::__async_func<main::$_1>::__execute<>(std::__1::__tuple_indices<>) ()
#12 0x0000000000206ea5 in std::__1::__async_func<main::$_1>::operator()() ()
#13 0x0000000000206df3 in std::__1::__async_assoc_state<void, std::__1::__async_func<main::$_1> >::__execute() ()
#14 0x0000000000207183 in void* std::__1::__thread_proxy<std::__1::tuple<std::__1::unique_ptr<std::__1::__thread_struct, std::__1::default_delete<std::__1::__thread_struct> >, void (std::__1::__async_assoc_state<void, std::__1::__async_func<main::$_1> >::*)(), std::__1::__async_assoc_state<void, std::__1::__async_func<main::$_1> >*> >(void*) ()
#15 0x000000080025c776 in ?? () from /lib/libthr.so.3
#16 0x0000000000000000 in ?? ()
Backtrace stopped: Cannot access memory at address 0x7fffdfbfc000
Thread 3 (LWP 100653 of process 37763):
#0 0x000000080026a66c in ?? () from /lib/libthr.so.3
#1 0x000000080025e731 in ?? () from /lib/libthr.so.3
#2 0x0000000800268388 in ?? () from /lib/libthr.so.3
#3 0x000000080032de72 in std::__1::condition_variable::wait(std::__1::unique_lock<std::__1::mutex>&) () from /usr/lib/libc++.so.1
#4 0x00000008002e971b in std::__1::__assoc_sub_state::wait() () from /usr/lib/libc++.so.1
#5 0x0000000000205389 in std::__1::__async_assoc_state<void, std::__1::__async_func<func2(std::__1::atomic<long>&)::$_0> >::__on_zero_shared() ()
#6 0x0000000000207a22 in std::__1::__release_shared_count::operator()(std::__1::__shared_count*) ()
#7 0x00000000002044f4 in std::__1::future<void> std::__1::__make_async_assoc_state<void, std::__1::__async_func<func2(std::__1::atomic<long>&)::$_0> >(std::__1::__async_func<func2(std::__1::atomic<long>&)::$_0>&&) ()
#8 0x00000000002035ea in std::__1::future<std::__1::__invoke_of<std::__1::decay<func2(std::__1::atomic<long>&)::$_0>::type>::type> std::__1::async<func2(std::__1::atomic<long>&)::$_0>(std::__1::launch, func2(std::__1::atomic<long>&)::$_0&&) ()
#9 0x0000000000203462 in func2(std::__1::atomic<long>&) ()
#10 0x0000000000206f18 in main::$_1::operator()() const ()
#11 0x0000000000206eed in void std::__1::__async_func<main::$_1>::__execute<>(std::__1::__tuple_indices<>) ()
#12 0x0000000000206ea5 in std::__1::__async_func<main::$_1>::operator()() ()
#13 0x0000000000206df3 in std::__1::__async_assoc_state<void, std::__1::__async_func<main::$_1> >::__execute() ()
#14 0x0000000000207183 in void* std::__1::__thread_proxy<std::__1::tuple<std::__1::unique_ptr<std::__1::__thread_struct, std::__1::default_delete<std::__1::__thread_struct> >, void (std::__1::__async_assoc_state<void, std::__1::__async_func<main::$_1> >::*)(), std::__1::__async_assoc_state<void, std::__1::__async_func<main::$_1> >*> >(void*) ()
#15 0x000000080025c776 in ?? () from /lib/libthr.so.3
#16 0x0000000000000000 in ?? ()
Backtrace stopped: Cannot access memory at address 0x7fffdfdfd000
Thread 2 (LWP 100652 of process 37763):
#0 0x000000080026a66c in ?? () from /lib/libthr.so.3
#1 0x000000080025e731 in ?? () from /lib/libthr.so.3
#2 0x0000000800268388 in ?? () from /lib/libthr.so.3
#3 0x000000080032de72 in std::__1::condition_variable::wait(std::__1::unique_lock<std::__1::mutex>&) () from /usr/lib/libc++.so.1
#4 0x00000008002e971b in std::__1::__assoc_sub_state::wait() () from /usr/lib/libc++.so.1
#5 0x0000000000205389 in std::__1::__async_assoc_state<void, std::__1::__async_func<func2(std::__1::atomic<long>&)::$_0> >::__on_zero_shared() ()
#6 0x0000000000207a22 in std::__1::__release_shared_count::operator()(std::__1::__shared_count*) ()
#7 0x00000000002044f4 in std::__1::future<void> std::__1::__make_async_assoc_state<void, std::__1::__async_func<func2(std::__1::atomic<long>&)::$_0> >(std::__1::__async_func<func2(std::__1::atomic<long>&)::$_0>&&) ()
#8 0x00000000002035ea in std::__1::future<std::__1::__invoke_of<std::__1::decay<func2(std::__1::atomic<long>&)::$_0>::type>::type> std::__1::async<func2(std::__1::atomic<long>&)::$_0>(std::__1::launch, func2(std::__1::atomic<long>&)::$_0&&) ()
#9 0x0000000000203462 in func2(std::__1::atomic<long>&) ()
#10 0x0000000000206f18 in main::$_1::operator()() const ()
#11 0x0000000000206eed in void std::__1::__async_func<main::$_1>::__execute<>(std::__1::__tuple_indices<>) ()
#12 0x0000000000206ea5 in std::__1::__async_func<main::$_1>::operator()() ()
#13 0x0000000000206df3 in std::__1::__async_assoc_state<void, std::__1::__async_func<main::$_1> >::__execute() ()
#14 0x0000000000207183 in void* std::__1::__thread_proxy<std::__1::tuple<std::__1::unique_ptr<std::__1::__thread_struct, std::__1::default_delete<std::__1::__thread_struct> >, void (std::__1::__async_assoc_state<void, std::__1::__async_func<main::$_1> >::*)(), std::__1::__async_assoc_state<void, std::__1::__async_func<main::$_1> >*> >(void*) ()
#15 0x000000080025c776 in ?? () from /lib/libthr.so.3
#16 0x0000000000000000 in ?? ()
Backtrace stopped: Cannot access memory at address 0x7fffdfffe000
Thread 1 (LWP 100148 of process 37763):
#0 0x00000008004f984a in _nanosleep () from /lib/libc.so.7
#1 0x000000080025f17c in ?? () from /lib/libthr.so.3
#2 0x000000080045fe0b in sleep () from /lib/libc.so.7
#3 0x0000000000203b7b in main ()
It seems lots of threads got stuck on std::__1::condition_variable::wait
, which is unreasonable, in the test code, there is no use of any condition at all.
Can somebody tell me, am I doing it wrong or there is a bug in the std library?