The following snippet takes a command-line parameter that represents the number of threads to spawn to run a simple for loop concurrently.
If the argument passed is 0, no std::thread
is spawned.
On gcc 4.9.2, ./snippet 0
takes 10% longer than ./snippet 1
on average, i.e. the version that spawns one std::thread
to execute the loop is faster than the version that just executes the loop inside main
.
Does anyone know what's going on? clang-4 does not show this behaviour at all (there, the version with one std::thread
is slower), while with gcc 6.2 the version with one std::thread
runs just slightly faster (when taking the minimum time spent over ten trials as the measured value).
Here is the snippet. ScopedNanoTimer
is just a simple RAII timer. I am compiling with -g -O3 -pthread -std=c++11
.
#include <cerrno>
#include <chrono>
#include <cstdlib>
#include <iostream>
#include <limits>
#include <thread>
#include <vector>
/// Scope-bound stopwatch.
///
/// Records a start time on construction and, on destruction, passes the
/// elapsed time in nanoseconds to the callback supplied by the caller.
class ScopedNanoTimer {
    // A monotonic clock is used so that wall-clock adjustments happening
    // while the timer is alive cannot distort (or negate) the measurement.
    using Clock = std::chrono::steady_clock;

    const Clock::time_point t0;
    void (*cb)(long long int);

public:
    /// @param callback  Function invoked from the destructor with the elapsed
    ///                  nanoseconds. A null pointer disables reporting.
    explicit ScopedNanoTimer(void (*callback)(long long int))
        : t0(Clock::now()), cb(callback) {}

    // Copying would make a single timed scope report more than once.
    ScopedNanoTimer(const ScopedNanoTimer&) = delete;
    ScopedNanoTimer& operator=(const ScopedNanoTimer&) = delete;

    ~ScopedNanoTimer()
    {
        // Read the clock first so the callback's own cost is not measured.
        const auto t1 = Clock::now();
        const auto nanos =
            std::chrono::duration_cast<std::chrono::nanoseconds>(t1 - t0).count();
        if (cb != nullptr) {
            cb(nanos);
        }
    }
};
/// Benchmark driver.
///
/// Usage: <program> <n_threads>
///
/// Splits a fixed budget of 1,000,000,000 loop iterations evenly across
/// n_threads std::thread workers (or runs the whole loop directly in main
/// when n_threads is 0), repeats the measurement ten times, and prints each
/// elapsed time in nanoseconds separated by spaces.
///
/// @return 0 on success, 1 when the argument is missing or is not a
///         non-negative integer that fits in `unsigned`.
int main(int argc, char** argv) {
    // setup
    if (argc < 2) {
        std::cerr << "usage: " << argv[0] << " <n_threads>\n";
        std::cerr << "n_threads == 0 indicates completely sequential execution\n";
        return 1;
    }

    // Validate the argument instead of trusting atoi: a negative value would
    // otherwise wrap to a huge unsigned thread count, and non-numeric text
    // would silently be treated as 0.
    char* parse_end = nullptr;
    errno = 0;
    const long requested = std::strtol(argv[1], &parse_end, 10);
    const bool is_number =
        parse_end != argv[1] && *parse_end == '\0' && errno != ERANGE;
    if (!is_number || requested < 0 ||
        static_cast<unsigned long>(requested) > std::numeric_limits<unsigned>::max()) {
        std::cerr << "error: <n_threads> must be a non-negative integer, got '"
                  << argv[1] << "'\n";
        return 1;
    }
    const unsigned n_threads = static_cast<unsigned>(requested);

    // Each worker gets an equal share of the total iteration budget; the
    // sequential case (0) runs the full budget, same as a single thread.
    const auto n_iterations = 1000000000ul / (n_threads == 0u ? 1u : n_threads);

    // define workload
    auto task = [n_iterations]() {
        // volatile keeps the optimizer from eliminating the loop entirely.
        volatile auto sum = 0ul;
        for (auto i = 0ul; i < n_iterations; ++i) ++sum;
    };

    // Shared reporter: prints one measurement followed by a space.
    const auto report = [](long long int ns) { std::cout << ns << " "; };

    // time and print
    for (auto trial = 0u; trial < 10; ++trial) {
        if (n_threads == 0) {
            ScopedNanoTimer timer(report);
            task();
        } else {
            // Declared before the timer so that the timer is destroyed (and
            // reports) before the vector of joined threads is torn down.
            std::vector<std::thread> threads;
            ScopedNanoTimer timer(report);
            for (auto t = 0u; t < n_threads; ++t) threads.emplace_back(task);
            for (auto& thread : threads) thread.join();
        }
    }
    std::cout << '\n';
    return 0;
}
Aucun commentaire:
Enregistrer un commentaire