I implemented a parallel_accumulate, similar to std::accumulate but using threads to parallelize the accumulation work. There are two versions of std::accumulate: the second takes an additional operator for accumulation, while the first implicitly assumes addition. First version:
/// @brief Sums the range [first, last) onto @p init using several threads.
///
/// The range is cut into contiguous blocks, one per worker thread. Each worker
/// folds its block with std::accumulate into its own slot of a results vector;
/// after all workers are joined, the partial results are folded onto @p init.
///
/// @tparam Iterator Multi-pass iterator (the range is traversed more than once).
/// @tparam T        Accumulator type; must be default-constructible, and a
///                  value-initialized T must be neutral for `+`, because every
///                  block's partial sum starts from it.
/// @param first Start of the range.
/// @param last  One past the end of the range.
/// @param init  Initial value that the partial sums are added to.
/// @return `init` plus all elements of the range; `init` if the range is empty.
///
/// @note An exception escaping a worker thread terminates the program.
template<typename Iterator, typename T>
static T parallel_accumulate(Iterator first, Iterator last, T init)
{
    unsigned long const length =
        static_cast<unsigned long>(std::distance(first, last));
    if (!length) return init;

    unsigned long const max_ths = 64;
    unsigned long const hw_ths = std::thread::hardware_concurrency();
    // hardware_concurrency() may report 0 when it cannot tell; fall back to 2.
    unsigned long const wanted_ths = std::min(hw_ths != 0 ? hw_ths : 2, max_ths);
    // Never use more threads than elements, so every block is non-empty.
    unsigned long const num_ths = std::min(wanted_ths, length);
    unsigned long const block_size = length / num_ths;

    std::vector<T> results(num_ths);
    std::vector<std::thread> threads;
    threads.reserve(num_ths);

    Iterator block_start = first;
    for (unsigned long i = 0; i < num_ths; ++i) {
        // The final block runs to `last` so the length % num_ths leftover
        // elements are included instead of being dropped.
        Iterator block_end = last;
        if (i + 1 != num_ths) {
            block_end = block_start;
            std::advance(block_end, block_size);
        }
        // Inline worker standing in for the omitted _noop functor: it runs the
        // additive std::accumulate over one block.
        threads.emplace_back( // diff1
            [](Iterator b, Iterator e, T& out) { out = std::accumulate(b, e, out); },
            block_start,
            block_end,
            std::ref(results[i]));
        block_start = block_end;
    }

    for (std::thread& worker : threads) {
        worker.join();
    }
    return std::accumulate(results.begin(), results.end(), init); // diff2
}
Second version:
/// @brief Folds the range [first, last) onto @p init with @p op using several threads.
///
/// The range is cut into contiguous blocks, one per worker thread. Each worker
/// folds its block with std::accumulate and a copy of @p op into its own slot
/// of a results vector; after all workers are joined, the partial results are
/// folded onto @p init, in block order, with the same operation.
///
/// @tparam Iterator Multi-pass iterator (the range is traversed more than once).
/// @tparam T        Accumulator type; must be default-constructible.
/// @tparam BinOp    Copyable callable usable as `op(T, element)` and `op(T, T)`.
/// @param first Start of the range.
/// @param last  One past the end of the range.
/// @param init  Initial value that the partial results are combined onto.
/// @param op    Binary operation. Because the range is regrouped into blocks,
///              it should be associative. Every block's fold starts from a
///              value-initialized T, so that value must be neutral for @p op
///              (as 0 is for addition); with e.g. multiplication over integers
///              the partial results would all collapse to 0.
/// @return The combined value; `init` if the range is empty.
///
/// @note An exception escaping a worker thread terminates the program.
template<typename Iterator, typename T, typename BinOp>
static T parallel_accumulate(Iterator first, Iterator last, T init, BinOp op)
{
    unsigned long const length =
        static_cast<unsigned long>(std::distance(first, last));
    if (!length) return init;

    unsigned long const max_ths = 64;
    unsigned long const hw_ths = std::thread::hardware_concurrency();
    // hardware_concurrency() may report 0 when it cannot tell; fall back to 2.
    unsigned long const wanted_ths = std::min(hw_ths != 0 ? hw_ths : 2, max_ths);
    // Never use more threads than elements, so every block is non-empty.
    unsigned long const num_ths = std::min(wanted_ths, length);
    unsigned long const block_size = length / num_ths;

    std::vector<T> results(num_ths);
    std::vector<std::thread> threads;
    threads.reserve(num_ths);

    Iterator block_start = first;
    for (unsigned long i = 0; i < num_ths; ++i) {
        // The final block runs to `last` so the length % num_ths leftover
        // elements are included instead of being dropped.
        Iterator block_end = last;
        if (i + 1 != num_ths) {
            block_end = block_start;
            std::advance(block_end, block_size);
        }
        // Inline worker standing in for the omitted _op functor. Capturing `op`
        // ensures the caller's operation (including any state it carries) is
        // the one applied inside each worker.
        threads.emplace_back( // diff1
            [op](Iterator b, Iterator e, T& out) { out = std::accumulate(b, e, out, op); },
            block_start,
            block_end,
            std::ref(results[i]));
        block_start = block_end;
    }

    for (std::thread& worker : threads) {
        worker.join();
    }
    return std::accumulate(results.begin(), results.end(), init, op); // diff2
}
As you can easily see, my implementations of the two versions differ on only two lines. How would I avoid duplicating this code elegantly in C++ (i.e., without using #defines)?
(Note that I omitted _noop and _op for clarity, they are simply callable structs calling the respective versions of std::accumulate)
Aucun commentaire:
Enregistrer un commentaire