I wrote a program which use std::thread::hardware_concurrency
to get how much threads my computer could support.Then I divide the size of array by N and get N blocks. And I create N threads to calculate the sum of the block.Here is the code
#include <algorithm>
#include <chrono>
#include <functional>
#include <iostream>
#include <numeric>
#include <thread>
#include <vector>
#include <stdlib.h>
int64_t thread_cost_time = 0;
template <typename Iterator, typename T> struct accumulate_block {
void operator()(Iterator first, Iterator last, T &result) {
using namespace std::chrono;
auto start = std::chrono::high_resolution_clock::now();
result = std::accumulate(first, last, result);
auto stop = std::chrono::high_resolution_clock::now();
auto thread_time =
std::chrono::duration_cast<microseconds>(stop - start).count();
thread_cost_time = std::max(thread_time, thread_cost_time);
}
};
template <typename Iterator, typename T>
T parallel_accumulate(Iterator first, Iterator last, T &init, uint64_t num) {
uint64_t length = std::distance(first, last);
const uint64_t min_per_thread = 25;
// it will assign 12 to hard_ware_threads in my pc
const uint64_t hardware_threads = std::thread::hardware_concurrency();
const uint64_t max_threads = (length + min_per_thread - 1) / (min_per_thread);
// const uint64_t num_threads = std::min(hardware_threads != 0 ?
// hardware_threads : 2,
// max_threads);
const uint64_t num_threads = num;
const uint64_t block_size = length / num_threads;
std::vector<T> results(num_threads);
std::vector<std::thread> threads(num_threads - 1);
Iterator block_start = first;
for (uint64_t i = 0; i < num_threads - 1; i++) {
Iterator block_end = block_start;
std::advance(block_end, block_size);
// calculate the sum of block
threads[i] = std::thread{accumulate_block<Iterator, T>(), block_start,
block_end, std::ref(results[i])};
block_start = block_end;
}
accumulate_block<Iterator, T>()(block_start, last, results[num_threads - 1]);
std::for_each(threads.begin(), threads.end(),
std::mem_fn(&std::thread::join));
return std::accumulate(results.begin(), results.end(), init);
}
int main(int argc, char *argv[]) {
// constexpr const uint64_t sz = 1000000000;
for (int number = 2; number < 32; number++) {
int64_t parr = 0;
int64_t single = 0;
int64_t thread_trivial = 0;
std::cout
<< "--------------------------------------------------------------"
<< std::endl;
std::cout << "---------------------thread: " << number
<< "-----------------------" << std::endl;
int iter_times = 10;
for (int iter = 0; iter < iter_times; iter++) {
thread_cost_time = 0;
constexpr const uint64_t sz = 100000000 ;
std::vector<uint64_t> arr;
for (uint32_t i = 0; i < sz; i++) {
arr.emplace_back(i);
}
using namespace std::chrono;
auto start = std::chrono::high_resolution_clock::now();
uint64_t init = 0;
parallel_accumulate<decltype(arr.begin()), uint64_t>(
arr.begin(), arr.end(), std::ref(init), number);
auto stop = std::chrono::high_resolution_clock::now();
parr += std::chrono::duration_cast<microseconds>(stop - start).count();
thread_trivial +=
std::chrono::duration_cast<microseconds>(stop - start).count() -
thread_cost_time;
uint64_t init_ = 0;
uint64_t arr_sz = arr.size();
// uint64_t block_sz = arr.size() / 2;
start = std::chrono::high_resolution_clock::now();
std::accumulate(arr.begin(), arr.end(), init_);
// std::cout << init_ << std::endl;
stop = std::chrono::high_resolution_clock::now();
single += std::chrono::duration_cast<microseconds>(stop - start).count();
}
std::cout << "parallel " << parr / iter_times<< std::endl;
std::cout << "single thread " << single / iter_times<< std::endl;
std::cout << "parr is "
<< static_cast<double>(single) / static_cast<double>(parr)
<< "X fast" << std::endl;
std::cout << "thread create and destory time " << thread_trivial / iter_times
<< std::endl;
}
}
I record the time of multithread and single thread.
I can only achieve at most 6.57x faster than use only one thread, even though std::thread::hardware_concurrency
tell me I have 12 threads could run simultaneously.
There were no contention of lock in this program.I also record the time of create and destory the thread, even if I minus it , I still cannot achieve 12X faster.
I think maybe thread schedule will make multithreads slow, but I have 12 threads, It shouldn't achieve only 6.57x faster. I think maybe multithreads will decrease the hit ratio of cache,but I'm not quite sure. So how can I achieve 12X faster than use only one thread?
Here is my static of my program
threads | parallel | single | faster |
---|---|---|---|
2 | 324868 | 633777 | 1.95 |
3 | 218584 | 633777 | 2.87 |
4 | 167169 | 633777 | 3.77 |
5 | 136542 | 633777 | 4.64 |
6 | 113207 | 633777 | 5.48 |
7 | 147324 | 633777 | 4.27 |
8 | 136768 | 633777 | 4.67 |
You could run my code to get the data from 2 threads to 31 threads
Aucun commentaire:
Enregistrer un commentaire