Refering to a c++11 multi-threading example, I try to ues multi-threading to compute the vector dot_product result. The basic idea here is that we break the vector into two parts, and concurrent compute the partial_sum of each part. And do the summation after synchronize the two threads taks. Here, I only use CPU and RAM resources. Besides, I try create a big vector in order to cover the thread schedule ovehead.
The problem is here that two threads is even slower than single thread. But the CPU utilization is much higher when using two threads. Hence, I believe two physical cores are used by the program.
Any help would be greatly appreciated.
Here is my thread implementation:
#include<iostream>
#include<thread>
#include<vector>
#include<mutex>
#include<ctime>
#include<numeric>
#include<iterator>
using namespace std;
vector<int> boundary(int num, int parts)
{
vector<int >bounds;
int delta = num / parts;
int remainder = num % parts;
int prev = 0, next = 0;
bounds.push_back(prev);
for(int i = 0; i < parts; i++)
{
next = prev + delta;
if(i == parts - 1)
next = next + remainder;
bounds.push_back(next);
prev = next;
}
return bounds;
}
void dot_product(const vector<int>& v1, const vector<int>& v2, int& result, int L, int R, int tid)
{
int partial_sum = 0;
for(int i = L; i < R; i++)
partial_sum += v1[i] * v2[i];
//lock_guard<mutex> lock(barrier);
//cout << "tid: " << tid<< endl;
result = partial_sum;
}
int main(int argc, char* argv[])
{
clock_t start, end;
int numOfElement = 500000000;
int numOfThread = 1;
int result[numOfThread] = {0};
vector<thread> threads;
// Fill two vectors with some values
vector<int> v1(numOfElement, 1), v2(numOfElement, 2);
// Split numOfElement into nr_threads parts
vector<int> limits = boundary(numOfElement, numOfThread);
start = clock();
// Launch multi_threads:
for(int i = 0; i < numOfThread; i++)
threads.push_back(thread(dot_product, ref(v1), ref(v2), ref(result[i]), limits[i], limits[i+1], i));
// Join the threads with the main thread
for(auto &t:threads)
t.join();
int sum = accumulate(result, result+numOfThread, 0);
end = clock();
//cout << limits[0] <<" " << limits[1]<<" "<<limits[2]<<endl;
cout << "results: " << sum << " time elapsed: "<< double(end - start) / CLOCKS_PER_SEC << endl;
return 0;
}
Platform:
OS: Win8 64bit
CPU: I3-3220 (2C4T)
RAM: 12G
IDE: Dev-C++ 5.11, TDM-GCC 4.9.2 release
Results so far:
1 Thread: 14.42 seconds, CPU: 60%, RAM: 7.9G
2 Threads: 19.65 seconds, CPU: 82%, RAM: 7.9G
4 Threads: 22.33 seconds, CPU: 99%, RAM: 7.9G
Aucun commentaire:
Enregistrer un commentaire