I am writing a simple benchmarking application to assess and compare the multithreading performance of pthreads and OpenMP. To do so, I use the Monte Carlo method to calculate Pi, distributing the iterations over a number of threads.
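(For context, the estimator is the standard one: sample points (x, y) uniformly in the unit square; the fraction landing inside the quarter circle x² + y² ≤ 1 tends to π/4, so π ≈ 4 · inside / total.)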
I wrote a small and simple wrapper class around the pthreads interface that just creates a thread when an object is constructed and stores the thread ID.
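The wrapper itself is not important to the question; a minimal sketch of what I mean, matching how it is used below (error handling omitted):

class Thread {
public:
    Thread(void* (*func)(void*), void* arg) {
        pthread_create(&id, NULL, func, arg); // spawn the thread on construction
    }
    pthread_t get_id() const { return id; } // expose the pthread_t for joining
private:
    pthread_t id;
};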
I have used a code snippet I found somewhere on this site as a supposedly thread-safe random number generator.
Now for my problem: when I run this code with just the main thread, or with the main thread plus one extra thread, the precision of the result increases with the number of iterations, just as expected. But if I increase the number of threads to three or four, I suddenly get much less precision. For example, with 10000 iterations on one thread the result is correct to the 4th digit, but with more threads it is only correct to the 2nd digit.
I have found that adding a delay between the creation of the threads remedies the problem, but I don't understand why.
Is there something wrong with the way I use the random number generator?
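One hypothesis I considered (which would explain why a delay helps) is that std::random_device hands identical seeds to threads created close together. To rule that out, I would mix a per-thread counter into the seed, roughly like this (get_double_seeded and rng_thread_counter are just names for this sketch):

#include <atomic>
#include <random>

static std::atomic<unsigned int> rng_thread_counter{0}; // incremented once per thread

double get_double_seeded(void){
    // Combine the random_device output with a unique counter value, so two
    // threads can never start their generators from the same state.
    thread_local std::mt19937_64 generator = []{
        std::seed_seq seq{std::random_device{}(), ++rng_thread_counter};
        return std::mt19937_64(seq);
    }();
    std::uniform_real_distribution<double> distribution(0.0, 1.0);
    return distribution(generator);
}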
Thanks in advance.
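For completeness, these are the headers and argument structs the code below relies on (reconstructed from their usage; my real definitions may differ in details):

#include <pthread.h>
#include <random>
#include <vector>

struct thread_arguments {
    long unsigned int* inside;    // out-parameter: receives 4 * (points inside the quarter circle)
    long unsigned int iterations; // number of iterations this thread runs
};

struct for_arguments {
    long unsigned int iterations; // total iterations across all threads
    unsigned int no_threads;      // thread count, including the main thread
};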
void* monteCarlo_intern(void* args){
    struct thread_arguments* arguments = (struct thread_arguments*) args;
    long unsigned int total = 0;
    long unsigned int inside = 0;
    // One generator per thread, seeded from std::random_device.
    // (thread_local makes these static per-thread; since each thread calls
    // this function exactly once, plain locals would behave the same.)
    thread_local std::mt19937_64 generator(std::random_device{}());
    thread_local std::uniform_real_distribution<double> distribution(0.0, 1.0);
    double d1 = 0, d2 = 0;
    for(total = 0; total < arguments->iterations; total++) {
        d1 = distribution(generator);
        d2 = distribution(generator);
        if((d1*d1 + d2*d2) <= 1.0) {inside++;}
    }
    *arguments->inside = inside * 4; // report 4 * hits back to the caller
    return NULL; // was missing: falling off the end of a void*() function is UB
}
double calcPI_monteCarlo(struct for_arguments args_in){
    std::vector<long unsigned int> inside_vec(args_in.no_threads - 1, 0); // one result slot per worker thread
    std::vector<Thread> threads;                                          // the worker threads themselves
    const unsigned long int iters_per_t = args_in.iterations / args_in.no_threads;
    std::vector<thread_arguments> args(args_in.no_threads - 1, {NULL, iters_per_t});
    for (unsigned int i = 0; i < args_in.no_threads - 1; i++)
    {
        args[i].inside = &inside_vec[i];                   // give each thread the address of its result slot
        threads.emplace_back(monteCarlo_intern, &args[i]); // create the thread and store it
    }
    long unsigned int inside = 0;
    thread_arguments local_args = {&inside, iters_per_t};
    monteCarlo_intern(&local_args); // the main thread does one share of the work itself
    for (unsigned int i = 0; i < args_in.no_threads - 1; i++)
    {
        pthread_join(threads[i].get_id(), NULL);
        inside += inside_vec[i];
    }
    long unsigned int total = iters_per_t * args_in.no_threads;
    return (double)inside / total; // each thread already multiplied its count by 4
}
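This is roughly how I call it (a minimal driver; the field order of for_arguments is as reconstructed above, and the OpenMP version further down is called the same way):

#include <cstdio>

int main(void){
    struct for_arguments args = {10000000, 4}; // 10^7 iterations split over 4 threads
    printf("pthreads Monte Carlo Pi: %f\n", calcPI_monteCarlo(args));
    return 0;
}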
double get_double(void){
    // One generator per OpenMP thread; the distribution object carries no
    // state we care about, so recreating it on every call is harmless.
    thread_local std::mt19937_64 generator(std::random_device{}());
    std::uniform_real_distribution<double> distribution(0.0, 1.0);
    return distribution(generator);
}
double calcPI_monteCarlo_openMP(const struct for_arguments args_in){
    long unsigned int total = 0;
    long unsigned int inside = 0;
    // Note: inside and total are incremented by all threads without
    // synchronization here; see the reduction variant below.
    #pragma omp parallel for num_threads(args_in.no_threads) shared(inside, total)
    for(long unsigned int i = 0; i < args_in.iterations; i++)
    {
        double d1 = get_double();
        double d2 = get_double();
        if((d1*d1 + d2*d2) <= 1.0) {inside++;}
        total++;
    }
    return (double)(4 * inside) / total;
}
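Side note: I am aware the shared inside and total counters in the loop above are unsynchronized increments; if that turns out to be part of the problem, the variant I would try uses a reduction clause instead (a sketch; the _reduced name is just for this post, and total can be dropped since the loop runs exactly args_in.iterations times):

double calcPI_monteCarlo_openMP_reduced(const struct for_arguments args_in){
    long unsigned int inside = 0;
    // Each thread accumulates a private copy of inside; OpenMP sums them at the end.
    #pragma omp parallel for num_threads(args_in.no_threads) reduction(+:inside)
    for(long unsigned int i = 0; i < args_in.iterations; i++)
    {
        double d1 = get_double();
        double d2 = get_double();
        if((d1*d1 + d2*d2) <= 1.0) {inside++;}
    }
    return (double)(4 * inside) / args_in.iterations;
}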