lundi 19 avril 2021

Using OpenMP multithread is much slower than single thread on MacOS

I am trying to parallel my C++ Neural Network Training Process using OpenMP. But it won't work.
And then I used a simple C++ code with nested loops to test the OpenMP.
But it is much slower with OpenMP multithread than single thread.
Did I do something wrong to make it slower? Or did I miss something?

System

MacOS 4 cores

Language

C++

Time functions

I used both high_resolution_clock::now() and omp_get_wtime().

  1. std::chrono::high_resolution_clock::now();

single thread cost time: 0.00000000000000
2 threads cost time: 0.00010013580322
4 threads cost time: 0.00016403198242
6 threads cost time: 0.00017309188843
8 threads cost time: 0.00112605094910
10 threads cost time: 0.00013613700867
12 threads cost time: 0.00082898139954

  1. omp_get_wtime();

single thread cost time: 0.00000005900000
2 threads cost time: 0.00009907600000
4 threads cost time: 0.00018207300000
6 threads cost time: 0.00014479500000
8 threads cost time: 0.00070604400000
10 threads cost time: 0.00057277700000
12 threads cost time: 0.00074358000000

Code

#include <iostream>
#include <omp.h>
#include <chrono>
#include <iomanip>

using namespace std;
void test() {
    int j = 0;
    for (int i = 0; i < 100000; i++) {
        // do something to kill time...
        j++;
    }
};

int main()
{
    auto startTime = chrono::high_resolution_clock::now();
    auto endTime = chrono::high_resolution_clock::now();

    // without openMp
    startTime = chrono::high_resolution_clock::now();
    for (int i = 0; i < 100000; i++) {
        test();
    }
    endTime = chrono::high_resolution_clock::now();
    chrono::duration<double> diff = endTime - startTime;
    cout << setprecision(14) << fixed;
    cout << "single thread cost time: " << diff.count() << endl;

    // 2 threads
    startTime = chrono::high_resolution_clock::now();
    #pragma omp parallel for num_threads(2)
    for (int i = 0; i < 100000; i++) {
        test();
    }
    endTime = chrono::high_resolution_clock::now();
    diff = endTime - startTime;
    cout << "2 threads cost time: " << diff.count() << endl;

    // 4 threads
    startTime = chrono::high_resolution_clock::now();
    #pragma omp parallel for num_threads(4)
    for (int i = 0; i < 100000; i++) {
        test();
    }
    endTime = chrono::high_resolution_clock::now();
    diff = endTime - startTime;
    cout << "4 threads cost time: " << diff.count() << endl;

    // 6 threads
    startTime = chrono::high_resolution_clock::now();
    #pragma omp parallel for num_threads(6)
    for (int i = 0; i < 100000; i++) {
        test();
    }
    endTime = chrono::high_resolution_clock::now();
    diff = endTime - startTime;
    cout << "6 threads cost time: " << diff.count() << endl;

    startTime = chrono::high_resolution_clock::now();
    #pragma omp parallel for num_threads(8)
    for (int i = 0; i < 100000; i++) {
        test();
    }
    endTime = chrono::high_resolution_clock::now();
    diff = endTime - startTime;
    cout << "8 threads cost time: " << diff.count() << endl;

    startTime = chrono::high_resolution_clock::now();
    #pragma omp parallel for num_threads(10)
    for (int i = 0; i < 100000; i++) {
        test();
    }
    endTime = chrono::high_resolution_clock::now();
    diff = endTime - startTime;
    cout << "10 threads cost time: " << diff.count() << endl;

    startTime = chrono::high_resolution_clock::now();
    #pragma omp parallel for num_threads(12)
    for (int i = 0; i < 100000; i++) {
        test();
    }
    endTime = chrono::high_resolution_clock::now();
    diff = endTime - startTime;
    cout << "12 threads cost time: " << diff.count() << endl;

    // system("pause");
    return 0;
}

How I compile the code

clang++ -std=c++11 -Xpreprocessor -fopenmp parallel.cpp -O3 -o parallel -lomp

Aucun commentaire:

Enregistrer un commentaire