Monday, March 29, 2021

What decreases the performance of multithreaded code, apart from locks and the cost of creating and destroying threads?

I wrote a program that uses std::thread::hardware_concurrency to find out how many threads my computer can run concurrently. Then I divide the array into N blocks and create N threads, each of which computes the sum of its block. Here is the code:

#include <algorithm>
#include <chrono>
#include <functional>
#include <iostream>
#include <numeric>
#include <thread>
#include <vector>
#include <atomic>
#include <cstdint>

// Longest time any single accumulate_block took; it is written from several
// threads, so it has to be std::atomic to avoid a data race.
std::atomic<int64_t> thread_cost_time{0};



template <typename Iterator, typename T> struct accumulate_block {
  void operator()(Iterator first, Iterator last, T &result) {
    using namespace std::chrono;
    auto start = std::chrono::high_resolution_clock::now();
    result = std::accumulate(first, last, result);
    
    auto stop = std::chrono::high_resolution_clock::now();
    auto thread_time =
        std::chrono::duration_cast<microseconds>(stop - start).count();
    // Record the maximum per-block time; use compare_exchange because several
    // threads update this shared value concurrently.
    int64_t prev = thread_cost_time.load();
    while (prev < thread_time &&
           !thread_cost_time.compare_exchange_weak(prev, thread_time)) {
    }
  }
};

template <typename Iterator, typename T>
T parallel_accumulate(Iterator first, Iterator last, T &init, uint64_t num) {

  uint64_t length = std::distance(first, last);
  const uint64_t min_per_thread = 25;

  // hardware_concurrency() returns 12 on my PC
  const uint64_t hardware_threads = std::thread::hardware_concurrency();
  const uint64_t max_threads = (length + min_per_thread - 1) / (min_per_thread);

  // const uint64_t  num_threads = std::min(hardware_threads != 0 ?
  // hardware_threads : 2,
  //                                         max_threads);

  const uint64_t num_threads = num;

  const uint64_t block_size = length / num_threads;

  std::vector<T> results(num_threads);
  std::vector<std::thread> threads(num_threads - 1);
  Iterator block_start = first;
  for (uint64_t i = 0; i < num_threads - 1; i++) {
    Iterator block_end = block_start;
    std::advance(block_end, block_size);

    // calculate the sum of block
    threads[i] = std::thread{accumulate_block<Iterator, T>(), block_start,
                             block_end, std::ref(results[i])};
    block_start = block_end;
  }
  accumulate_block<Iterator, T>()(block_start, last, results[num_threads - 1]);
  std::for_each(threads.begin(), threads.end(),
                std::mem_fn(&std::thread::join));

  return std::accumulate(results.begin(), results.end(), init);
}




int main(int argc, char *argv[]) {

  //    constexpr const uint64_t sz = 1000000000;
  for (int number = 2; number < 32; number++) {
    int64_t parr = 0;
    int64_t single = 0;
    int64_t thread_trivial = 0;
    std::cout
        << "--------------------------------------------------------------"
        << std::endl;
    std::cout << "---------------------thread: " << number
              << "-----------------------" << std::endl;
    int iter_times = 10;
    for (int iter = 0; iter < iter_times; iter++) {
      thread_cost_time = 0;
      constexpr const uint64_t sz = 100000000 ;
      std::vector<uint64_t> arr;
      for (uint32_t i = 0; i < sz; i++) {
        arr.emplace_back(i);
      }

      using namespace std::chrono;

      auto start = std::chrono::high_resolution_clock::now();
      uint64_t init = 0;
      // keep the returned sum so the parallel result is actually used
      init = parallel_accumulate<decltype(arr.begin()), uint64_t>(
          arr.begin(), arr.end(), init, number);
      auto stop = std::chrono::high_resolution_clock::now();

      parr += std::chrono::duration_cast<microseconds>(stop - start).count();

      thread_trivial +=
          std::chrono::duration_cast<microseconds>(stop - start).count() -
          thread_cost_time;
      uint64_t init_ = 0;
      uint64_t arr_sz = arr.size();
      // uint64_t  block_sz = arr.size() / 2;
      start = std::chrono::high_resolution_clock::now();
      // assign the result so the compiler cannot optimise the call away
      init_ = std::accumulate(arr.begin(), arr.end(), init_);
      // std::cout << init_ << std::endl;
      stop = std::chrono::high_resolution_clock::now();
      single += std::chrono::duration_cast<microseconds>(stop - start).count();

    }
    std::cout << "parallel " << parr / iter_times<< std::endl;
    std::cout << "single thread " << single / iter_times<< std::endl;
    std::cout << "parr is "
              << static_cast<double>(single) / static_cast<double>(parr)
              << "X fast" << std::endl;
    std::cout << "thread create and destory time " << thread_trivial / iter_times
              << std::endl;
  }
}

I record the time for the multithreaded and the single-threaded versions.

I can achieve at most a 6.57x speedup over a single thread, even though std::thread::hardware_concurrency tells me 12 threads can run simultaneously.

There is no lock contention in this program. I also record the time spent creating and destroying the threads, and even if I subtract it, I still cannot get close to a 12x speedup.
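For reference, here is a small sketch of how I could time only the creation and joining of threads, with no work in their bodies, to cross-check that subtraction-based estimate (the thread count and iteration count here are just values I picked, not something measured above):

#include <chrono>
#include <iostream>
#include <thread>
#include <vector>

int main() {
  using namespace std::chrono;
  const int num_threads = 12;  // what hardware_concurrency() reports on my machine
  const int iterations = 100;  // average over many rounds to smooth out noise

  auto start = high_resolution_clock::now();
  for (int iter = 0; iter < iterations; iter++) {
    std::vector<std::thread> threads;
    for (int i = 0; i < num_threads; i++)
      threads.emplace_back([] {});  // empty body: only creation/teardown cost
    for (auto &t : threads)
      t.join();
  }
  auto stop = high_resolution_clock::now();

  std::cout << "create+join of " << num_threads << " threads: "
            << duration_cast<microseconds>(stop - start).count() / iterations
            << " us per round" << std::endl;
}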

I think maybe thread scheduling makes the threads slower, but with 12 hardware threads it shouldn't top out at only 6.57x. I also think running multiple threads might lower the cache hit ratio, but I'm not sure. So how can I get a 12x speedup over a single thread?
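To rule out scheduling effects, one thing I could try is pinning each worker thread to its own core. This is only a minimal Linux-specific sketch (it assumes g++/libstdc++, where std::thread::native_handle() is a pthread_t), and I have not verified that it changes my numbers:

#include <pthread.h>
#include <sched.h>
#include <cstdio>
#include <thread>
#include <vector>

// Pin a std::thread to one logical core (Linux-only; needs _GNU_SOURCE,
// which g++ defines by default for C++).
void pin_to_core(std::thread &t, unsigned core) {
  cpu_set_t cpuset;
  CPU_ZERO(&cpuset);
  CPU_SET(core, &cpuset);
  pthread_setaffinity_np(t.native_handle(), sizeof(cpu_set_t), &cpuset);
}

int main() {
  const unsigned n = std::thread::hardware_concurrency();
  std::vector<std::thread> workers;
  for (unsigned i = 0; i < n; i++) {
    workers.emplace_back([] { /* accumulate one block here */ });
    pin_to_core(workers.back(), i);
  }
  for (auto &t : workers)
    t.join();
  std::printf("pinned and joined %u threads\n", n);
}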

Here are the statistics from my program:

threads   parallel (us)   single (us)   speedup
2         324868          633777        1.95
3         218584          633777        2.87
4         167169          633777        3.77
5         136542          633777        4.64
6         113207          633777        5.48
7         147324          633777        4.27
8         136768          633777        4.67

You can run my code to get the data for 2 to 31 threads.
