dimanche 26 avril 2015

Why are there no significant performance differences in this code between different parameter-passing strategies?

I am trying to write a piece of code to convince myself that passing by value versus passing by reference (lvalue and rvalue reference) has a significant impact on performance (related question). I came up with the code below, expecting the performance differences to be visible.

#include <chrono>
#include <cstddef>
#include <cstdlib>
#include <iostream>
#include <vector>

// Converts any std::chrono duration to whole milliseconds (truncating, as
// std::chrono::duration_cast does). A function template rather than a macro:
// the call syntax DurationTy(end - start) is the same, but the name obeys
// normal scoping and the argument is type-checked.
template <typename Rep, typename Period>
inline std::chrono::milliseconds DurationTy(
    std::chrono::duration<Rep, Period> const& d) {
  return std::chrono::duration_cast<std::chrono::milliseconds>(d);
}

// Container type shared by every benchmark case.
using VectTy = std::vector<int>;

// Size argument handed to factory() on each benchmark iteration.
constexpr size_t MAX = 10000u;
// Number of iterations each benchmark case runs.
constexpr size_t NUM = MAX / 10;

// Returns std::rand() reduced modulo `mod`.
// A `mod` of 0 would be a division by zero (undefined behaviour), so that
// case returns 0 instead of evaluating the remainder.
int randomize(int mod) {
  if (mod == 0) {
    return 0;
  }
  return std::rand() % mod;
}

// Builds a vector holding the sequence 0, 1, 2, ...
//   size: base element count.
//   pos:  when true the vector has `size` elements, otherwise `size * 2`.
// Returns the filled vector by value.
// A randomized fill (randomize(size) per element) was tried in place of the
// sequential values and is left disabled.
VectTy factory(size_t size, bool pos) {
  size_t const count = pos ? size : size * 2;
  VectTy vect;
  // The final length is known up front: allocate once instead of growing
  // repeatedly inside the loop.
  vect.reserve(count);
  for (size_t i = 0u; i < count; i++) {
    // Elements are int; make the size_t -> int narrowing explicit.
    vect.push_back(static_cast<int>(i));
  }
  return vect;
}

// Sums the elements of a vector received by value: the caller's argument is
// copied or moved into `vect` before the loop runs.
// Returns the sum as a long.
long d1(VectTy vect) {
  long accumulated = 0;
  for (VectTy::const_iterator it = vect.begin(), last = vect.end(); it != last;
       ++it) {
    accumulated += *it;
  }
  return accumulated;
}

// Sums the elements of a vector received by non-const lvalue reference; the
// vector is only read, never modified.
// Returns the sum as a long.
long d2(VectTy& vect) {
  long accumulated = 0;
  for (VectTy::const_iterator it = vect.begin(), last = vect.end(); it != last;
       ++it) {
    accumulated += *it;
  }
  return accumulated;
}

// Sums the elements of a vector received by rvalue reference, so callers must
// pass a temporary or an explicitly moved object. The vector is only read.
// Returns the sum as a long.
long d3(VectTy&& vect) {
  long accumulated = 0;
  for (VectTy::const_iterator it = vect.begin(), last = vect.end(); it != last;
       ++it) {
    accumulated += *it;
  }
  return accumulated;
}

namespace {

// Runs one benchmark case: calls `body` NUM times, alternating its bool
// argument (false on even iterations, true on odd ones), adds up the returned
// sums, then prints the grand total to stdout and the elapsed time in
// milliseconds to stderr. The timed region covers everything `body` does.
template <typename Body>
void run_case(Body body) {
  auto const start = std::chrono::steady_clock::now();
  // long long, not long: with the current MAX and NUM the grand total is
  // about 1.25e11, which does not fit in a 32-bit long.
  long long total = 0;
  for (size_t i = 0; i < NUM; ++i) {
    total += body(i % 2 != 0);
  }
  auto const end = std::chrono::steady_clock::now();
  std::cout << total << std::endl;
  auto const elapsed =
      std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
  std::cerr << elapsed.count() << std::endl;
}

}  // namespace

// Times four ways of handing a freshly built vector to a summing function.
int main(void) {
  // T1: temporary passed to a by-value parameter.
  run_case([](bool pos) -> long { return d1(factory(MAX, pos)); });

  // T2: named local passed to a by-value parameter.
  run_case([](bool pos) -> long {
    VectTy vect = factory(MAX, pos);
    return d1(vect);
  });

  // T3: named local passed by lvalue reference.
  run_case([](bool pos) -> long {
    VectTy vect = factory(MAX, pos);
    return d2(vect);
  });

  // T4: temporary bound to an rvalue reference.
  run_case([](bool pos) -> long { return d3(factory(MAX, pos)); });

  return 0;
}

I tested it with both gcc (4.9.2) and clang (trunk) using the -std=c++11 option. However, I found that T2 takes more time only when compiling with clang (for one run, in milliseconds: 755, 924, 752, 750). I also compiled a version with -fno-elide-constructors, with similar results.

My questions:

  • What are the optimizations applied that bridge the potential performance gaps between T1, T2, T3 in theory? (You can see that I also tried to avoid RVO in factory.)
  • Which optimization might gcc be applying to T2 in this case?

Aucun commentaire:

Enregistrer un commentaire