I am trying to write a piece of code to convince myself that passing by value versus passing by reference (rvalue and lvalue reference) has a significant impact on performance (related question). I came up with the code below and expected the performance differences to be visible.
#include <chrono>
#include <cstdlib>
#include <iostream>
#include <vector>
// Shorthand for converting a std::chrono duration to whole milliseconds.
// Kept as a macro because callers invoke it with call syntax:
// DurationTy(end - start).
#define DurationTy std::chrono::duration_cast<std::chrono::milliseconds>

// Container type used for every benchmark payload.
using VectTy = std::vector<int>;

// Size argument handed to factory() by each benchmark loop.
constexpr size_t MAX = 10000u;

// Number of iterations each benchmark loop performs.
constexpr size_t NUM = MAX / 10;
// Returns the next std::rand() value reduced modulo `mod`.
//
// `mod == 0` yields 0: the remainder operation is undefined for a zero
// divisor, so that case is answered without evaluating `%`.
// std::rand() is never seeded here, so the sequence is the same on every run.
int randomize(int mod) {
  if (mod == 0) {
    return 0;
  }
  return std::rand() % mod;
}
// Builds a vector holding the ascending values 0, 1, 2, ...
//
// size: base element count.
// pos:  when true the result has `size` elements, otherwise `size * 2`.
//
// The vector is grown one push_back at a time and returned by value.
VectTy factory(size_t size, bool pos) {
  VectTy result;
  size_t const count = pos ? size : size * 2;
  size_t next = 0u;
  while (next < count) {
    // Alternative payload kept from the original experiment:
    // result.push_back(randomize(size));
    result.push_back(static_cast<int>(next));
    ++next;
  }
  return result;
}
// Sums every element of `vect`.
// The vector is received by value, so the function works on its own object
// rather than on the caller's.
long d1(VectTy vect) {
  long acc = 0;
  for (VectTy::size_type idx = 0; idx != vect.size(); ++idx) {
    acc += vect[idx];
  }
  return acc;
}
// Sums every element of `vect`.
// The vector is received by lvalue reference; the elements are only read.
long d2(VectTy& vect) {
  long acc = 0;
  for (VectTy::size_type idx = 0; idx != vect.size(); ++idx) {
    acc += vect[idx];
  }
  return acc;
}
// Sums every element of `vect`.
// The vector is received by rvalue reference; the elements are only read and
// nothing is moved out of the argument.
long d3(VectTy&& vect) {
  long acc = 0;
  for (VectTy::size_type idx = 0; idx != vect.size(); ++idx) {
    acc += vect[idx];
  }
  return acc;
}
// Prints one benchmark result: the checksum goes to stdout and the elapsed
// time in whole milliseconds goes to stderr. Called after the end timestamp
// has been taken, so the printing is never part of the measured interval.
static void printResult(long long total,
                        std::chrono::steady_clock::time_point start,
                        std::chrono::steady_clock::time_point end) {
  std::cout << total << std::endl;
  auto elapsed = DurationTy(end - start);
  std::cerr << elapsed.count() << std::endl;
}

// Runs four timed loops (T1-T4), each building NUM vectors with factory() and
// summing them through d1, d2 or d3, then prints the checksum and the elapsed
// milliseconds for each loop.
//
// The running total is a long long: with MAX = 10000 and NUM = 1000 it reaches
// about 1.25e11, which does not fit in a 32-bit long.
int main() {
  {
    auto start = std::chrono::steady_clock::now();
    long long total = 0;
    for (size_t i = 0; i < NUM; ++i) {
      total += d1(factory(MAX, i % 2 != 0));  // T1
    }
    auto end = std::chrono::steady_clock::now();
    printResult(total, start, end);
  }
  {
    auto start = std::chrono::steady_clock::now();
    long long total = 0;
    for (size_t i = 0; i < NUM; ++i) {
      VectTy vect = factory(MAX, i % 2 != 0);  // T2
      total += d1(vect);
    }
    auto end = std::chrono::steady_clock::now();
    printResult(total, start, end);
  }
  {
    auto start = std::chrono::steady_clock::now();
    long long total = 0;
    for (size_t i = 0; i < NUM; ++i) {
      VectTy vect = factory(MAX, i % 2 != 0);  // T3
      total += d2(vect);
    }
    auto end = std::chrono::steady_clock::now();
    printResult(total, start, end);
  }
  {
    auto start = std::chrono::steady_clock::now();
    long long total = 0;
    for (size_t i = 0; i < NUM; ++i) {
      total += d3(factory(MAX, i % 2 != 0));  // T4
    }
    auto end = std::chrono::steady_clock::now();
    printResult(total, start, end);
  }
  return 0;
}
I tested it with both gcc (4.9.2) and clang (trunk) using the -std=c++11 option. However, I found that T2 takes more time only when compiled with clang (for one run, in milliseconds: 755, 924, 752, 750). I also compiled a version with -fno-elide-constructors, but the results were similar.
My questions:
- What optimizations are applied that, in theory, bridge the potential performance gaps between T1, T2, and T3? (You can see that I also tried to avoid RVO in factory.)
- What optimization might gcc be applying to T2 in this case?
Aucun commentaire:
Enregistrer un commentaire