jeudi 15 avril 2021

Why does the linear time implementation I have running the slowest. nth_element vs priority_queue vs algorithm heap functions for topKElements

I am currently working through a couple famous problems in C++ and am playing around with writing them in modern, optimized, C++. I am still fairly new to writing modern C++.

#include <iostream>
#include <vector>
#include <algorithm>
#include <unordered_map>
#include <stdlib.h>
#include <queue>
#include <chrono>
#include <utility>
#include <numeric>
#include <time.h>

using namespace std;

typedef std::chrono::high_resolution_clock::time_point TimeVar;

#define duration(a) std::chrono::duration_cast<std::chrono::nanoseconds>(a).count()
#define timeNow() std::chrono::high_resolution_clock::now()

unordered_map <int, int> m;

vector<int> topKFrequentLinear(int k){    
    vector <int> res;
    res.reserve(m.size());

    for (auto &p : m) res.push_back(p.first);
    
    nth_element(res.begin(), res.begin() + k, res.end(), [](int l, int r){ return m[l] > m[r];});

    // return vector<int> (res.begin(), res.begin() + k);
    
    res.resize(k);
    return res;
}

vector<int> topKFrequentHeap(int k) {    
    priority_queue<pair<int, int>, vector<pair<int, int>>, greater<pair<int, int>>> q;
    
    for(auto & it: m){
        if(q.size() == k){
            if (q.top().first < it.second){
                q.pop();
                q.emplace(it.second, it.first);
            }
        } else {
            q.emplace(it.second, it.first);
        }
    }
    vector<int> f;
    f.reserve(k);
    
    while (!q.empty()) {
        f.emplace_back(q.top().second);
        q.pop();
    }
    
    return f;
    
}

vector<int> topKFrequentHeapManual(int k) {    
    vector<pair<int, int>> q;
    q.reserve(k);
    
    for(auto & it: m){
        if(q.size() == k){
            if (q.front().first < it.second){
                pop_heap(q.begin(),q.end());
                q.pop_back();
            }
        }
        q.emplace_back(it.second, it.first);
        push_heap(q.begin(), q.end());
    }
    
    vector <int> v;
    v.reserve(k);
    
    while (!q.empty() && v.size() < k) {
        v.emplace_back(q.front().second);
        pop_heap(q.begin(),q.end());
        q.pop_back();
    }
    return v;
}

int main()
{
    vector<int> input (10000000, 0);
    srand(time(NULL));

    for(auto & i : input){
        i = rand() % 5000000;
    }

    for(auto & n : input) ++m[n];
    
    int times = 1;
    int k = 1000000;
    TimeVar t1;
    long double linear, heapApproach, manualHeapApproach;

    t1 = timeNow();
    topKFrequentLinear(k);
    linear = duration(timeNow()-t1);
    
    cout << "first " << linear << endl;

    t1 = timeNow();
    topKFrequentHeap(k);
    heapApproach = duration(timeNow()-t1);
    cout << "second " << heapApproach << endl;


    t1 = timeNow();
    topKFrequentHeapManual(k);
    manualHeapApproach = duration(timeNow()-t1);

    cout << "third " << manualHeapApproach << endl;

    return 0;
}

compile with g++ -std=c++17 -O2 test.cpp

When I run the output is

first 1.69238e+09
second 6.42668e+08
third 5.70223e+08

My question is why is the first, linear time approach taking so much longer than the heap approaches?

Aucun commentaire:

Enregistrer un commentaire