I am working on an assignment that involves a hash table, and I ran into a performance problem when reading data from a .csv
file. The file contains COVID-19 data from the WHO website (300K+ lines). My original code took around 2 minutes to read the file, so I decided to use threading in order to speed up the process. I set num_threads
to std::thread::hardware_concurrency(),
which is 64 on my laptop (MacBook Pro 2021 M2 chip). After this change I can still see some delay (6 seconds), which is definitely faster than my first solution.
Is there a way to speed it up even more? Is there a different way to approach this problem?
I don't want to assign a hard-coded value to num_threads because in the end the TAs will be running the program and I don't know what laptops they're using.
Code:
void process_chunk(std::vector<std::string> chunk, CovidDB* db, std::mutex* mtx) {
std::string latest_date_str = "01/01/00"; // initialize to an old date
std::tm latest_date = {};
std::istringstream iss(latest_date_str);
iss >> std::get_time(&latest_date, "%m/%d/%y");
for (auto line : chunk) {
std::stringstream ss(line);
std::string country, date_str, cases_str, deaths_str;
std::getline(ss, date_str, ',');
std::getline(ss, country, ',');
std::getline(ss, cases_str, ',');
std::getline(ss, deaths_str, ',');
int cases = std::stoi(cases_str);
int deaths = std::stoi(deaths_str);
std::tm entry_date = {};
std::istringstream iss2(date_str);
iss2 >> std::get_time(&entry_date, "%m/%d/%y");
if (mktime(&entry_date) > mktime(&latest_date)) {
latest_date_str = date_str;
latest_date = entry_date;
}
DataEntry* entry = new DataEntry();
entry->set_country(country);
entry->set_date(latest_date_str);
entry->set_c_cases(cases);
entry->set_c_deaths(deaths);
std::lock_guard<std::mutex> lock(*mtx);
db->add(entry);
}
}
/**
 * Reads a CSV file and inserts its rows into this database using worker threads.
 *
 * The first line of the file is treated as a header and discarded. The remaining
 * lines are distributed round-robin over one chunk per worker; each non-empty
 * chunk is processed by process_chunk on its own thread, and this function
 * returns only after every worker has been joined.
 *
 * If the file cannot be opened, an error is printed to std::cout and the
 * process exits with EXIT_FAILURE.
 *
 * @param COVID_FILE path of the CSV file to load.
 */
void CovidDB::add_covid_data(std::string const COVID_FILE) {
    std::ifstream file(COVID_FILE);
    if (!file) {
        std::cout << "\n[File ERROR]\n " << COVID_FILE << std::endl;
        std::exit(EXIT_FAILURE);
    }

    std::string line;
    std::getline(file, line);  // skip header line

    // hardware_concurrency() is only a hint and may return 0 when the value
    // cannot be determined; fall back to a single worker so the modulo below
    // never divides by zero.
    const unsigned detected = std::thread::hardware_concurrency();
    const std::size_t num_threads = (detected == 0) ? 1 : detected;

    std::vector<std::vector<std::string>> chunks(num_threads);
    std::size_t i = 0;
    while (std::getline(file, line)) {
        // Move the line into its chunk; getline overwrites `line` on the next pass.
        chunks[i % num_threads].push_back(std::move(line));
        ++i;
    }
    file.close();

    std::vector<std::thread> threads;
    threads.reserve(chunks.size());
    std::mutex mtx;
    for (auto& chunk : chunks) {
        if (chunk.empty()) {
            continue;  // fewer rows than workers: nothing to do for this one
        }
        // Hand the rows over to the worker instead of copying them.
        threads.emplace_back(process_chunk, std::move(chunk), this, &mtx);
    }
    for (auto& thread : threads) {
        thread.join();
    }
}
Makefile:
# Build rules for the CovidDB program (CovidDB.cpp + main.cpp -> main).
CXX = g++
# NOTE(review): no -O level is set, so this produces an unoptimized debug build.
CXXFLAGS = -std=c++11 -pthread -g -Wall -Wextra -Werror -pedantic -Wno-unused-parameter -Wno-return-type -Wno-unused-variable
LDFLAGS = -pthread

# `all` and `clean` are commands, not files; keep them working even if a file
# with one of those names exists.
.PHONY: all clean

all: main

# Link step: LDFLAGS is passed here so the linker flags defined above are used.
main: CovidDB.o main.o
	$(CXX) $(CXXFLAGS) $(LDFLAGS) -o $@ $^

CovidDB.o: CovidDB.cpp CovidDB.h
	$(CXX) $(CXXFLAGS) -c $<

main.o: main.cpp CovidDB.h
	$(CXX) $(CXXFLAGS) -c $<

clean:
	rm -f main *.o
I tried increasing the number of threads. I expected it to run faster, but nothing really changed.
Aucun commentaire:
Enregistrer un commentaire