I would like to construct a distance matrix in parallel in C++11 using OpenMP. I read various documentations, introductions, examples etc. Yet, I still have a few questions. To facilitate answering this post, I state my questions as assumptions numbered 1 through 7. This way, you can quickly browse through them and point out which ones are correct and which ones are not.
Let us begin with a simple serially executed function computing a dense Armadillo matrix:
// [[Rcpp::export]]
arma::mat compute_dist_mat(arma::mat &coordinates, unsigned int n_points) {
arma::mat dist_mat(n_points, n_points, arma::fill::zeros);
double dist {};
for(unsigned int i {0}; i < n_points; i++) {
for(unsigned int j = i + 1; j < n_points; j++) {
dist = compute_dist(coordinates(i, 1), coordinates(j, 1), coordinates(i, 0), coordinates(j, 0));
dist_mat.at(i, j) = dist;
dist_mat.at(j, i) = dist;
}
}
return dist_mat;
}
As a side note: this function is supposed to be called from R through the Rcpp interface - indicated by the // [[Rcpp::export]]
. And accordingly the top of the file includes
#include <RcppArmadillo.h>
// [[Rcpp::depends(RcppArmadillo)]]
// [[Rcpp::plugins(cpp11)]]
#include <omp.h>
// [[Rcpp::plugins(openmp)]]
using namespace Rcpp;
using namespace arma;
However, the function should work also fine without the R interface.
In an attempt to parallelize the code, I replace the loops with
unsigned int i {};
unsigned int j {};
# pragma omp parallel for private(dist, i, j) num_threads(n_threads) if(n_threads > 1)
for(i = 0; i < n_points; i++) {
for(j = i + 1; j < n_points; j++) {
dist = compute_dist(coordinates(i, 1), coordinates(j, 1), coordinates(i, 0), coordinates(j, 0));
dist_mat.at(i, j) = dist;
dist_mat.at(j, i) = dist;
}
}
and add n_threads
as an argument to the compute_dist_mat
function.
- This distributes the iterations of the outer loop across threads, with the iterations of the inner loop executed by the respective thread handling the outer loop.
- The two loop levels cannot be combined because the inner loop depends on the outer one.
dist
,i
, andj
are all to be initialized above the# pragma
line and then declared private rather than initializing them in the loops.- The
# pragma
line does not have any effect whenn_treads = 1
, inducing a serial execution.
Extending the dense matrix application, the following code block illustrates the serial sparse matrix case with batch insertion. To motivate the use of sparse matrices here, I set distances below a certain threshold to zero.
// [[Rcpp::export]]
arma::sp_mat compute_dist_spmat(arma::mat &coordinates, unsigned int n_points, double dist_threshold) {
std::vector<double> dists;
std::vector<unsigned int> dist_i;
std::vector<unsigned int> dist_j;
double dist {};
for(unsigned long int i {0}; i < n_points; i++) {
for(unsigned long int j = i + 1; j < n_points; j++) {
dist = compute_dist(coordinates(i, 1), coordinates(j, 1), coordinates(i, 0), coordinates(j, 0));
if(dist >= dist_threshold) {
dists.push_back(dist);
dist_i.push_back(i);
dist_j.push_back(j);
}
}
}
unsigned int mat_size = dist_i.size();
arma::umat index_mat(2, mat_size * 2);
arma::vec dists_vec(mat_size * 2);
unsigned int j {};
for(unsigned int i {0}; i < mat_size; i++) {
j = i * 2;
index_mat.at(0, j) = dist_i[i];
index_mat.at(1, j) = dist_j[i];
index_mat.at(0, j + 1) = dist_j[i];
index_mat.at(1, j + 1) = dist_i[i];
dists_vec.at(j) = dists[i];
dists_vec.at(j + 1) = dists[i];
}
arma::sp_mat dist_mat(index_mat, values_vec, n_points, n_points);
return dist_mat;
}
Because the function does ex ante not know how many distances are above the threshold, it first stores the non-zero values in standard vectors and then constructs the Armadillo objects from them.
I parallelize the function as follows:
// [[Rcpp::export]]
arma::sp_mat compute_dist_spmat(arma::mat &coordinates, unsigned int n_points, double dist_threshold, unsigned short int n_threads) {
std::vector<std::vector<double>> dists(n_points);
std::vector<std::vector<unsigned int>> dist_j(n_points);
double dist {};
unsigned int i {};
unsigned int j {};
# pragma omp parallel for private(dist, i, j) num_threads(n_threads) if(n_threads > 1)
for(i = 0; i < n_points; i++) {
for(j = i + 1; j < n_points; j++) {
dist = compute_dist(coordinates(i, 1), coordinates(j, 1), coordinates(i, 0), coordinates(j, 0));
if(dist >= dist_threshold) {
dists[i].push_back(dist);
dist_j[i].push_back(j);
}
}
}
unsigned int vec_intervals[n_points + 1];
vec_intervals[0] = 0;
for (i = 0; i < n_points; i++) {
vec_intervals[i + 1] = vec_intervals[i] + dist_j[i].size();
}
unsigned int mat_size {vec_intervals[n_points]};
arma::umat index_mat(2, mat_size * 2);
arma::vec dists_vec(mat_size * 2);
unsigned int vec_begins_i {};
unsigned int vec_length_i {};
unsigned int k {};
# pragma omp parallel for private(i, j, k, vec_begins_i, vec_length_i) num_threads(n_threads) if(n_threads > 1)
for(i = 0; i < n_points; i++) {
vec_begins_i = vec_intervals[i];
vec_length_i = vec_intervals[i + 1] - vec_begins_i;
for(j = 0, j < vec_length_i, j++) {
k = (vec_begins_i + j) * 2;
index_mat.at(0, k) = i;
index_mat.at(1, k) = dist_j[i][j];
index_mat.at(0, k + 1) = dist_j[i][j];
index_mat.at(1, k + 1) = i;
dists_vec.at(k) = dists[i][j];
dists_vec.at(k + 1) = dists[i][j];
}
}
arma::sp_mat dist_mat(index_mat, dists_vec, n_points, n_points);
return dist_mat;
}
- Using dynamic vectors in the loop is thread-safe.
dist
,i
,j
,k
,vec_begins_i
, andvec_length_i
are all to be initialized above the# pragma
line and then declared private rather than initializing them in the loops.- Nothing has to be marked as a section.
Are any of the seven statements incorrect?
Aucun commentaire:
Enregistrer un commentaire