jeudi 29 janvier 2015

threaded matrix multiplication

I'm working on a threaded implementation of matrix multiplication to work with my custom Matrix class, and I'm running into some issues with speedup.


Setting up and calling the operation looks like this:



Matrix<double> left(2,2);
left[0][0] = 1; left[0][1] = 2;
left[1][0] = 3; left[1][1] = 4;

Matrix<double> right(2,2);
right[0][0] = 1; right[0][1] = 0;
right[1][0] = 0; right[1][1] = 1;

Matrix<double> result = left * right;


And here's a simplified version of my class with the implementations necessary to see how the operation is working. I included constructors, copy constructor, assignment operator, and destructor for thoroughness, the last two functions are mainly what I'm focusing on. Also, I'm transposing the right multiplicand to utilize spatial locality, so each location in the output matrix is a dot product of a row of the left multiplicand and a row of the right multiplicand, as compared to a row of the left multiplicand and a column of the right multiplican. Here's the code:



#include <iostream>
#include <thread>
using namespace std;

template <class type>
class Matrix {

public:
// Default Constructor
Matrix() { this->create(); }
// Custom Constructor
Matrix(int r, int c) { this->create(r,c); }
// Copy Constructor
Matrix(const Matrix<type> &m) { this->copy(m); }
// Assignment Operator
Matrix<type> operator=(const Matrix<type> &m);
// Destructor
~Matrix() { this->destroy(); }

// Accessor Functions
int getRows() const { return rows; }
int getCols() const { return cols; }
type* operator[](int i) const { return contents[i]; }

// Matrix Operations
Matrix<type> transpose() const;

// Matrix Multiplication
friend Matrix<type> operator*(const Matrix<type> &m1, const Matrix<type> &m2)
{ Matrix<type> call; return call.multiply(m1,m2); }

private:
// Private Member Functions
void create();
void create(int r, int c);
void copy(const Matrix<type> &m);
void destroy();

// Operator Overloading Functions
Matrix<type> multiply(const Matrix<type> &m1, const Matrix<type> &m2);

// Private Member Variables
int rows;
int cols;
type** contents;

};


// Default Constructor
template <class type>
void Matrix<type>::create() {
rows = 0;
cols = 0;
contents = NULL;
}


// Custom Constructor
template <class type>
void Matrix<type>::create(int r, int c) {
// Set Integer Values
rows = r;
cols = c;

// Allocate Two-Dimensional Data
contents = new type*[r];
for (int i = 0; i < r; i++) {
contents[i] = new type[c];
}

}


// Copy Constructor
template <class type>
void Matrix<type>::copy(const Matrix &m) {
// Create New Matrix From Existing One
this->rows = m.getRows();
this->cols = m.getCols();

// Allocate Two-Dimensional Data
this->contents = new type*[m.getRows()];
for (int i = 0; i < m.getRows(); i++) {
this->contents[i] = new type[m.getCols()];
}

// Copy Over Data
for (int i = 0; i < m.getRows(); i++) {
for (int j = 0; j < m.getCols(); j++) {
(*this)[i][j] = m[i][j];
}
}

}


// Assignment Operator
template <class type>
Matrix<type> Matrix<type>::operator=(const Matrix<type> &m) {
// Overwrite Existing Matrix with Another
// (Allowing for Self-Assignment)
if (this != &m) {
this->destroy();
this->copy(m);
}

return *this;

}


// Destructor
template <class type>
void Matrix<type>::destroy() {
// Frees Allocated Memory
for (int i = 0; i < rows; i++) {
delete[] contents[i];
}
delete[] contents;

}


// Matrix Transpose
template <class type>
Matrix<type> Matrix<type>::transpose() const {

Matrix<type> tran(cols,rows);

for (int i = 0; i < rows; i++) {
for (int j = 0; j < cols; j++) {
tran[j][i] = contents[i][j];
}
}

return tran;

}


// Threaded Matrix Multiplication
template <class type>
void matrixParallel(const Matrix<type>* a, const Matrix<type>* b, int numThreads, int currentThread, Matrix<type>* c) {

for (int i = currentThread; i < a->getRows(); i+=numThreads) {
for (int j = 0; j < b->getRows(); j++) {
type result = 0;
for (int k = 0; k < a->getCols(); k++) {
result += ((*a)[i][k]*(*b)[j][k]);
}
(*c)[i][j] = result;
}
}

}


// Matrix Multiplication
template <class type>
Matrix<type> Matrix<type>::multiply(const Matrix<type> &m1, const Matrix<type> &m2) {
if (m1.getCols() != m2.getRows()) {
cout << "Error: Cannot Multiply Matrices of Dimensions ";
cout << "(" << m1.getRows() << "x" << m1.getCols() << ")*";
cout << "(" << m2.getRows() << "x" << m2.getCols() << ")" << endl;
cout << " (Must be in the form (MxN)*(NxP)" << endl;
return Matrix<type>();
}


// Parallel Method
Matrix<type> m2t = m2.transpose();
Matrix<type> multiply(m1.getRows(), m2.getCols());

int numCPU = thread::hardware_concurrency();
thread* threads = new thread[numCPU];

const Matrix<type>* m1Pointer = &m1;
const Matrix<type>* m2tPointer = &m2t;
Matrix<type>* multiplyPointer = &multiply;

for (int i = 0; i < numCPU; i++) {
threads[i] = thread(matrixParallel<type>, m1Pointer, m2tPointer, numCPU, i, multiplyPointer);
}

for (int i = 0; i < numCPU; i++) {
threads[i].join();
}

delete[] threads;

return multiply;

}


(note, compiling in Cygwin with compiler option -std=c++11)


My friend implementation of operator* is a bit of a workaround, but I found that my two choices with using operators on templated classes were to use a series of forward declarations (that I couldn't get to work, and didn't really like the format of), or to implement the entire operation in the first declaration of the operator; I ended up doing the second in a cleaner way by using a dummy call object and calling another function that would perform the desired operation while still templated under the same type.


I'm not understanding why I'm not seeing a nearly linear speed-up with this approach, as each thread only does a fraction of the work depending on the number of cores available. Here's some benchmark data I've performed on varying sized matrices with this operation:



Single Thread
==========================================

5 Variables
------------------------------------------
100x5 * 5x5: 0.000157889 seconds
1000x5 * 5x5: 0.0010768 seconds
10000x5 * 5x5: 0.010099 seconds
100000x5 * 5x5: 0.112081 seconds
1000000x5 * 5x5: 1.04285 seconds

10 Variables
------------------------------------------
100x10 * 10x10: 0.000224202 seconds
1000x10 * 10x10: 0.00217571 seconds
10000x10 * 10x10: 0.0201944 seconds
100000x10 * 10x10: 0.203912 seconds
1000000x10 * 10x10: 2.04127 seconds

15 Variables
------------------------------------------
100x15 * 15x15: 0.000408143 seconds
1000x15 * 15x15: 0.00398906 seconds
10000x15 * 15x15: 0.0379782 seconds
100000x15 * 15x15: 0.381156 seconds
1000000x15 * 15x15: 3.81325 seconds

20 Variables
------------------------------------------
100x20 * 20x20: 0.000640239 seconds
1000x20 * 20x20: 0.00620069 seconds
10000x20 * 20x20: 0.060218 seconds
100000x20 * 20x20: 0.602554 seconds
1000000x20 * 20x20: 6.00925 seconds


2 Threads
==========================================

5 Variables
------------------------------------------
100x5 * 5x5: 0.000444063 seconds
1000x5 * 5x5: 0.00119759 seconds
10000x5 * 5x5: 0.00975319 seconds
100000x5 * 5x5: 0.09157 seconds
1000000x5 * 5x5: 0.965666 seconds

10 Variables
------------------------------------------
100x10 * 10x10: 0.000593268 seconds
1000x10 * 10x10: 0.00187927 seconds
10000x10 * 10x10: 0.0154861 seconds
100000x10 * 10x10: 0.161186 seconds
1000000x10 * 10x10: 1.5725 seconds

15 Variables
------------------------------------------
100x15 * 15x15: 0.000651292 seconds
1000x15 * 15x15: 0.00425471 seconds
10000x15 * 15x15: 0.0233983 seconds
100000x15 * 15x15: 0.232411 seconds
1000000x15 * 15x15: 2.43293 seconds

20 Variables
------------------------------------------
100x20 * 20x20: 0.000771287 seconds
1000x20 * 20x20: 0.0045547 seconds
10000x20 * 20x20: 0.0342536 seconds
100000x20 * 20x20: 0.381612 seconds
1000000x20 * 20x20: 3.79707 seconds


4 Threads
==========================================

5 Variables
------------------------------------------
100x5 * 5x5: 0.000690369 seconds
1000x5 * 5x5: 0.00120864 seconds
10000x5 * 5x5: 0.00994858 seconds
100000x5 * 5x5: 0.102673 seconds
1000000x5 * 5x5: 0.907731 seconds

10 Variables
------------------------------------------
100x10 * 10x10: 0.000896809 seconds
1000x10 * 10x10: 0.00287674 seconds
10000x10 * 10x10: 0.0177846 seconds
100000x10 * 10x10: 0.161331 seconds
1000000x10 * 10x10: 1.46384 seconds

15 Variables
------------------------------------------
100x15 * 15x15: 0.00100457 seconds
1000x15 * 15x15: 0.00366381 seconds
10000x15 * 15x15: 0.0291613 seconds
100000x15 * 15x15: 0.237525 seconds
1000000x15 * 15x15: 2.23676 seconds

20 Variables
------------------------------------------
100x20 * 20x20: 0.000928781 seconds
1000x20 * 20x20: 0.00486535 seconds
10000x20 * 20x20: 0.0421105 seconds
100000x20 * 20x20: 0.354478 seconds
1000000x20 * 20x20: 3.22576 seconds


Can anyone provide any insight as to what may be causing this? Any comments would be greatly appreciated.


Aucun commentaire:

Enregistrer un commentaire