Hello everyone. This is my first question on Stack Overflow. I am trying to implement the At*A operation in CUDA.
Algorithm:
- I have used the tiled matrix-multiplication algorithm with shared memory.
- The code takes a single input matrix of dimension (rows x cols) = (A_r x A_c).
- The data type is complex<double>.
- The GPU is a GTX 780.
PROBLEM: The program runs properly for small matrices but crashes when the input matrix is very large.
QUESTION:
- What values of TILE_WIDTH, grid size, and block size should I use to run this code on an input matrix of size (11684 x 48) or larger?
- What changes should I make to reduce the computation time of this program?
- I want to make this code parameterizable, so that when I enter the matrix dimensions it automatically calculates the block and grid size (see the sketch below for the kind of thing I mean).
I am unable to fix the problem in this code. Please help me with it; I have been stuck on it for a long time.
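For context, this is the kind of run-time launch configuration I have in mind. It is only a sketch, separate from the full program below; the helper names makeBlock and makeGrid are my own invention, not CUDA API calls, and it assumes the output of At*A is A_c x A_c with square TILE_WIDTH x TILE_WIDTH blocks:

// Sketch only: derive the launch configuration from run-time dimensions.
// (In a .cu file, dim3 comes from the CUDA headers.)
dim3 makeBlock(int tile)
{
// one thread per element of an output tile
return dim3(tile, tile, 1);
}
dim3 makeGrid(int out_rows, int out_cols, int tile)
{
// one block per tile of the OUTPUT matrix, rounded up so that
// partially filled tiles at the edges are still covered
return dim3((out_cols + tile - 1) / tile, (out_rows + tile - 1) / tile, 1);
}

With A_c = 48 and TILE_WIDTH = 16 this would give a 3 x 3 grid of 16 x 16 blocks.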
//(At*A) in GPU using shared memory
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <stdlib.h>
#include <cuda.h>
#include <assert.h>
#include <complex>
#include <cuComplex.h>
#include <stdio.h>
#include <ctime>
#include <fstream>
#include <sstream>
#define A_c (48)
#define A_r (780)
#define C_c (A_c)
#define C_r (A_c)
#define N (A_c*A_r)
#define TILE_WIDTH (16)
using namespace std;
void display_matrix_mem (std::complex<double> *A, int row, int col);
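// Computes C = conj(A)^T * A for a row-major A_r x A_c complex matrix;
// each thread block produces one TILE_WIDTH x TILE_WIDTH tile of C.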
__global__ void coalescedMultiply(cuDoubleComplex *a, cuDoubleComplex *c){
__shared__ cuDoubleComplex ds_M[TILE_WIDTH][TILE_WIDTH];
__shared__ cuDoubleComplex ds_N[TILE_WIDTH][TILE_WIDTH];
int ty= threadIdx.y; int by=blockIdx.y;
int tx= threadIdx.x; int bx=blockIdx.x;
//**************************************
/*int blockNumInGrid = blockIdx.x + gridDim.x*blockIdx.y;
int threadNumInBlock = threadIdx.x + blockDim.x*threadIdx.y;
int threadsPerBlock = blockDim.x * blockDim.y;
int globalThreadNum = blockNumInGrid*threadsPerBlock + threadNumInBlock;*/
//**************************************
cuDoubleComplex Pvalue = make_cuDoubleComplex(0.0, 0.0); // running sum for this thread's output element
// read the matrix tile into shared memory
int Row1 = by * TILE_WIDTH + ty; // output row index, i.e. column i of A in conj(A)^T
int Row = by * TILE_WIDTH + tx; // column of A loaded (transposed) into ds_M
int Col = bx * TILE_WIDTH + tx; // output column index, i.e. column j of A
for (int m = 0; m < (A_r-1)/TILE_WIDTH+1; ++m) // one iteration per TILE_WIDTH-wide tile of the reduction dimension A_r, rounded up
{
if ( Row < A_c && ty+TILE_WIDTH*m < A_r)
{
ds_M[tx][ty] = a[Row + (m*TILE_WIDTH+ty)*A_c];
// printf("globalid= %d, ds_M: %f,Col,%d index: %d \n",globalThreadNum,ds_M[tx][ty].x,Col, (m*TILE_WIDTH+ty));
}
else
{
ds_M[tx][ty].x = 0;
ds_M[tx][ty].y = 0;
}
if (Col < A_c && m*TILE_WIDTH+ty < A_r)
{
ds_N[ty][tx] = a[Col + A_c*(m*TILE_WIDTH+ty)];
}
else
{
ds_N[ty][tx].x = 0;
ds_N[ty][tx].y = 0;
}
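// wait until both tiles are fully loaded before computing with them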
__syncthreads();
for (int k = 0; k < TILE_WIDTH; ++k)
{
cuDoubleComplex svalue1 = cuCmul(cuConj(ds_M[ty][k]), ds_N[k][tx]); // conj(A[r][i]) * A[r][j]
//printf("globalid: %d , ds_M: %f, ds_N: %f \n",globalThreadNum,ds_M[ty][k].x,ds_N[k][tx].x);
Pvalue = cuCadd(svalue1,Pvalue);
}
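// make sure every thread is done reading the tiles before the next iteration overwrites them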
__syncthreads();
}
// guarded write of this thread's output element
if (Row1 < C_r && Col < C_c) {
c[Row1*C_c+Col].x = Pvalue.x;
c[Row1*C_c+Col].y = Pvalue.y;
}
}
int main(int argc,char** argv)
{
int count=1;
complex<double> *source = (complex<double> *)malloc(N * sizeof(complex<double>));
for (int i = 0; i < N ; i++) {
*(source+i) = complex<double> (count,count);
count++;
}
for (int i = 0; i < 20 ; i++) {
cout<< *(source+i) ;
}
std::cout<<"press enter to start multiplication procedure"<<std::endl<<std::endl;
getchar();
std::complex<double> *matC_colesced = (std::complex<double> *)malloc(C_r * C_c * sizeof(std::complex<double>));
const int size_source = A_r*A_c;
const int size_product = A_c*A_c;
const int bytes_source = size_source * sizeof(std::complex<double>);
const int bytes_product = size_product * sizeof(std::complex<double>);
cuDoubleComplex *d_a,*d_c;
std::cout<<"memory access has stareted"<<std::endl;
cudaMalloc( (void**)&d_a,bytes_source);
cudaMalloc( (void**)&d_c,bytes_product);
if(cudaMemcpy(d_a, source , bytes_source, cudaMemcpyHostToDevice)!=cudaSuccess)
{
std::cout<<"cudaMemcpy of the source matrix to the device failed"<<std::endl;
return 0;
}
if(cudaMemcpy(d_c, matC_colesced, bytes_product, cudaMemcpyHostToDevice)!=cudaSuccess)
{
std::cout<<"cudaMemcpy of the output buffer to the device failed"<<std::endl;
return 0;
}
float time;
cudaError_t err;
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
// Kernel dimensions and call
dim3 dimGrid_col_shared((C_c-1)/TILE_WIDTH+1, (C_r-1)/TILE_WIDTH+1, 1); // the grid only has to cover the A_c x A_c output matrix, not the A_r x A_c input
dim3 dimBlock_col_shared(TILE_WIDTH, TILE_WIDTH, 1);
printf("number of blocks_shared: %d x %d x %d \n",dimGrid_col_shared.x,dimGrid_col_shared.y,dimGrid_col_shared.z);
printf("number of Thread_shared: %d x %d x %d \n",dimBlock_col_shared.x,dimBlock_col_shared.y,dimBlock_col_shared.z);
clock_t t_gpu;
t_gpu = clock();
cudaEventRecord(start, 0);
coalescedMultiply<<<dimGrid_col_shared, dimBlock_col_shared>>>(d_a, d_c);
err = cudaGetLastError(); // report an invalid launch configuration instead of failing silently
if (err != cudaSuccess)
std::cout<<"kernel launch failed: "<<cudaGetErrorString(err)<<std::endl;
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
cudaEventDestroy(start);
cudaEventDestroy(stop);
cout << " \n Cuda Time - Transposed mm: " << time << "ms\n";
t_gpu=clock()-t_gpu;
printf (" \n It took me %d clicks (%f seconds) in GPU to perform mult.\n",t_gpu,((float)t_gpu)/CLOCKS_PER_SEC);
std::cout<<"kernal exited"<<std::endl;
getchar();
std::cout<<"mm with colesced shared mem done"<<std::endl;
//getchar();
if(cudaMemcpy(matC_colesced, d_c, bytes_product, cudaMemcpyDeviceToHost)!=cudaSuccess)
{
std::cout<<"cudaMemcpy of the result back to the host failed"<<std::endl;
return 0;
}
std::cout<<"coalesced shared Gpu output \n"<<std::endl;
getchar();
display_matrix_mem (matC_colesced, C_r,C_c);
cudaFree(d_a);
cudaFree(d_c);
free(source);
free(matC_colesced);
getchar();
return 0;
}
void display_matrix_mem (std::complex<double> *A, int row, int col)
{
printf("\n\n");
for (int i = 0; i < row; i++)
{
std::cout<<"row: "<<i<<std::endl;
for (int j = 0; j < col; j++)
{
std::cout<<*(A + i*col + j)<<"\t";
}
printf("\n");
}
}
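To check the kernel on small sizes, I compare against a plain host-side reference. The sketch below is not part of the program above; the helper name atA_reference is my own, and it assumes the same row-major A_r x A_c layout:

// Sketch: CPU reference for C = conj(A)^T * A (row-major storage).
void atA_reference (const std::complex<double> *A, std::complex<double> *C, int rows, int cols)
{
for (int i = 0; i < cols; i++)
for (int j = 0; j < cols; j++)
{
std::complex<double> sum (0.0,0.0);
for (int r = 0; r < rows; r++)
sum += std::conj(A[r*cols + i]) * A[r*cols + j];
C[i*cols + j] = sum;
}
}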