vendredi 5 octobre 2018

How to calculate appropriate BLOCK and GRID size and size of shared memory in CUDA?

Hello everyone, this is my first question on Stack Overflow. I am trying to implement the Aᵀ·A operation (for complex input, the conjugate transpose of A multiplied by A) in CUDA.

Algorithm:

  • I have used the tiled matrix-multiplication algorithm, staging the tiles in shared memory.
  • The code takes a single input matrix of dimensions (rows x cols) = (A_r x A_c).
  • The data type is complex<double>.
  • I am using a GTX 780 GPU.

PROBLEM: This program runs properly for small matrices but crashes if you use very large matrices as input.

QUESTION:

  • What values of TILE_WIDTH, grid size and block size should I use to run this code for an input matrix of size (11684 x 48) or larger?
  • What changes should I make to decrease the computation time of this program?
  • I want to make this code parameterizable, so that when I enter the matrix dimensions it automatically calculates the block and grid sizes.

I am unable to fix the problem in this code. Please help me with this; I have been stuck on it for a very long time.

//(At*A) in GPU using shared memory 

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <stdlib.h>
#include <cuda.h>
#include <assert.h>
#include <complex>
#include <cuComplex.h>
#include <stdio.h>
#include <ctime>

#include <conio.h>
#include <fstream>
#include <sstream> 

// Number of columns of the input matrix A.
#define A_c (48)
// Number of rows of the input matrix A.
#define A_r (780)
// The product matrix C has A_c columns and A_c rows.
#define C_c (A_c)
#define C_r (A_c)
// Total number of elements in A.
#define N (A_c*A_r)

// Side length of the square shared-memory tiles; the thread block is launched as TILE_WIDTH x TILE_WIDTH.
#define TILE_WIDTH (16)
using namespace std;

// Prints a row x col matrix stored contiguously row by row (defined after main).
void display_matrix_mem  (std::complex<double> *A, int row, int col);

// Computes C = A^H * A (conjugate transpose of A times A) with shared-memory tiling.
//
// a : device pointer to A, A_r x A_c, row-major (element (r, col) is a[r*A_c + col]).
// c : device pointer to C, C_r x C_c, row-major; every in-range element is overwritten.
//
// Launch requirements:
//   - blockDim must be (TILE_WIDTH, TILE_WIDTH, 1);
//   - the grid must provide at least ceil(C_c/TILE_WIDTH) blocks in x and
//     ceil(C_r/TILE_WIDTH) blocks in y; threads outside C write nothing.
// Static shared memory: two TILE_WIDTH x TILE_WIDTH tiles of cuDoubleComplex.
__global__ void coalescedMultiply(cuDoubleComplex *a, cuDoubleComplex *c){
    __shared__ cuDoubleComplex ds_M[TILE_WIDTH][TILE_WIDTH];
    __shared__ cuDoubleComplex ds_N[TILE_WIDTH][TILE_WIDTH];

    const int tx = threadIdx.x;
    const int ty = threadIdx.y;
    const int bx = blockIdx.x;
    const int by = blockIdx.y;

    const int Row1 = by * TILE_WIDTH + ty;  // row of C produced by this thread
    const int Row  = by * TILE_WIDTH + tx;  // column of A this thread stages into ds_M
    const int Col  = bx * TILE_WIDTH + tx;  // column of C, and column of A staged into ds_N

    const cuDoubleComplex zero = make_cuDoubleComplex(0.0, 0.0);
    cuDoubleComplex Pvalue = zero;

    // The reduction runs over the ROWS of A, so the tile count must come from A_r.
    // The previous bound, (TILE_WIDTH*A_c-1)/TILE_WIDTH+1, evaluates to A_c and
    // silently dropped every row of A beyond TILE_WIDTH*A_c.
    const int numTiles = (A_r + TILE_WIDTH - 1) / TILE_WIDTH;

    for (int m = 0; m < numTiles; ++m)
    {
        const int aRow = m * TILE_WIDTH + ty;            // row of A handled by this thread in tile m
        const bool rowInRange = aRow < A_r;
        const size_t rowOffset = (size_t)aRow * A_c;     // size_t: safe for large A_r*A_c

        // ds_M is stored transposed: ds_M[i][k] = A[m*TILE_WIDTH + k][by*TILE_WIDTH + i].
        if (rowInRange && Row < A_c)
        {
            ds_M[tx][ty] = a[rowOffset + Row];
        }
        else
        {
            ds_M[tx][ty] = zero;  // zero padding contributes nothing to the sum
        }

        // ds_N[k][j] = A[m*TILE_WIDTH + k][bx*TILE_WIDTH + j].
        if (rowInRange && Col < A_c)
        {
            ds_N[ty][tx] = a[rowOffset + Col];
        }
        else
        {
            ds_N[ty][tx] = zero;
        }
        // Both tiles must be fully written before any thread reads them.
        __syncthreads();

        for (int k = 0; k < TILE_WIDTH; ++k)
        {
            // conj(A[k][Row1]) * A[k][Col], accumulated over the tile's rows k.
            Pvalue = cuCadd(cuCmul(cuConj(ds_M[ty][k]), ds_N[k][tx]), Pvalue);
        }
        // Do not let the next iteration overwrite tiles that are still being read.
        __syncthreads();
    }

    if (Row1 < C_r && Col < C_c) {
        c[(size_t)Row1 * C_c + Col] = Pvalue;
    }
}


// Reports a failed CUDA runtime call; returns true when `status` is cudaSuccess.
// On failure prints the program's "Nope" marker on stdout and the CUDA error text on stderr.
static bool cudaOk(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
        return true;
    std::cout<<"Nope"<<std::endl;
    fprintf(stderr, "CUDA failure in %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Uploads A, runs coalescedMultiply, times it and downloads C into h_dst.
// Releases every device resource it created on all paths; returns false on any CUDA error.
static bool runOnDevice(const std::complex<double> *h_src, std::complex<double> *h_dst,
                        size_t bytes_source, size_t bytes_product)
{
    cuDoubleComplex *d_a = NULL;
    cuDoubleComplex *d_c = NULL;
    cudaEvent_t start = NULL;
    cudaEvent_t stop = NULL;
    float elapsed_ms = 0.0f;

    std::cout<<"memory access has stareted"<<std::endl;

    // d_c is not pre-loaded from the host: the kernel overwrites every element of C.
    bool ok = cudaOk(cudaMalloc((void**)&d_a, bytes_source), "cudaMalloc(d_a)")
           && cudaOk(cudaMalloc((void**)&d_c, bytes_product), "cudaMalloc(d_c)")
           && cudaOk(cudaMemcpy(d_a, h_src, bytes_source, cudaMemcpyHostToDevice), "cudaMemcpy(host -> d_a)")
           && cudaOk(cudaEventCreate(&start), "cudaEventCreate(start)")
           && cudaOk(cudaEventCreate(&stop), "cudaEventCreate(stop)");

    if (ok)
    {
        //Kernal Dimensions and call
        // One thread per element of C, so the grid is sized from C (C_c x C_r), not from
        // the row count of A; sizing x from A_r only launched blocks with nothing to write.
        dim3 dimGrid_col_shared((C_c-1)/TILE_WIDTH+1, (C_r-1)/TILE_WIDTH+1, 1);
        dim3 dimBlock_col_shared(TILE_WIDTH, TILE_WIDTH, 1);

        printf("number of blocks_shared: %u x %u x %u \n",dimGrid_col_shared.x,dimGrid_col_shared.y,dimGrid_col_shared.z);
        printf("number of Thread_shared: %u x %u x %u \n",dimBlock_col_shared.x,dimBlock_col_shared.y,dimBlock_col_shared.z);

        clock_t t_gpu = clock();
        ok = cudaOk(cudaEventRecord(start, 0), "cudaEventRecord(start)");
        if (ok)
        {
            coalescedMultiply<<<dimGrid_col_shared, dimBlock_col_shared>>>(d_a, d_c);

            // A launch reports configuration errors only through cudaGetLastError();
            // faults during execution surface at the following synchronisation.
            ok = cudaOk(cudaGetLastError(), "coalescedMultiply launch")
              && cudaOk(cudaEventRecord(stop, 0), "cudaEventRecord(stop)")
              && cudaOk(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)")
              && cudaOk(cudaEventElapsedTime(&elapsed_ms, start, stop), "cudaEventElapsedTime");
        }

        if (ok)
        {
            cout << " \n Cuda Time - Transposed mm: " << elapsed_ms << "ms\n";

            t_gpu = clock() - t_gpu;
            // clock_t is not guaranteed to be int; print it through an explicit long.
            printf (" \n It took me %ld clicks (%f seconds) in GPU to perform mult.\n",(long)t_gpu,((float)t_gpu)/CLOCKS_PER_SEC);

            std::cout<<"kernal exited"<<std::endl;

            getchar();

            std::cout<<"mm with colesced shared mem done"<<std::endl;

            ok = cudaOk(cudaMemcpy(h_dst, d_c, bytes_product, cudaMemcpyDeviceToHost), "cudaMemcpy(d_c -> host)");
        }
    }

    // Cleanup runs on success and on every failure path.
    if (start != NULL) cudaEventDestroy(start);
    if (stop != NULL) cudaEventDestroy(stop);
    cudaFree(d_a);  // cudaFree(NULL) is a no-op
    cudaFree(d_c);
    return ok;
}

// Builds a test matrix A (A_r x A_c) filled with (1,1), (2,2), ... in row-major order,
// computes C = A^H * A on the GPU and prints C.
// Returns 0 on success and EXIT_FAILURE when a host allocation or any CUDA call fails.
int main(int argc,char** argv)
{
    (void)argc;
    (void)argv;

    // size_t so the byte counts cannot overflow int for large matrices.
    const size_t size_source = (size_t)A_r * A_c;
    const size_t size_product = (size_t)C_r * C_c;
    const size_t bytes_source = size_source * sizeof(std::complex<double>);
    const size_t bytes_product = size_product * sizeof(std::complex<double>);

    std::complex<double> *source = (std::complex<double> *)malloc(bytes_source);
    std::complex<double> *matC_colesced = (std::complex<double> *)malloc(bytes_product);
    if (source == NULL || matC_colesced == NULL)
    {
        fprintf(stderr, "host allocation failed\n");
        free(source);
        free(matC_colesced);
        return EXIT_FAILURE;
    }

    for (size_t i = 0; i < size_source; i++) {
        source[i] = std::complex<double>((double)(i + 1), (double)(i + 1));
    }

    // Preview at most 20 elements; never read past the end of a smaller matrix.
    const size_t preview = size_source < 20 ? size_source : 20;
    for (size_t i = 0; i < preview; i++) {
        cout << source[i];
    }

    std::cout<<"press enter to start multiplication procedure"<<std::endl<<std::endl;
    getchar();

    const bool ok = runOnDevice(source, matC_colesced, bytes_source, bytes_product);

    if (ok)
    {
        std::cout<<"coalesced shared Gpu output \n"<<std::endl;
        getchar();
        display_matrix_mem(matC_colesced, C_r, C_c);
    }

    free(source);
    free(matC_colesced);

    if (!ok)
        return EXIT_FAILURE;

    getchar();
    return 0;
}
// Prints a matrix held contiguously row by row.
//   A   : pointer to the first element; element (i, j) is A[i*col + j]
//   row : number of rows to print
//   col : number of elements per row
// Each row is introduced by a "row: <index>" line, followed by its elements
// separated by tabs.
void display_matrix_mem  (std::complex<double> *A, int row, int col)
{
    printf("\n\n");
    for (int r = 0; r < row; ++r)
    {
        std::cout << "row: " << r << std::endl;

        std::complex<double> *rowStart = A + r * col;
        int c = 0;
        while (c < col)
        {
            std::cout << rowStart[c] << "\t";
            ++c;
        }
        printf("\n");
    }
}

Aucun commentaire:

Enregistrer un commentaire