mercredi 11 septembre 2019

Cuda code for batch processing of 4D array (batch, channel, width, height) issue

I have made a cuda code for batch processing. The code should work as shown in the images below.

enter image description here enter image description here

The first one is input and the second one is output from cuda code. The input data shows batch=1, channels=3, width=2 and height=3.

It works well when b=1. Memory check (cuda-memcheck ./ResizeAreaKernel) also no errors.

When I increase batch=2, output is still ok, but memory check has some errors.

When batch size is increased to 8, batchchannelwidth*height > blockSize, there is error in output.

What could be wrong with cuda code?

#include <iostream>
#include <fstream>
using namespace std;
void printdata(float *ptr, int size, const char* name, int stride, int p_stride,int b_stride)
{
   ofstream myfile;
   myfile.open (name);
   for(int i=0; i < size; i++){
      if(i % b_stride == 0 && i!=0)
         myfile << "\n**********************************************";
      if(i % stride == 0 && i!=0)
         myfile << "\n";
      if(i % p_stride == 0 && i!=0)
         myfile << "\n\n";
      myfile << *(ptr+i) << ",";


   }
   myfile.close();
   return;
}

void printdataout(float *ptr, int size, const char* name, int stride, int b_stride)
{
   ofstream myfile;
   myfile.open (name);
   for(int i=0; i < size; i++){
      if(i % b_stride == 0 && i!=0)
         myfile << "\n********************************************";
      if(i % stride == 0 && i!=0)
         myfile << "\n";
      myfile << *(ptr+i) << ",";


   }
   myfile.close();
   return;
}

template <unsigned nthdsPerCTA>
__launch_bounds__(nthdsPerCTA)
__global__ void ResizeAreaKernel(const float *input, float *output, int upscale, int w, int h, int total, int wxh, int c, int bsize) 
{
    for (int i = blockIdx.x * nthdsPerCTA + threadIdx.x; i < total; i += gridDim.x * nthdsPerCTA){
       int bin = i/bsize;
       int yin = i/w;//get row number
       int xin = i%w;//get index in a row
       int cin = i/wxh;//get channel 

       int yout=yin%h;
       int cout=cin%c;
       int out=xin*upscale*c+yout*c*w*upscale*upscale+bin*c*w*h*upscale*upscale;
       for(int y = 0; y < upscale; y++){//copied for another three successive rows  
          for(int x = out; x < (out+(upscale*c)); x=x+c){//output is copied 3 times in a row for successive columns
            output[x+(y*upscale*w*c)+cout] = input[i];//when one thread comes in for one input pixel
          } 
       }       

    }   

    return;
}


int ResizeAreaInference(const void* inputs, void* outputs, int upscale, int w, int h, int c, int b)
{    
    int n = (int)(w*h*c*b);
    const int blockSize = 128;
    const int gridSize = (n + blockSize - 1) / blockSize; 
    ResizeAreaKernel<blockSize><<<gridSize, blockSize, 0>>>(static_cast<const float*>(inputs), static_cast<float*>(outputs), upscale, w, h, n,w*h,c,w*h*c);
    cudaDeviceSynchronize();
    return 0;
}
int main(void)
{
   int w = 2;
   int h = 3;
   int c = 3;
   int batch = 2;
   int upscale = 4;
   float *in, *out;

  // Allocate Unified Memory – accessible from CPU or GPU
  cudaMallocManaged(&in, h*w*c*sizeof(float));
  cudaMallocManaged(&out, h*upscale*w*upscale*c*sizeof(float));

  // initialize x on the host
  for (int i = 0; i < h*w*c*batch; i++) {
    in[i] = rand() % 5;// v1 in the range 0 to 4
  }
  printdata(in, w*h*c*batch, "input.txt",w, w*h, w*h*c);
  ResizeAreaInference(in, out, upscale, w, h, c, batch);

  // Check for errors (all values should be 3.0f)
  printdataout(out, w*upscale*h*upscale*c*batch, "output.txt", w*upscale*c,w*upscale*upscale*c*h);
  // Free memory
  cudaFree(in);
  cudaFree(out);

  return 0;
}

Aucun commentaire:

Enregistrer un commentaire