samedi 3 août 2019

Upscaling data in Cuda

Implementing a matrix to be upscaled horizontally and vertically. For example, 3 channel data is upscaled 4 times as shown below.

enter image description here

enter image description here

My cuda code is working for that.

#include <iostream>
#include <fstream>
#include "ResizeAreaKernel.h"
#define DEBUG
using namespace std;
__global__ void ResizeAreaKernel(float *input, float *output, int upscale, int w, int h, int c, int total)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;//there are w*h number of threads
    int stride = blockDim.x * gridDim.x;
    for (int i = tid; i < total; i += stride){
       int rowadd= ((int)(i/w)*c*w*upscale*upscale)-((int)(i/w)*w*c*upscale);//(j*3*5*4*4) - (j*5*3*4)
       for(int y = 0; y < upscale; y++){
          int s=i*c*upscale+rowadd;
          int e=s+upscale*c;
          for(int x = s; x < e; x=x+c){
             for(int c_ = 0; c_ < c; c_++){
                output[x+c_+y*c*w*upscale] = input[i*c+c_];
             } 
          }       
       }
    }
    return;
}


int ResizeAreaInference(float *input, float *output, int upscale, int w, int h, int c)
{  
    int N = w*h*c;    
    const int THREADS_PER_BLOCK = 256;
    const int NUMBLOCKS = (int)((float)(N+THREADS_PER_BLOCK-1)/THREADS_PER_BLOCK); 
    ResizeAreaKernel<<<NUMBLOCKS, THREADS_PER_BLOCK>>>(input, output, upscale, w, h, c, N);
    cudaDeviceSynchronize();
    return 0;
}

#ifdef DEBUG
void printdata(float *ptr, int size, const char* name, int stride)
{
   ofstream myfile;
   myfile.open (name);
   for(int i=0; i < size; i++){
      if(i % stride == 0 && i!=0)
         myfile << "\n";
      myfile << *(ptr+i) << ",";


   }
   myfile.close();
   return;
}

int main(void)
{
   int w = 4;
   int h = 3;
   int c = 3;
   int upscale = 4;
   float *in, *out;

  // Allocate Unified Memory – accessible from CPU or GPU
  cudaMallocManaged(&in, h*w*c*sizeof(float));
  cudaMallocManaged(&out, 10*h*upscale*w*upscale*c*sizeof(float));

  // initialize x and y arrays on the host
  for (int i = 0; i < h*w*c; i++) {
    in[i] = rand() % 100;
  }
  printdata(in, w*h*c, "input.txt",w*c);
  ResizeAreaInference(in, out, upscale, w, h, c);

  // Check for errors (all values should be 3.0f)
  printdata(out, w*upscale*h*upscale*c, "output.txt", w*upscale*c);

  // Free memory
  cudaFree(in);
  cudaFree(out);

  return 0;
}

#endif

But I need to take more space in memory for output buffer (now is 10 times)

cudaMallocManaged(&out, 10*h*upscale*w*upscale*c*sizeof(float));

I need h*upscale*w*upscale*c*sizeof(float) this much of memory for output, but if I don't take extra space, I have

Bus error (core dumped)

What could be problem?

Aucun commentaire:

Enregistrer un commentaire