Implementing a matrix to be upscaled horizontally and vertically. For example, 3 channel data is upscaled 4 times as shown below.
My cuda code is working for that.
#include <iostream>
#include <fstream>
#include "ResizeAreaKernel.h"
#define DEBUG
using namespace std;
__global__ void ResizeAreaKernel(float *input, float *output, int upscale, int w, int h, int c, int total)
{
int tid = threadIdx.x + blockIdx.x * blockDim.x;//there are w*h number of threads
int stride = blockDim.x * gridDim.x;
for (int i = tid; i < total; i += stride){
int rowadd= ((int)(i/w)*c*w*upscale*upscale)-((int)(i/w)*w*c*upscale);//(j*3*5*4*4) - (j*5*3*4)
for(int y = 0; y < upscale; y++){
int s=i*c*upscale+rowadd;
int e=s+upscale*c;
for(int x = s; x < e; x=x+c){
for(int c_ = 0; c_ < c; c_++){
output[x+c_+y*c*w*upscale] = input[i*c+c_];
}
}
}
}
return;
}
int ResizeAreaInference(float *input, float *output, int upscale, int w, int h, int c)
{
int N = w*h*c;
const int THREADS_PER_BLOCK = 256;
const int NUMBLOCKS = (int)((float)(N+THREADS_PER_BLOCK-1)/THREADS_PER_BLOCK);
ResizeAreaKernel<<<NUMBLOCKS, THREADS_PER_BLOCK>>>(input, output, upscale, w, h, c, N);
cudaDeviceSynchronize();
return 0;
}
#ifdef DEBUG
void printdata(float *ptr, int size, const char* name, int stride)
{
ofstream myfile;
myfile.open (name);
for(int i=0; i < size; i++){
if(i % stride == 0 && i!=0)
myfile << "\n";
myfile << *(ptr+i) << ",";
}
myfile.close();
return;
}
int main(void)
{
int w = 4;
int h = 3;
int c = 3;
int upscale = 4;
float *in, *out;
// Allocate Unified Memory – accessible from CPU or GPU
cudaMallocManaged(&in, h*w*c*sizeof(float));
cudaMallocManaged(&out, 10*h*upscale*w*upscale*c*sizeof(float));
// initialize x and y arrays on the host
for (int i = 0; i < h*w*c; i++) {
in[i] = rand() % 100;
}
printdata(in, w*h*c, "input.txt",w*c);
ResizeAreaInference(in, out, upscale, w, h, c);
// Check for errors (all values should be 3.0f)
printdata(out, w*upscale*h*upscale*c, "output.txt", w*upscale*c);
// Free memory
cudaFree(in);
cudaFree(out);
return 0;
}
#endif
But I need to take more space in memory for output buffer (now is 10 times)
cudaMallocManaged(&out, 10*h*upscale*w*upscale*c*sizeof(float));
I need h*upscale*w*upscale*c*sizeof(float)
this much of memory for output, but if I don't take extra space, I have
Bus error (core dumped)
What could be problem?
Aucun commentaire:
Enregistrer un commentaire