I have made a cuda code for batch processing. The code should work as shown in the images below.
The first one is input and the second one is output from cuda code. The input data shows batch=1, channels=3, width=2 and height=3.
It works well when b=1. Memory check (cuda-memcheck ./ResizeAreaKernel) also no errors.
When I increase batch=2, output is still ok, but memory check has some errors.
When batch size is increased to 8, batchchannelwidth*height > blockSize, there is error in output.
What could be wrong with cuda code?
#include <iostream>
#include <fstream>
using namespace std;
void printdata(float *ptr, int size, const char* name, int stride, int p_stride,int b_stride)
{
ofstream myfile;
myfile.open (name);
for(int i=0; i < size; i++){
if(i % b_stride == 0 && i!=0)
myfile << "\n**********************************************";
if(i % stride == 0 && i!=0)
myfile << "\n";
if(i % p_stride == 0 && i!=0)
myfile << "\n\n";
myfile << *(ptr+i) << ",";
}
myfile.close();
return;
}
void printdataout(float *ptr, int size, const char* name, int stride, int b_stride)
{
ofstream myfile;
myfile.open (name);
for(int i=0; i < size; i++){
if(i % b_stride == 0 && i!=0)
myfile << "\n********************************************";
if(i % stride == 0 && i!=0)
myfile << "\n";
myfile << *(ptr+i) << ",";
}
myfile.close();
return;
}
template <unsigned nthdsPerCTA>
__launch_bounds__(nthdsPerCTA)
__global__ void ResizeAreaKernel(const float *input, float *output, int upscale, int w, int h, int total, int wxh, int c, int bsize)
{
for (int i = blockIdx.x * nthdsPerCTA + threadIdx.x; i < total; i += gridDim.x * nthdsPerCTA){
int bin = i/bsize;
int yin = i/w;//get row number
int xin = i%w;//get index in a row
int cin = i/wxh;//get channel
int yout=yin%h;
int cout=cin%c;
int out=xin*upscale*c+yout*c*w*upscale*upscale+bin*c*w*h*upscale*upscale;
for(int y = 0; y < upscale; y++){//copied for another three successive rows
for(int x = out; x < (out+(upscale*c)); x=x+c){//output is copied 3 times in a row for successive columns
output[x+(y*upscale*w*c)+cout] = input[i];//when one thread comes in for one input pixel
}
}
}
return;
}
int ResizeAreaInference(const void* inputs, void* outputs, int upscale, int w, int h, int c, int b)
{
int n = (int)(w*h*c*b);
const int blockSize = 128;
const int gridSize = (n + blockSize - 1) / blockSize;
ResizeAreaKernel<blockSize><<<gridSize, blockSize, 0>>>(static_cast<const float*>(inputs), static_cast<float*>(outputs), upscale, w, h, n,w*h,c,w*h*c);
cudaDeviceSynchronize();
return 0;
}
int main(void)
{
int w = 2;
int h = 3;
int c = 3;
int batch = 2;
int upscale = 4;
float *in, *out;
// Allocate Unified Memory – accessible from CPU or GPU
cudaMallocManaged(&in, h*w*c*sizeof(float));
cudaMallocManaged(&out, h*upscale*w*upscale*c*sizeof(float));
// initialize x on the host
for (int i = 0; i < h*w*c*batch; i++) {
in[i] = rand() % 5;// v1 in the range 0 to 4
}
printdata(in, w*h*c*batch, "input.txt",w, w*h, w*h*c);
ResizeAreaInference(in, out, upscale, w, h, c, batch);
// Check for errors (all values should be 3.0f)
printdataout(out, w*upscale*h*upscale*c*batch, "output.txt", w*upscale*c,w*upscale*upscale*c*h);
// Free memory
cudaFree(in);
cudaFree(out);
return 0;
}


Aucun commentaire:
Enregistrer un commentaire