I am trying to add two array concurrently with CUDA but I've face some fundamental questions.
The code is:
#include <iostream>
#include <math.h>
#include <iomanip>
#define N (2048*2048)
#define THREADS_PER_BLOCK 512
// CUDA Kernel function to add the elements of two arrays on the GPU
__global__
void add(int size, float *d_in_x, float *d_in_y , float *d_out)
{
int index = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
printf("Hello from block %d, thread %d\n", blockIdx.x, threadIdx.x);
printf("gridDim : %d \n", gridDim.x);
for (int i = index; i < size; i=i+stride)
d_out[i] = d_in_x[i] + d_in_y[i];
//std::cout << "Value for d_out[" << t << "] : " << d_out[t] << "\n";
__syncthreads();
}
int main(void){
/**/
int size = 5;
float *x, *y,*out;
float *d_in_x,*d_in_y, *d_out;// device copies of a, b, c
// Allocate Unified Memory – accessible from CPU or GPU
cudaMallocManaged(&x, size*sizeof(float));
cudaMallocManaged(&y, size*sizeof(float));
cudaMallocManaged(&out, size*sizeof(float));
// initialize x and y arrays on the host
for (int i = 0; i < size; i++) {
x[i] = 1.1000f;
y[i] = 2.1000f;
out[i] = 0.000f;
}
// Alloc space for device copies
cudaMalloc((void **)&d_in_x, size);
cudaMalloc((void **)&d_in_y, size);
cudaMalloc((void **)&d_out, size);
// Copy to device
cudaMemcpy(d_in_x, x, size, cudaMemcpyHostToDevice);
cudaMemcpy(d_in_y, y, size, cudaMemcpyHostToDevice);
cudaMemcpy(d_out, out, size, cudaMemcpyHostToDevice);
/////////////////////////////////////////////////////////////
add<<<1,4>>>(size, d_in_x,d_in_y, d_out);
cudaMemcpy(out, d_out, size, cudaMemcpyDeviceToHost);
//std::cout.precision(4);
for (int i = 0; i < size; i++) {
std::cout << "Value for out[" << i << "] : " << out[i] << "\n";
}
std::cout << "\n";
// Free memory
cudaFree(d_in_x);
cudaFree(d_in_y);
cudaFree(d_out);
cudaFree(x);
cudaFree(y);
cudaFree(out);
return 0;
}
When i run it with add<<<1,4>>> i get :
Hello from block 0, thread 0
Hello from block 0, thread 1
Hello from block 0, thread 2
Hello from block 0, thread 3
gridDim : 1
gridDim : 1
gridDim : 1
gridDim : 1
Value for out[0] : 3.2
Value for out[1] : 7.14662e-44
Value for out[2] : 0
Value for out[3] : 0
Value for out[4] : 0
and when i run it with add<<<1,5>>> i get :
Hello from block 0, thread 0
Hello from block 0, thread 1
Hello from block 0, thread 2
Hello from block 0, thread 3
Hello from block 0, thread 4
gridDim : 1
gridDim : 1
gridDim : 1
gridDim : 1
gridDim : 1
Value for out[0] : 3.2
Value for out[1] : 2.87266e-43
Value for out[2] : 0
Value for out[3] : 0
Value for out[4] : 0
My main question is :
1- Why The program zeroed out 40 percent of out array ? shouldn't it add each d_in_x[i]
and d_in_y[i]
and put it in d_out[i]
? why it seems to add first two array elements only? The array elements are the same but why addition result are not?
2 - Is there any way that i uncomment std::cout << "Value for d_out[" << t << "] : " << d_out[t] << "\n";
line inside __global function loop?
Aucun commentaire:
Enregistrer un commentaire