mercredi 26 septembre 2018

Why CUDA return diffrent result when changing dimentions?

I am trying to add two array concurrently with CUDA but I've face some fundamental questions.

The code is:

#include <iostream>
#include <math.h>
#include <iomanip>


#define N (2048*2048)
#define THREADS_PER_BLOCK 512

// CUDA Kernel function to add the elements of two arrays on the GPU
__global__ 
void add(int size, float *d_in_x, float *d_in_y , float *d_out)
{
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;
    printf("Hello from block %d, thread %d\n", blockIdx.x, threadIdx.x);

    printf("gridDim : %d \n", gridDim.x);

    for (int i = index; i < size; i=i+stride)

        d_out[i] = d_in_x[i] + d_in_y[i];
        //std::cout << "Value for d_out[" << t << "] : " << d_out[t] << "\n";

    __syncthreads();
}


int main(void){

/**/
    int size = 5;
    float *x, *y,*out;
    float *d_in_x,*d_in_y, *d_out;// device copies of a, b, c

    // Allocate Unified Memory – accessible from CPU or GPU
    cudaMallocManaged(&x, size*sizeof(float));
    cudaMallocManaged(&y, size*sizeof(float));
    cudaMallocManaged(&out, size*sizeof(float));

    // initialize x and y arrays on the host
    for (int i = 0; i < size; i++) {
        x[i] = 1.1000f;
        y[i] = 2.1000f;
        out[i] = 0.000f;
    }

    // Alloc space for device copies
    cudaMalloc((void **)&d_in_x, size);
    cudaMalloc((void **)&d_in_y, size);
    cudaMalloc((void **)&d_out, size);

    // Copy to device
    cudaMemcpy(d_in_x, x, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_in_y, y, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_out, out, size, cudaMemcpyHostToDevice);

/////////////////////////////////////////////////////////////
    add<<<1,4>>>(size, d_in_x,d_in_y, d_out);

    cudaMemcpy(out, d_out, size, cudaMemcpyDeviceToHost);


    //std::cout.precision(4);

    for (int i = 0; i < size; i++) {
        std::cout << "Value for out[" << i << "] : " << out[i] << "\n";
    }

    std::cout << "\n";

    // Free memory
    cudaFree(d_in_x);
    cudaFree(d_in_y);
    cudaFree(d_out);

    cudaFree(x);
    cudaFree(y);
    cudaFree(out);

    return 0;
}

When i run it with add<<<1,4>>> i get :

Hello from block 0, thread 0
Hello from block 0, thread 1
Hello from block 0, thread 2
Hello from block 0, thread 3
gridDim : 1 
gridDim : 1 
gridDim : 1 
gridDim : 1 
Value for out[0] : 3.2
Value for out[1] : 7.14662e-44
Value for out[2] : 0
Value for out[3] : 0
Value for out[4] : 0

and when i run it with add<<<1,5>>> i get :

Hello from block 0, thread 0
Hello from block 0, thread 1
Hello from block 0, thread 2
Hello from block 0, thread 3
Hello from block 0, thread 4
gridDim : 1 
gridDim : 1 
gridDim : 1 
gridDim : 1 
gridDim : 1 
Value for out[0] : 3.2
Value for out[1] : 2.87266e-43
Value for out[2] : 0
Value for out[3] : 0
Value for out[4] : 0

My main question is :

1- Why The program zeroed out 40 percent of out array ? shouldn't it add each d_in_x[i] and d_in_y[i] and put it in d_out[i] ? why it seems to add first two array elements only? The array elements are the same but why addition result are not?

2 - Is there any way that i uncomment std::cout << "Value for d_out[" << t << "] : " << d_out[t] << "\n"; line inside __global function loop?

Aucun commentaire:

Enregistrer un commentaire