lundi 30 mai 2016

Why does this class member variable not change when calling a CUDA kernel function?

In a simple test CUDA application, I have a pointer pointing to a list of class instances, and I copy that data to the GPU. I then run a kernel function many times. The kernel function then calls a __device__ member function for each class instance which increments a variable, profitLoss.

For some reason, profitLoss is not incrementing. Here is the code I have:

#include <stdio.h>
#include <stdlib.h>

#define N 200000

class Strategy {
    private:
        double profitLoss;

    public:
        __device__ __host__ Strategy() {
            this->profitLoss = 0;
        }
        __device__ __host__ void backtest() {
            this->profitLoss++;
        }
        __device__ __host__ double getProfitLoss() {
            return this->profitLoss;
        }
};

__global__ void backtestStrategies(Strategy *strategies) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    if (i < N) {
        strategies[i].backtest();
    }
}

int main() {
    int threadsPerBlock = 1024;
    int blockCount = 32;

    Strategy *devStrategies;
    Strategy *strategies = (Strategy*)malloc(N * sizeof(Strategy));
    double *data = (double*)malloc(1000 * sizeof(double));
    double *devData;
    int i = 0;

    cudaSetDevice(0);

    // Allocate memory for strategies on the GPU.
    cudaMalloc((void**)&devStrategies, N * sizeof(Strategy));
    cudaMalloc((void**)&devData, 1000 * sizeof(double));

    // Initialize strategies on host.
    for (i=0; i<N; i++) {
        strategies[i] = Strategy();
    }

    // Copy strategies from host to GPU.
    cudaMemcpy(devStrategies, strategies, N * sizeof(Strategy), cudaMemcpyHostToDevice);

    for (i=0; i<363598; i++) {
        backtestStrategies<<<blockCount, threadsPerBlock>>>(devStrategies);
    }

    // Copy strategies from the GPU.
    cudaMemcpy(strategies, devStrategies, N * sizeof(Strategy), cudaMemcpyDeviceToHost);
    cudaMemcpy(data, devData, 1000 * sizeof(double), cudaMemcpyDeviceToHost);

    // Display results.
    for (i=0; i<N; i++) {
        printf("%f\n", strategies[i].getProfitLoss());
    }

    // Free memory for the strategies on the GPU.
    cudaFree(devStrategies);

    return 0;
}

The output is as follows:

0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
...

I would expect it to be:

363597.000000
363597.000000
363597.000000
363597.000000
363597.000000
363597.000000
363597.000000
363597.000000
...

I believe profitLoss is not incrementing due to the way I have initialized the objects (automatic storage duration), and I'm not sure of a better way to instantiate these objects and cudaMemcpy them over to the GPU:

strategies[i] = Strategy();

Can anyone offer any suggestions on how to fix this issue or what might be the cause? Thank you in advance!

Aucun commentaire:

Enregistrer un commentaire