Monday, October 21, 2019

Memory error when capturing variable in CUDA extended lambda

I am creating an extended (i.e. __device__) lambda in CUDA, and it is supposed to capture a variable (here, a simple double value = 3;). It compiles, but when I run it I get an invalid memory access error, and I don't understand why.

Changing the variable to static const double value = 3; fixes the problem, as the variable is then no longer captured (though I don't understand how it is still available inside the lambda).
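
For reference, this is the variant that runs without the error; only the declaration of value changes (a minimal sketch, with my guess as to why it works in the comment):

// Works: value is now a compile-time constant, so the lambda apparently
// does not need to capture anything from host memory; presumably the
// compiler can substitute the constant's value directly.
static const double value = 3;
auto lambda = [=] __device__ ()
{
    return value;
};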

Question 1: How can I correctly capture host variables in a CUDA extended lambda?

Question 2: Why is this code not working?

I tried this on Ubuntu 16, with both CUDA 8 and CUDA 10.

MWE Code

Compiled with:

nvcc mwe_lambda.cu -o mwe_lambda --std=c++11 -lineinfo -arch=sm_60 --expt-relaxed-constexpr --expt-extended-lambda

Note in particular the lambda, which should capture by copy. The managed_allocator and the error-checking macros are there only to use managed memory and to print any CUDA error.

#include <cuda.h>
#include <cuda_runtime.h>

#include <vector>
#include <iostream>
#include <string>


static void CudaHandleError( cudaError_t err, const char *file, int line, const std::string & function)
{
    if (err != cudaSuccess)
    {
        std::cerr << std::string(cudaGetErrorString( err )) << " " << file << " " << line << " " << function << std::endl;
    }
}

#define CU_HANDLE_ERROR( err ) (CudaHandleError( err, __FILE__, __LINE__, __func__ ))

#define CU_CHECK_ERROR( ) (CudaHandleError( cudaGetLastError(), __FILE__, __LINE__, __func__ ))

#define CU_CHECK_AND_SYNC( ) CU_CHECK_ERROR(); CU_HANDLE_ERROR( cudaDeviceSynchronize() )


template<class T>
class managed_allocator : public std::allocator<T>
{
public:
    using value_type = T;

    template<typename _Tp1>
    struct rebind
    {
        typedef managed_allocator<_Tp1> other;
    };

    value_type* allocate(size_t n)
    {
        value_type* result = nullptr;

        CU_HANDLE_ERROR( cudaMallocManaged(&result, n*sizeof(value_type)) );

        return result;
    }

    void deallocate(value_type* ptr, size_t)
    {
        CU_HANDLE_ERROR( cudaFree(ptr) );
    }

    managed_allocator() throw(): std::allocator<T>() { }
    managed_allocator(const managed_allocator &a) throw(): std::allocator<T>(a) { }
    template <class U>                    
    managed_allocator(const managed_allocator<U> &a) throw(): std::allocator<T>(a) { }
    ~managed_allocator() throw() { }
};

template<typename T>
using field = std::vector<T, managed_allocator<T>>;

// vf[i] = f()
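// note: the functor f is received here by const reference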
template<typename A, typename F>
__global__ void cu_set_lambda(A * vf, const F & f, int N)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if(idx < N)
    {
        vf[idx] = f();
    }
}

int main()
{
    std::cerr << "started" << std::endl;
    {
        field<double> vf(10, 0);

        double value = 3;
        auto lambda = [=] __device__ ()
        {
            return value;
        };

        auto n = vf.size();
        cu_set_lambda<<<(n+1023)/1024, 1024>>>(vf.data(), lambda, n);
        CU_CHECK_AND_SYNC();

        std::cerr << vf[0] << " " << vf[1] << std::endl;
    }
    std::cerr << "finished" << std::endl;
}
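
One possible culprit (an assumption on my part, not something I have verified): the kernel receives the functor as const F & f, and since kernel arguments are copied to the device, a reference parameter would copy only the reference itself, leaving the lambda (with its captured copy of value) in host memory. Below is a sketch of the same kernel taking the functor by value; the _byval name is just for illustration.

// Hypothetical fix: taking f by value copies the functor, together with
// its captured copy of value, into the kernel's parameter space, where
// it is accessible from the device.
template<typename A, typename F>
__global__ void cu_set_lambda_byval(A * vf, F f, int N)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if(idx < N)
    {
        vf[idx] = f();
    }
}

The launch would then be cu_set_lambda_byval<<<(n+1023)/1024, 1024>>>(vf.data(), lambda, n), with no other changes.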
