I am creating an extended (i.e. __device__
) lambda in CUDA (see e.g. here) and it is supposed to capture a variable (here, a simple double value = 3;
). It compiles, but running it, I get an invalid memory access
error and I don't understand why.
Changing the variable to static const double value = 3
fixes the problem, as it is no longer captured (though I don't understand how it is still available inside the lambda).
Question1: how can I correctly capture host variables in a CUDA extended lambda?
Question2: why is this code not working?
I tried this on Ubuntu 16, both with CUDA 8 and 10.
MWE Code
Compiled with nvcc mwe_lambda.cu -o mwe_lambda --std=c++11 -lineinfo -arch=sm_60 --expt-relaxed-constexpr --expt-extended-lambda
Note in particular the lambda
, which should capture by copy. The managed_allocator
etc. are just in order to use managed memory and print the CUDA error.
#include <cuda.h>
#include <cuda_runtime.h>
#include <vector>
#include <iostream>
#include <string>
static void CudaHandleError( cudaError_t err, const char *file, int line, const std::string & function)
{
if (err != cudaSuccess)
{
std::cerr << std::string(cudaGetErrorString( err )) << " " << file << " " << line << " " << function << std::endl;
}
}
#define CU_HANDLE_ERROR( err ) (CudaHandleError( err, __FILE__, __LINE__, __func__ ))
#define CU_CHECK_ERROR( ) (CudaHandleError( cudaGetLastError(), __FILE__, __LINE__, __func__ ))
#define CU_CHECK_AND_SYNC( ) CU_CHECK_ERROR(); CU_HANDLE_ERROR( cudaDeviceSynchronize() )
template<class T>
class managed_allocator : public std::allocator<T>
{
public:
using value_type = T;
template<typename _Tp1>
struct rebind
{
typedef managed_allocator<_Tp1> other;
};
value_type* allocate(size_t n)
{
value_type* result = nullptr;
CU_HANDLE_ERROR( cudaMallocManaged(&result, n*sizeof(value_type)) );
return result;
}
void deallocate(value_type* ptr, size_t)
{
CU_HANDLE_ERROR( cudaFree(ptr) );
}
managed_allocator() throw(): std::allocator<T>() { } //fprintf(stderr, "Hello managed allocator!\n"); }
managed_allocator(const managed_allocator &a) throw(): std::allocator<T>(a) { }
template <class U>
managed_allocator(const managed_allocator<U> &a) throw(): std::allocator<T>(a) { }
~managed_allocator() throw() { }
};
template<typename T>
using field = std::vector<T, managed_allocator<T>>;
// vf[i] = f()
template<typename A, typename F>
__global__ void cu_set_lambda(A * vf, const F & f, int N)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if(idx < N)
{
vf[idx] = f();
}
}
int main()
{
std::cerr << "started" << std::endl;
{
field<double> vf(10, 0);
double value = 3;
auto lambda = [=] __device__ ()
{
return value;
};
auto n = vf.size();
cu_set_lambda<<<(n+1023)/1024, 1024>>>(vf.data(), lambda, n);
CU_CHECK_AND_SYNC();
std::cerr << vf[0] << " " << vf[1] << std::endl;
}
std::cerr << "finished" << std::endl;
}
Aucun commentaire:
Enregistrer un commentaire