I played around a little with the experimental device lambdas that were introduced in CUDA 7.5 and promoted in this blog post by Mark Harris.
In the following example I removed a lot of things that are not needed to show my problem (my actual implementation looks a bit nicer...).
I tried to write a foreach function that operates either on vectors on the device (1 thread per element) or on the host (serially), depending on a template parameter. With this foreach function I can easily implement BLAS functions. As an example, I use assigning a scalar to each component of a vector (the complete code is appended at the end):
template<bool onDevice>
void assignScalar( size_t size, double* vector, double a )
{
    auto assign = [=] __host__ __device__ ( size_t index ) { vector[index] = a; };

    if( onDevice )
    {
        foreachDevice( size, assign );
    }
    else
    {
        foreachHost( size, assign );
    }
}
However, this code gives a compiler error because of the __host__ __device__ lambda:
The closure type for a lambda ("lambda ->void") cannot be used in the template argument type of a __global__ function template instantiation, unless the lambda is defined within a __device__ or __global__ function
I get the same error if I remove the __device__ from the lambda expression, and I get no compile error if I remove __host__ (only a __device__ lambda), but in that case the host part is not executed...
If I define the lambda as either __host__ or __device__ separately, the code compiles and works as expected:
template<bool onDevice>
void assignScalar2( size_t size, double* vector, double a )
{
    if( onDevice )
    {
        auto assign = [=] __device__ ( size_t index ) { vector[index] = a; };
        foreachDevice( size, assign );
    }
    else
    {
        auto assign = [=] __host__ ( size_t index ) { vector[index] = a; };
        foreachHost( size, assign );
    }
}
However, this results in code duplication and actually defeats the whole purpose of using lambdas for this example.
Is there a way to accomplish what I want to do, or is this a bug in the experimental feature? Actually, defining a __host__ __device__ lambda is explicitly mentioned in the first example in the programming guide. Even for that simpler example (just returning a constant value from the lambda), I could not find a way to use the lambda expression on both host and device.
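For reference, this is the kind of minimal test I mean (the kernel and the names are mine, modeled on how I read the programming guide's snippet); it runs into the same error:

#include <cstdio>

template<typename F>
__global__ void kernel_eval( int* out, F f )
{
    // evaluate the lambda on the device and store the result
    *out = f();
}

void testConstantLambda()
{
    // a __host__ __device__ lambda that just returns a constant
    auto seven = [] __host__ __device__ () -> int { return 7; };

    printf( "on the host: %d\n", seven() );  // calling it on the host works on its own

    int* dResult;
    cudaMalloc( &dResult, sizeof(int) );
    // device use: rejected with the same "closure type ... __global__
    // function template instantiation" error in CUDA 7.5
    kernel_eval<<<1,1>>>( dResult, seven );
}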
Here is the complete code, compile with the options -std=c++11 --expt-extended-lambda:
#include <iostream>
using namespace std;

template<typename Operation>
void foreachHost( size_t size, Operation o )
{
    for( size_t i = 0; i < size; ++i )
    {
        o( i );
    }
}

template<typename Operation>
__global__ void kernel_foreach( Operation o )
{
    size_t index = blockIdx.x * blockDim.x + threadIdx.x;
    o( index );
}

template<typename Operation>
void foreachDevice( size_t size, Operation o )
{
    size_t blocksize = 32;
    size_t gridsize = size/32;
    kernel_foreach<<<gridsize,blocksize>>>( o );
}

__global__ void printFirstElementOnDevice( double* vector )
{
    printf( "dVector[0] = %f\n", vector[0] );
}

void assignScalarHost( size_t size, double* vector, double a )
{
    auto assign = [=] ( size_t index ) { vector[index] = a; };
    foreachHost( size, assign );
}

void assignScalarDevice( size_t size, double* vector, double a )
{
    auto assign = [=] __device__ ( size_t index ) { vector[index] = a; };
    foreachDevice( size, assign );
}

// compile error:
template<bool onDevice>
void assignScalar( size_t size, double* vector, double a )
{
    auto assign = [=] __host__ __device__ ( size_t index ) { vector[index] = a; };

    if( onDevice )
    {
        foreachDevice( size, assign );
    }
    else
    {
        foreachHost( size, assign );
    }
}

// works:
template<bool onDevice>
void assignScalar2( size_t size, double* vector, double a )
{
    if( onDevice )
    {
        auto assign = [=] __device__ ( size_t index ) { vector[index] = a; };
        foreachDevice( size, assign );
    }
    else
    {
        auto assign = [=] __host__ ( size_t index ) { vector[index] = a; };
        foreachHost( size, assign );
    }
}

int main()
{
    size_t SIZE = 32;

    double* hVector = new double[SIZE];
    double* dVector;
    cudaMalloc( &dVector, SIZE*sizeof(double) );

    // clear memory
    for( size_t i = 0; i < SIZE; ++i )
    {
        hVector[i] = 0;
    }
    cudaMemcpy( dVector, hVector, SIZE*sizeof(double), cudaMemcpyHostToDevice );

    assignScalarHost( SIZE, hVector, 1.0 );
    cout << "hVector[0] = " << hVector[0] << endl;

    assignScalarDevice( SIZE, dVector, 2.0 );
    printFirstElementOnDevice<<<1,1>>>( dVector );
    cudaDeviceSynchronize();

    assignScalar2<false>( SIZE, hVector, 3.0 );
    cout << "hVector[0] = " << hVector[0] << endl;

    assignScalar2<true>( SIZE, dVector, 4.0 );
    printFirstElementOnDevice<<<1,1>>>( dVector );
    cudaDeviceSynchronize();

    // assignScalar<false>( SIZE, hVector, 5.0 );
    // cout << "hVector[0] = " << hVector[0] << endl;
    //
    // assignScalar<true>( SIZE, dVector, 6.0 );
    // printFirstElementOnDevice<<<1,1>>>( dVector );
    // cudaDeviceSynchronize();

    cudaError_t error = cudaGetLastError();
    if( error != cudaSuccess )
    {
        cout << "ERROR: " << cudaGetErrorString(error);
    }
}
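For context, this is the kind of BLAS function I want to build on top of foreach once a single __host__ __device__ lambda works (an axpy sketch reusing foreachHost/foreachDevice from above; the name and signature are mine, and it would currently hit the same compile error):

// hypothetical axpy: y[i] += a * x[i], on host or device
// depending on the template parameter
template<bool onDevice>
void axpy( size_t size, double a, const double* x, double* y )
{
    auto op = [=] __host__ __device__ ( size_t index ) { y[index] += a * x[index]; };

    if( onDevice )
    {
        foreachDevice( size, op );
    }
    else
    {
        foreachHost( size, op );
    }
}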
I used the production release of CUDA 7.5.
Update
I tried this third version of the assignScalar function:
template<bool onDevice>
void assignScalar3( size_t size, double* vector, double a )
{
#ifdef __CUDA_ARCH__
#define LAMBDA_HOST_DEVICE __device__
#else
#define LAMBDA_HOST_DEVICE __host__
#endif
    auto assign = [=] LAMBDA_HOST_DEVICE ( size_t index ) { vector[index] = a; };

    if( onDevice )
    {
        foreachDevice( size, assign );
    }
    else
    {
        foreachHost( size, assign );
    }
}
It compiles and runs without errors, but the device version ( assignScalar3<true> ) fails. Actually, I thought that __CUDA_ARCH__ would always be undefined (since the function is not __device__ ), but I checked explicitly that there is a compile path where it is defined.
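One way to make that visible (a minimal probe; the #error line is mine, not part of the code above): placed anywhere at file scope in the .cu file, it makes nvcc abort during the device compilation pass, showing that __CUDA_ARCH__ is defined in that pass even though no __device__ function is involved.

// probe: fires in the device compilation pass of nvcc
#ifdef __CUDA_ARCH__
#error "__CUDA_ARCH__ is defined in this compilation pass"
#endif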