CUDA 7.5 experimental __host__ __device__ lambdas

I have been playing a little with the experimental device lambdas that were introduced in CUDA 7.5 and promoted by Mark Harris in this blog post.

In the following example I removed a lot of code that is not needed to show my problem (my actual implementation looks a bit nicer ...).

I tried to write a foreach function that operates on vectors either on the device (one thread per element) or on the host (serially), depending on a template parameter. With this foreach function I can easily implement BLAS functions. As an example, I assign a scalar to each component of a vector (the complete code is attached at the end):

    template<bool onDevice>
    void assignScalar( size_t size, double* vector, double a )
    {
        auto assign = [=] __host__ __device__ ( size_t index ) { vector[index] = a; };

        if( onDevice )
        {
            foreachDevice( size, assign );
        }
        else
        {
            foreachHost( size, assign );
        }
    }

However, this code gives a compiler error because of the __host__ __device__ lambda:

The closure type for a lambda ("lambda ->void") cannot be used in the template argument type of a __global__ function template instantiation, unless the lambda is defined within a __device__ or __global__ function

I get the same error if I remove the __device__ from the lambda expression, and I get no compile error if I remove the __host__ instead (leaving a __device__-only lambda), but in that case the host path fails ...

If I define separate __host__ and __device__ lambdas, the code compiles and works as expected.

    template<bool onDevice>
    void assignScalar2( size_t size, double* vector, double a )
    {
        if( onDevice )
        {
            auto assign = [=] __device__ ( size_t index ) { vector[index] = a; };
            foreachDevice( size, assign );
        }
        else
        {
            auto assign = [=] __host__ ( size_t index ) { vector[index] = a; };
            foreachHost( size, assign );
        }
    }

However, this leads to code duplication and actually defeats the whole point of using lambdas for this example.

Is there a way to accomplish what I want, or is this a bug in the experimental feature? In fact, defining a __host__ __device__ lambda is explicitly mentioned in the first example of the programming guide. Even for that simpler example (just returning a constant value from the lambda), I could not find a way to use the lambda expression on both the host and the device.
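For concreteness, here is my own minimal reconstruction of that simpler shape (not the guide's exact listing; the function names are mine). For me, the kernel instantiation in the last-but-one line hits the same closure-type error:

    #include <cstdio>

    template<typename F>
    __global__ void callOnDevice( F f )
    {
        printf( "device: %d\n", f() );
    }

    void tryConstantLambda()
    {
        auto seven = [] __host__ __device__ () { return 7; };
        printf( "host: %d\n", seven() );  // calling on the host is fine
        callOnDevice<<<1,1>>>( seven );   // this instantiation triggers the
                                          // same closure-type compiler error
        cudaDeviceSynchronize();
    }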

Here is the complete code; compile with the options -std=c++11 --expt-extended-lambda :

    #include <iostream>

    using namespace std;

    template<typename Operation>
    void foreachHost( size_t size, Operation o )
    {
        for( size_t i = 0; i < size; ++i )
        {
            o( i );
        }
    }

    template<typename Operation>
    __global__ void kernel_foreach( Operation o )
    {
        size_t index = blockIdx.x * blockDim.x + threadIdx.x;
        o( index );
    }

    template<typename Operation>
    void foreachDevice( size_t size, Operation o )
    {
        size_t blocksize = 32;
        size_t gridsize = size/32;
        kernel_foreach<<<gridsize,blocksize>>>( o );
    }

    __global__ void printFirstElementOnDevice( double* vector )
    {
        printf( "dVector[0] = %f\n", vector[0] );
    }

    void assignScalarHost( size_t size, double* vector, double a )
    {
        auto assign = [=] ( size_t index ) { vector[index] = a; };
        foreachHost( size, assign );
    }

    void assignScalarDevice( size_t size, double* vector, double a )
    {
        auto assign = [=] __device__ ( size_t index ) { vector[index] = a; };
        foreachDevice( size, assign );
    }

    // compile error:
    template<bool onDevice>
    void assignScalar( size_t size, double* vector, double a )
    {
        auto assign = [=] __host__ __device__ ( size_t index ) { vector[index] = a; };

        if( onDevice )
        {
            foreachDevice( size, assign );
        }
        else
        {
            foreachHost( size, assign );
        }
    }

    // works:
    template<bool onDevice>
    void assignScalar2( size_t size, double* vector, double a )
    {
        if( onDevice )
        {
            auto assign = [=] __device__ ( size_t index ) { vector[index] = a; };
            foreachDevice( size, assign );
        }
        else
        {
            auto assign = [=] __host__ ( size_t index ) { vector[index] = a; };
            foreachHost( size, assign );
        }
    }

    int main()
    {
        size_t SIZE = 32;

        double* hVector = new double[SIZE];
        double* dVector;
        cudaMalloc( &dVector, SIZE*sizeof(double) );

        // clear memory
        for( size_t i = 0; i < SIZE; ++i )
        {
            hVector[i] = 0;
        }
        cudaMemcpy( dVector, hVector, SIZE*sizeof(double), cudaMemcpyHostToDevice );

        assignScalarHost( SIZE, hVector, 1.0 );
        cout << "hVector[0] = " << hVector[0] << endl;

        assignScalarDevice( SIZE, dVector, 2.0 );
        printFirstElementOnDevice<<<1,1>>>( dVector );
        cudaDeviceSynchronize();

        assignScalar2<false>( SIZE, hVector, 3.0 );
        cout << "hVector[0] = " << hVector[0] << endl;

        assignScalar2<true>( SIZE, dVector, 4.0 );
        printFirstElementOnDevice<<<1,1>>>( dVector );
        cudaDeviceSynchronize();

        // assignScalar<false>( SIZE, hVector, 5.0 );
        // cout << "hVector[0] = " << hVector[0] << endl;
        //
        // assignScalar<true>( SIZE, dVector, 6.0 );
        // printFirstElementOnDevice<<<1,1>>>( dVector );
        // cudaDeviceSynchronize();

        cudaError_t error = cudaGetLastError();
        if( error != cudaSuccess )
        {
            cout << "ERROR: " << cudaGetErrorString(error);
        }
    }

I used the CUDA 7.5 production release.

Update

I tried this third version of the assignScalar function:

    template<bool onDevice>
    void assignScalar3( size_t size, double* vector, double a )
    {
    #ifdef __CUDA_ARCH__
    #define LAMBDA_HOST_DEVICE __device__
    #else
    #define LAMBDA_HOST_DEVICE __host__
    #endif

        auto assign = [=] LAMBDA_HOST_DEVICE ( size_t index ) { vector[index] = a; };

        if( onDevice )
        {
            foreachDevice( size, assign );
        }
        else
        {
            foreachHost( size, assign );
        }
    }

It compiles and runs without errors, but the device version ( assignScalar3<true> ) fails. Actually, I expected __CUDA_ARCH__ to always be undefined here (since the function is not __device__ ), but apparently there is a compilation pass in which it is defined.
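For context, nvcc compiles each translation unit at least twice, once for the host (where __CUDA_ARCH__ is undefined) and once per target device architecture (where it is defined), so the macro presumably produces a lambda whose execution-space annotation differs between the two passes. __CUDA_ARCH__ is meant to be tested inside the body of a __host__ __device__ function, not to toggle declaration attributes. A minimal sketch of that intended use (the function name is mine):

    // __CUDA_ARCH__ branches inside the body of a __host__ __device__
    // function; each compilation pass keeps the branch that applies to it.
    __host__ __device__ inline const char* compilationSide()
    {
    #ifdef __CUDA_ARCH__
        return "device";
    #else
        return "host";
    #endif
    }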

1 answer

The task I tried to accomplish with the examples given in the question is not possible with CUDA 7.5, although it was not explicitly excluded from the allowed cases of the experimental lambda support.
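For completeness: it is the lambda syntax that is restricted; a hand-written functor with a __host__ __device__ call operator compiles under CUDA 7.5 and gives the same single-source dispatch, at the cost of spelling the closure out by hand. A sketch (the class name is mine, reusing foreachHost and foreachDevice from the question):

    // Hand-written replacement for the __host__ __device__ lambda;
    // named class types with __host__ __device__ call operators are
    // not subject to the extended-lambda restrictions.
    struct AssignScalar
    {
        double* vector;
        double  a;

        __host__ __device__ void operator()( size_t index ) const
        {
            vector[index] = a;
        }
    };

    template<bool onDevice>
    void assignScalarFunctor( size_t size, double* vector, double a )
    {
        AssignScalar assign{ vector, a };

        if( onDevice )
        {
            foreachDevice( size, assign );
        }
        else
        {
            foreachHost( size, assign );
        }
    }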

NVIDIA announced that CUDA Toolkit 8.0 will support __host__ __device__ lambdas as an experimental feature, according to the blog post CUDA 8 Features Revealed.

I verified that my example works with the CUDA 8 Release Candidate (Cuda compilation tools, release 8.0, V8.0.26).

Here is the code I finally used, compiled with nvcc -std=c++11 --expt-extended-lambda :

    #include <iostream>

    using namespace std;

    template<typename Operation>
    __global__ void kernel_foreach( Operation o )
    {
        size_t i = blockIdx.x * blockDim.x + threadIdx.x;
        o( i );
    }

    template<bool onDevice, typename Operation>
    void foreach( size_t size, Operation o )
    {
        if( onDevice )
        {
            size_t blocksize = 32;
            size_t gridsize = size/32;
            kernel_foreach<<<gridsize,blocksize>>>( o );
        }
        else
        {
            for( size_t i = 0; i < size; ++i )
            {
                o( i );
            }
        }
    }

    __global__ void printFirstElementOnDevice( double* vector )
    {
        printf( "dVector[0] = %f\n", vector[0] );
    }

    template<bool onDevice>
    void assignScalar( size_t size, double* vector, double a )
    {
        auto assign = [=] __host__ __device__ ( size_t i ) { vector[i] = a; };
        foreach<onDevice>( size, assign );
    }

    int main()
    {
        size_t SIZE = 32;

        double* hVector = new double[SIZE];
        double* dVector;
        cudaMalloc( &dVector, SIZE*sizeof(double) );

        // clear memory
        for( size_t i = 0; i < SIZE; ++i )
        {
            hVector[i] = 0;
        }
        cudaMemcpy( dVector, hVector, SIZE*sizeof(double), cudaMemcpyHostToDevice );

        assignScalar<false>( SIZE, hVector, 3.0 );
        cout << "hVector[0] = " << hVector[0] << endl;

        assignScalar<true>( SIZE, dVector, 4.0 );
        printFirstElementOnDevice<<<1,1>>>( dVector );
        cudaDeviceSynchronize();

        cudaError_t error = cudaGetLastError();
        if( error != cudaSuccess )
        {
            cout << "ERROR: " << cudaGetErrorString(error);
        }
    }
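One caveat of this listing: the grid size size/32 assumes size is a multiple of the block size. A hedged hardening (my addition, not part of the original answer) rounds the grid up and bounds-checks inside the kernel:

    // Variant of kernel_foreach that tolerates sizes which are not a
    // multiple of the block size: surplus threads simply do nothing.
    template<typename Operation>
    __global__ void kernel_foreach_checked( size_t size, Operation o )
    {
        size_t i = blockIdx.x * blockDim.x + threadIdx.x;
        if( i < size )
        {
            o( i );
        }
    }

    // ...and the matching launch, rounding the grid size up:
    //   size_t gridsize = ( size + blocksize - 1 ) / blocksize;
    //   kernel_foreach_checked<<<gridsize,blocksize>>>( size, o );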
