EDIT: The example used to be available here, but it can no longer be found; most of it is copied below.
Caller (C, but could also be C++)
#include <stdio.h> #include <stdlib.h> #include <string.h> #include <cuda.h> extern void kernel_wrapper(int *a, int *b); int main(int argc, char *argv[]) { int a = 2; int b = 3; kernel_wrapper(&a, &b); return 0; }
Callee (CUDA)
/*
 * Device kernel: the thread with threadIdx.x == 0 adds 10 to *a, the thread
 * with threadIdx.x == 1 adds 3 to *b, and every other thread does nothing.
 *
 * Launch layout: intended for a single block with at least two threads in x.
 * Only threadIdx.x is consulted (no blockIdx), so launching several blocks
 * would make each block repeat the same unsynchronized updates.
 *
 * Both pointers must be dereferenceable from device code.
 */
__global__ void kernel(int *a, int *b)
{
    int tx = threadIdx.x;

    switch (tx) {
    case 0:
        *a = *a + 10;
        break;
    case 1:
        *b = *b + 3;
        break;
    default:
        break;
    }
}

/*
 * Host wrapper around kernel().
 *
 * Copies *a and *b into two freshly allocated device integers, runs the
 * kernel on them with one block of two threads, and copies the results back.
 *
 * a, b: host pointers to the input values; on success *a has been increased
 *       by 10 and *b by 3.
 *
 * If any CUDA call or the launch itself fails, *a and *b are left untouched.
 * The function returns void, so no error is reported to the caller. The
 * device buffers are released on every path.
 */
void kernel_wrapper(int *a, int *b)
{
    int *d_1 = 0;
    int *d_2 = 0;
    int result_a = 0;
    int result_b = 0;
    dim3 threads( 2, 1 );
    dim3 blocks( 1, 1 );

    cudaError_t status = cudaMalloc( (void **)&d_1, sizeof(int) );
    if (status == cudaSuccess)
        status = cudaMalloc( (void **)&d_2, sizeof(int) );

    if (status == cudaSuccess)
        status = cudaMemcpy( d_1, a, sizeof(int), cudaMemcpyHostToDevice );
    if (status == cudaSuccess)
        status = cudaMemcpy( d_2, b, sizeof(int), cudaMemcpyHostToDevice );

    if (status == cudaSuccess) {
        /* The kernel must receive the DEVICE buffers, not the host pointers. */
        kernel<<< blocks, threads >>>( d_1, d_2 );
        /* A launch returns nothing; configuration errors surface here. */
        status = cudaGetLastError();
    }

    /*
     * Copy into host temporaries first so that *a and *b are updated either
     * both or not at all. These blocking copies on the default stream are
     * also where a fault raised while the kernel was running gets reported.
     */
    if (status == cudaSuccess)
        status = cudaMemcpy( &result_a, d_1, sizeof(int), cudaMemcpyDeviceToHost );
    if (status == cudaSuccess)
        status = cudaMemcpy( &result_b, d_2, sizeof(int), cudaMemcpyDeviceToHost );

    if (status == cudaSuccess) {
        *a = result_a;
        *b = result_b;
    }

    /* cudaFree on a null pointer is a no-op, so this is safe on every path. */
    cudaFree(d_1);
    cudaFree(d_2);
}
Preet sangha
source share