, ZeroCopy, cudaHostAllocMapped cudaHostAlloc.
cudaHostAlloc((void **)&pinnedHostPtr, THREADS * sizeof(double), cudaHostAllocMapped);
pinnedHostPointer . , :
double* dPtr;
cudaHostGetDevicePointer(&dPtr, pinnedHostPtr, 0);
.
testPinnedMemory<<< numBlocks, threadsPerBlock>>>(dPtr);
, . cudaDeviceSynchronize .
, , , 64- Compute Capability 2.0 TCC. , + , cudaHostAlloc .
:
#include <cstdio>
__global__ void testPinnedMemory(double * mem)
{
double currentValue = mem[threadIdx.x];
printf("Thread id: %d, memory content: %f\n", threadIdx.x, currentValue);
mem[threadIdx.x] = currentValue+10;
}
int main()
{
const size_t THREADS = 8;
double * pinnedHostPtr;
cudaHostAlloc((void **)&pinnedHostPtr, THREADS * sizeof(double), cudaHostAllocMapped);
for (size_t i = 0; i < THREADS; ++i)
pinnedHostPtr[i] = i;
double* dPtr;
cudaHostGetDevicePointer(&dPtr, pinnedHostPtr, 0);
dim3 threadsPerBlock(THREADS);
dim3 numBlocks(1);
testPinnedMemory<<< numBlocks, threadsPerBlock>>>(dPtr);
cudaDeviceSynchronize();
printf("Data after kernel execution: ");
for (int i = 0; i < THREADS; ++i)
printf("%f ", pinnedHostPtr[i]);
printf("\n");
return 0;
}