CUDA: problems copying memory with pinned memory

I tried the code from this link: Do I need a copy of CUDA memory? The asker claims the program worked fine for him, but it does not work that way for me: my values do not change when I modify them in the kernel.

My main problem is that my GPU does not have enough memory, but I want to run computations that need more memory than it has. I want my program to keep the data in RAM (host memory) and still use CUDA for the computation. The program in the link seemed to solve my problem, but the code does not give the result the poster showed.

Any help, or any working example of zero-copy memory, would be appreciated.

Thanks.

__global__ void testPinnedMemory(double * mem)
{
    double currentValue = mem[threadIdx.x];
    printf("Thread id: %d, memory content: %f\n", threadIdx.x, currentValue);
    mem[threadIdx.x] = currentValue+10;
}

void test()
{
    const size_t THREADS = 8;
    double * pinnedHostPtr;
    cudaHostAlloc((void **)&pinnedHostPtr, THREADS, cudaHostAllocDefault);

    //set memory values
    for (size_t i = 0; i < THREADS; ++i)
        pinnedHostPtr[i] = i;

    //call kernel
    dim3 threadsPerBlock(THREADS);
    dim3 numBlocks(1);
    testPinnedMemory<<< numBlocks, threadsPerBlock>>>(pinnedHostPtr);

    //read output
    printf("Data after kernel execution: ");
    for (int i = 0; i < THREADS; ++i)
        printf("%f ", pinnedHostPtr[i]);
    printf("\n");
}

To use zero-copy memory, you have to allocate it with the cudaHostAllocMapped flag of cudaHostAlloc. Note also that the size argument must be THREADS * sizeof(double), not just THREADS:

cudaHostAlloc((void **)&pinnedHostPtr, THREADS * sizeof(double), cudaHostAllocMapped);
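Depending on the platform and CUDA version, mapped (zero-copy) allocations may also require that the device can map host memory and that mapping is enabled before the CUDA context is created. A minimal sketch of that check, assuming device 0 (canMapHostMemory and cudaDeviceMapHost are the standard runtime-API names; whether the explicit flag call is still required depends on your setup):

cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, 0);      // query device 0
if (!prop.canMapHostMemory)
    printf("Device cannot map host memory; zero-copy will not work\n");
cudaSetDeviceFlags(cudaDeviceMapHost);  // must run before the first allocation or kernel launch creates the context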

pinnedHostPtr is then only usable for accessing the mapped memory from the host side. To access the same memory from the device, you have to obtain the corresponding device-side pointer:

double* dPtr;
cudaHostGetDevicePointer(&dPtr, pinnedHostPtr, 0);

Pass this device pointer to the kernel:

testPinnedMemory<<< numBlocks, threadsPerBlock>>>(dPtr);

Also, keep in mind that kernel launches are asynchronous with respect to the host, so you have to call cudaDeviceSynchronize after the launch before reading the results on the host.
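If the values still come back unchanged, checking the error codes after the launch and the synchronization usually shows why, since failures are otherwise silent. A small sketch using the standard runtime error API:

testPinnedMemory<<<numBlocks, threadsPerBlock>>>(dPtr);
cudaError_t err = cudaGetLastError();   // errors from the launch itself (bad configuration, ...)
if (err == cudaSuccess)
    err = cudaDeviceSynchronize();      // errors raised while the kernel ran; also makes the results visible to the host
if (err != cudaSuccess)
    printf("CUDA error: %s\n", cudaGetErrorString(err));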

Finally, note that on a 64-bit OS with a device of Compute Capability 2.0 or higher (and, on Windows, the TCC driver model), Unified Virtual Addressing is in effect. In that case the host and device pointers are identical, so the pointer returned by cudaHostAlloc can be passed to the kernel directly and cudaHostGetDevicePointer is not needed.
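If you want to check whether that applies to your machine, the runtime reports it as a device property; a short sketch, again assuming device 0:

cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, 0);
if (prop.unifiedAddressing) {
    // UVA in effect: the host pointer is also valid on the device
    testPinnedMemory<<<numBlocks, threadsPerBlock>>>(pinnedHostPtr);
} else {
    testPinnedMemory<<<numBlocks, threadsPerBlock>>>(dPtr);
}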

Here is the complete, fixed example:

#include <cstdio>

__global__ void testPinnedMemory(double * mem)
{
    double currentValue = mem[threadIdx.x];
    printf("Thread id: %d, memory content: %f\n", threadIdx.x, currentValue);
    mem[threadIdx.x] = currentValue+10;
}

int main() 
{
    const size_t THREADS = 8;
    double * pinnedHostPtr;
    cudaHostAlloc((void **)&pinnedHostPtr, THREADS * sizeof(double), cudaHostAllocMapped);

    //set memory values
    for (size_t i = 0; i < THREADS; ++i)
        pinnedHostPtr[i] = i;

    double* dPtr;
    cudaHostGetDevicePointer(&dPtr, pinnedHostPtr, 0);

    //call kernel
    dim3 threadsPerBlock(THREADS);
    dim3 numBlocks(1);
    testPinnedMemory<<< numBlocks, threadsPerBlock>>>(dPtr);
    cudaDeviceSynchronize();

    //read output
    printf("Data after kernel execution: ");
    for (int i = 0; i < THREADS; ++i)
        printf("%f ", pinnedHostPtr[i]);    
    printf("\n");

    return 0;
}
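For reference, a build along these lines should work (the file name is just an example; device-side printf needs Compute Capability 2.0 or newer, hence the architecture flag):

nvcc -arch=sm_20 zero_copy.cu -o zero_copy

Since the kernel adds 10 to every element, the final line should print 10.000000 through 17.000000, and the per-thread printf lines should show the original values 0 through 7.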
