CUDA 5, device capabilities 3.5, VS 2012, 64-bit Win 2012 Server.
There is no access to shared memory between threads; each thread is autonomous.
I use fixed memory with zero copy. From the host, I can only read the pinned memory that the device wrote only when I cudaDeviceSynchronize on the host.
I want to be able to:
- Reset to the fixed memory as soon as the device updates it.
- Do not block the device stream (possibly by asynchronous copying)
I tried calling __threadfence_system and __threadfence after each device, but this did not work.
Below is a complete sample CUDA code that demonstrates my question:
#include <conio.h> #include <cstdio> #include "cuda.h" #include "cuda_runtime.h" #include "device_launch_parameters.h" __global__ void Kernel(volatile float* hResult) { int tid = threadIdx.x + blockIdx.x * blockDim.x; printf("Kernel %u: Before Writing in Kernel\n", tid); hResult[tid] = tid + 1; __threadfence_system(); // expecting that the data is getting flushed to host here! printf("Kernel %u: After Writing in Kernel\n", tid); // time waster for-loop (sleep) for (int timeWater = 0; timeWater < 100000000; timeWater++); } void main() { size_t blocks = 2; volatile float* hResult; cudaHostAlloc((void**)&hResult,blocks*sizeof(float),cudaHostAllocMapped); Kernel<<<1,blocks>>>(hResult); int filledElementsCounter = 0; // naiive thread implementation that can be impelemted using // another host thread while (filledElementsCounter < blocks) { // blocks until the value changes, this moves sequentially // while threads have no order (fine for this sample). while(hResult[filledElementsCounter] == 0); printf("%f\n", hResult[filledElementsCounter]);; filledElementsCounter++; } cudaFreeHost((void *)hResult); system("pause"); }
Currently, this example will wait indefinitely since nothing is read from the device unless I cudaDeviceSynchronize . The sample below works, but it is NOT what I want, since it defeats the goal of asynchronous copy:
void main() { size_t blocks = 2; volatile float* hResult; cudaHostAlloc((void**)&hResult, blocks*sizeof(float), cudaHostAllocMapped); Kernel<<<1,blocks>>>(hResult); cudaError_t error = cudaDeviceSynchronize(); if (error != cudaSuccess) { throw; } for(int i = 0; i < blocks; i++) { printf("%f\n", hResult[i]); } cudaFreeHost((void *)hResult); system("pause"); }
Adam source share