CUDA pinned memory flushing from the device

CUDA 5.0, compute capability 3.5, VS 2012, 64-bit Windows Server 2012.

There is no shared-memory communication between threads; each thread is standalone.

I use pinned (zero-copy) memory. From the host, I can only read the values the device wrote into the pinned memory after I call cudaDeviceSynchronize on the host.

I want to be able to:

  • Poll the pinned memory as soon as the device updates it.
  • Not block the device stream (possibly by copying asynchronously).

I tried calling __threadfence_system and __threadfence after each device write, but this did not flush the data.

Below is a complete CUDA sample that demonstrates my question:

    #include <conio.h>
    #include <cstdio>

    #include "cuda.h"
    #include "cuda_runtime.h"
    #include "device_launch_parameters.h"

    __global__ void Kernel(volatile float* hResult)
    {
        int tid = threadIdx.x + blockIdx.x * blockDim.x;
        printf("Kernel %u: Before Writing in Kernel\n", tid);
        hResult[tid] = tid + 1;
        __threadfence_system(); // expecting that the data gets flushed to the host here!
        printf("Kernel %u: After Writing in Kernel\n", tid);
        // time-waster for-loop (sleep)
        for (int timeWaster = 0; timeWaster < 100000000; timeWaster++);
    }

    int main()
    {
        size_t blocks = 2;
        volatile float* hResult;
        cudaHostAlloc((void**)&hResult, blocks * sizeof(float), cudaHostAllocMapped);
        Kernel<<<1, blocks>>>(hResult);

        int filledElementsCounter = 0;
        // naive polling implementation that could be implemented using
        // another host thread
        while (filledElementsCounter < blocks)
        {
            // blocks until the value changes; this moves sequentially
            // while threads have no order (fine for this sample).
            while (hResult[filledElementsCounter] == 0);
            printf("%f\n", hResult[filledElementsCounter]);
            filledElementsCounter++;
        }
        cudaFreeHost((void*)hResult);
        system("pause");
        return 0;
    }

Currently, this example waits indefinitely, since nothing is read back from the device unless I call cudaDeviceSynchronize. The sample below works, but it is NOT what I want, since it defeats the purpose of asynchronous copying:

    int main()
    {
        size_t blocks = 2;
        volatile float* hResult;
        cudaHostAlloc((void**)&hResult, blocks * sizeof(float), cudaHostAllocMapped);
        Kernel<<<1, blocks>>>(hResult);

        cudaError_t error = cudaDeviceSynchronize();
        if (error != cudaSuccess) { throw; }

        for (int i = 0; i < blocks; i++)
        {
            printf("%f\n", hResult[i]);
        }
        cudaFreeHost((void*)hResult);
        system("pause");
        return 0;
    }
3 answers

You cannot pass a host pointer directly to the kernel. If you allocate host memory with cudaHostAlloc using the cudaHostAllocMapped flag, you must first obtain a device pointer for the mapped host memory before you can use it in the kernel. Use cudaHostGetDevicePointer to get a device pointer to the mapped host memory.

    float* hResult, *dResult;
    cudaHostAlloc((void**)&hResult, blocks * sizeof(float), cudaHostAllocMapped);
    cudaHostGetDevicePointer(&dResult, hResult, 0);
    Kernel<<<1, blocks>>>(dResult);
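
For context, a minimal end-to-end sketch of the setup this answer describes (the identifier names are illustrative and the cudaSetDeviceFlags call is my addition; it enables host-memory mapping on systems where it is not on by default, and the third argument of cudaHostGetDevicePointer is a reserved flags parameter that must currently be 0):

    #include <cstdio>
    #include "cuda_runtime.h"

    __global__ void Kernel(volatile float* dResult)
    {
        int tid = threadIdx.x + blockIdx.x * blockDim.x;
        dResult[tid] = tid + 1;
        __threadfence_system(); // push the write out to host-visible memory
    }

    int main()
    {
        const size_t blocks = 2;
        float* hResult;
        float* dResult;
        // Must be called before the CUDA context is created.
        cudaSetDeviceFlags(cudaDeviceMapHost);
        cudaHostAlloc((void**)&hResult, blocks * sizeof(float), cudaHostAllocMapped);
        // Translate the mapped host pointer into a device pointer.
        cudaHostGetDevicePointer(&dResult, hResult, 0);
        Kernel<<<1, blocks>>>(dResult);
        cudaDeviceSynchronize();
        for (size_t i = 0; i < blocks; i++) printf("%f\n", hResult[i]);
        cudaFreeHost(hResult);
        return 0;
    }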

Calling __threadfence_system() ensures that the write is visible to the system before continuing, but your CPU will cache the hResult values, so you are simply spinning on the old value in an infinite loop. Try marking hResult as volatile .
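
As a sketch of the difference (not from the original code; the function name is illustrative):

    #include <cstdio>

    // Polling a mapped buffer written by the device. With a plain float*,
    // the compiler may load hResult[i] once, keep it in a register, and
    // spin forever on the stale value. The volatile qualifier forces a
    // fresh load from memory on every iteration.
    void pollUntilFilled(volatile float* hResult, int n)
    {
        for (int i = 0; i < n; i++)
        {
            while (hResult[i] == 0)
                ; // busy-wait; each check re-reads host memory
            printf("%f\n", hResult[i]);
        }
    }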


I played with your code on CentOS 6.2 with CUDA 5.5 and a Tesla M2090, and I can conclude this:

The fact that it does not work on your system is probably a driver problem, and I suggest you try the TCC driver.
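
For reference, on a Tesla-class card under Windows the driver model can typically be switched with nvidia-smi from an elevated prompt (exact option syntax may vary between driver versions; 1 selects TCC, 0 selects WDDM, and a reboot is required):

    nvidia-smi -g 0 -dm 1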

I have attached my code below; it works as expected and does what you want. Values are printed on the host side while the kernel is still running. As you can see, I added some computation code to keep the for loop from being removed by compiler optimization. I also added a stream and a callback that is executed after all work in the stream has finished. The program prints 1 2 and then does nothing for a long while, until stream finished... is printed on the console.

    #include <iostream>

    #include "cuda.h"
    #include "cuda_runtime.h"
    #include "device_launch_parameters.h"

    #define SEC_CUDA_CALL(val) checkCall((val), #val, __FILE__, __LINE__)

    bool checkCall(cudaError_t result, char const* const func, const char* const file, int const line)
    {
        if (result != cudaSuccess)
        {
            std::cout << "CUDA (runtime api) error: " << func << " failed! "
                      << cudaGetErrorString(result) << " (" << result << ") "
                      << file << ":" << line << std::endl;
        }
        return result != cudaSuccess;
    }

    class Callback
    {
    public:
        static void CUDART_CB dispatch(cudaStream_t stream, cudaError_t status, void* userData);
    private:
        void call();
    };

    void CUDART_CB Callback::dispatch(cudaStream_t stream, cudaError_t status, void* userData)
    {
        Callback* cb = (Callback*) userData;
        cb->call();
    }

    void Callback::call()
    {
        std::cout << "stream finished..." << std::endl;
    }

    __global__ void Kernel(volatile float* hResult)
    {
        int tid = threadIdx.x + blockIdx.x * blockDim.x;
        hResult[tid] = tid + 1;
        __threadfence_system();
        // busy work that the compiler cannot optimize away, because it
        // depends on volatile reads of the mapped buffer
        float A = 0;
        for (int timeWaster = 0; timeWaster < 100000000; timeWaster++)
        {
            A = sin(cos(log(hResult[0] * hResult[1]))) + A;
            A = sqrt(A);
        }
    }

    int main(int argc, char* argv[])
    {
        size_t blocks = 2;
        volatile float* hResult;
        SEC_CUDA_CALL(cudaHostAlloc((void**)&hResult, blocks * sizeof(float), cudaHostAllocMapped));

        cudaStream_t stream;
        SEC_CUDA_CALL(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));

        Callback obj;
        Kernel<<<1, blocks, 0, stream>>>(hResult);
        SEC_CUDA_CALL(cudaStreamAddCallback(stream, Callback::dispatch, &obj, 0));

        int filledElementsCounter = 0;
        while (filledElementsCounter < blocks)
        {
            while (hResult[filledElementsCounter] == 0);
            std::cout << hResult[filledElementsCounter] << std::endl;
            filledElementsCounter++;
        }

        SEC_CUDA_CALL(cudaStreamDestroy(stream));
        SEC_CUDA_CALL(cudaFreeHost((void*)hResult));
    }

No call returned an error, and cuda-memcheck did not detect any problems. The code works as intended. You really should try the TCC driver.
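
For completeness, a plausible build line for this sample (assuming the file is saved as sample.cu; sm_20 matches the M2090 used here, while sm_35 would match the compute-capability 3.5 device from the question):

    nvcc -arch=sm_20 -o sample sample.cu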

