The CUDA C Best Practices Guide Version 5.0, Section 6.1.2, states that:
Unlike cudaMemcpy (), the version of asynchronous transfer requires fixed host memory (see โMemory with pinningโ), and it contains an additional argument, the thread identifier.
This means that the cudaMemcpyAsync function should fail if I use simple memory.
But that is not what happened.
Just for testing, I tried the following program:
Nucleus:
__global__ void kernel_increment(float* src, float* dst, int n) { int tid = blockIdx.x * blockDim.x + threadIdx.x; if(tid<n) dst[tid] = src[tid] + 1.0f; }
Main:
int main() { float *hPtr1, *hPtr2, *dPtr1, *dPtr2; const int n = 1000; size_t bytes = n * sizeof(float); cudaStream_t str1, str2; hPtr1 = new float[n]; hPtr2 = new float[n]; for(int i=0; i<n; i++) hPtr1[i] = static_cast<float>(i); cudaMalloc<float>(&dPtr1,bytes); cudaMalloc<float>(&dPtr2,bytes); dim3 block(16); dim3 grid((n + block.x - 1)/block.x); cudaStreamCreate(&str1); cudaStreamCreate(&str2); cudaMemcpyAsync(dPtr1,hPtr1,bytes,cudaMemcpyHostToDevice,str1); kernel_increment<<<grid,block,0,str2>>>(dPtr1,dPtr2,n); cudaMemcpyAsync(hPtr2,dPtr2,bytes,cudaMemcpyDeviceToHost,str1); printf("Status: %s\n",cudaGetErrorString(cudaGetLastError())); cudaDeviceSynchronize(); printf("Status: %s\n",cudaGetErrorString(cudaGetLastError())); cudaStreamDestroy(str1); cudaStreamDestroy(str2); cudaFree(dPtr1); cudaFree(dPtr2); for(int i=0; i<n; i++) std::cout<<hPtr2[i]<<std::endl; delete[] hPtr1; delete[] hPtr2; return 0; }
The program gave the correct result. The array increased successfully.
How did cudaMemcpyAsync run without page locking? Did I miss something?