CuFFT and streams

I am trying to run multiple CUDA FFT kernels asynchronously using streams. To do this, I create forward and inverse cuFFT plans, one pair per stream, as shown below:

// Allocate per-stream resources: one CUDA stream plus a forward (R2C) and an
// inverse (C2R) 1D cuFFT plan of length `ticks`. Each plan is bound to its
// stream with cufftSetStream so its kernels are issued into that stream.
streams = (cudaStream_t *) malloc(streamNum * sizeof(cudaStream_t));
plansF  = (cufftHandle *)  malloc(streamNum * sizeof(cufftHandle));
plansI  = (cufftHandle *)  malloc(streamNum * sizeof(cufftHandle));
for (int s = 0; s < streamNum; s++) {
    cudaStreamCreate(&streams[s]);                  CHECK_ERROR(5)
    cufftPlan1d(&plansF[s], ticks, CUFFT_R2C, 1);   CHECK_ERROR(5)
    cufftPlan1d(&plansI[s], ticks, CUFFT_C2R, 1);   CHECK_ERROR(5)
    cufftSetStream(plansF[s], streams[s]);          CHECK_ERROR(5)
    cufftSetStream(plansI[s], streams[s]);          CHECK_ERROR(5)
}

In the main function, I run the FFT as follows:

 for(w=1;w<q;w++) { cufftExecR2C(plansF[w], gpuMem1+k,gpuMem2+j); CHECK_ERROR(8) k += rect_small_real; j += rect_small_complex; } 

I also have other kernels of my own that I run asynchronously in the same streams.

When I inspect my application in Visual Profiler 5.0, I see that all the kernels except the cuFFT ones (both forward and inverse) run in parallel and overlap. The FFT kernels are issued in different streams, but they do not overlap — they actually run sequentially. Can someone tell me what my problem is?

My environment is VS 2008, 64 bit, Windows 7.

Thanks.

+7
source share
3 answers

This is a worked example of overlapping cuFFT calls and memory copies using streams in CUDA on the Kepler architecture.

Here is the code:

 #include <stdio.h> #include <cufft.h> #define NUM_STREAMS 3 /********************/ /* CUDA ERROR CHECK */ /********************/ #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); } inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true) { if (code != cudaSuccess) { fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); if (abort) exit(code); } } /********/ /* MAIN */ /********/ int main() { const int N = 5000; // --- Host input data initialization float2 *h_in1 = new float2[N]; float2 *h_in2 = new float2[N]; float2 *h_in3 = new float2[N]; for (int i = 0; i < N; i++) { h_in1[i].x = 1.f; h_in1[i].y = 0.f; h_in2[i].x = 1.f; h_in2[i].y = 0.f; h_in3[i].x = 1.f; h_in3[i].y = 0.f; } // --- Host output data initialization float2 *h_out1 = new float2[N]; float2 *h_out2 = new float2[N]; float2 *h_out3 = new float2[N]; for (int i = 0; i < N; i++) { h_out1[i].x = 0.f; h_out1[i].y = 0.f; h_out2[i].x = 0.f; h_out2[i].y = 0.f; h_out3[i].x = 0.f; h_out3[i].y = 0.f; } // --- Registers host memory as page-locked (required for asynch cudaMemcpyAsync) gpuErrchk(cudaHostRegister(h_in1, N*sizeof(float2), cudaHostRegisterPortable)); gpuErrchk(cudaHostRegister(h_in2, N*sizeof(float2), cudaHostRegisterPortable)); gpuErrchk(cudaHostRegister(h_in3, N*sizeof(float2), cudaHostRegisterPortable)); gpuErrchk(cudaHostRegister(h_out1, N*sizeof(float2), cudaHostRegisterPortable)); gpuErrchk(cudaHostRegister(h_out2, N*sizeof(float2), cudaHostRegisterPortable)); gpuErrchk(cudaHostRegister(h_out3, N*sizeof(float2), cudaHostRegisterPortable)); // --- Device input data allocation float2 *d_in1; gpuErrchk(cudaMalloc((void**)&d_in1, N*sizeof(float2))); float2 *d_in2; gpuErrchk(cudaMalloc((void**)&d_in2, N*sizeof(float2))); float2 *d_in3; gpuErrchk(cudaMalloc((void**)&d_in3, N*sizeof(float2))); float2 *d_out1; gpuErrchk(cudaMalloc((void**)&d_out1, N*sizeof(float2))); float2 *d_out2; gpuErrchk(cudaMalloc((void**)&d_out2, N*sizeof(float2))); float2 
*d_out3; gpuErrchk(cudaMalloc((void**)&d_out3, N*sizeof(float2))); // --- Creates CUDA streams cudaStream_t streams[NUM_STREAMS]; for (int i = 0; i < NUM_STREAMS; i++) gpuErrchk(cudaStreamCreate(&streams[i])); // --- Creates cuFFT plans and sets them in streams cufftHandle* plans = (cufftHandle*) malloc(sizeof(cufftHandle)*NUM_STREAMS); for (int i = 0; i < NUM_STREAMS; i++) { cufftPlan1d(&plans[i], N, CUFFT_C2C, 1); cufftSetStream(plans[i], streams[i]); } // --- Async memcopyes and computations gpuErrchk(cudaMemcpyAsync(d_in1, h_in1, N*sizeof(float2), cudaMemcpyHostToDevice, streams[0])); gpuErrchk(cudaMemcpyAsync(d_in2, h_in2, N*sizeof(float2), cudaMemcpyHostToDevice, streams[1])); gpuErrchk(cudaMemcpyAsync(d_in3, h_in3, N*sizeof(float2), cudaMemcpyHostToDevice, streams[2])); cufftExecC2C(plans[0], (cufftComplex*)d_in1, (cufftComplex*)d_out1, CUFFT_FORWARD); cufftExecC2C(plans[1], (cufftComplex*)d_in2, (cufftComplex*)d_out2, CUFFT_FORWARD); cufftExecC2C(plans[2], (cufftComplex*)d_in3, (cufftComplex*)d_out3, CUFFT_FORWARD); gpuErrchk(cudaMemcpyAsync(h_out1, d_out1, N*sizeof(float2), cudaMemcpyDeviceToHost, streams[0])); gpuErrchk(cudaMemcpyAsync(h_out2, d_out2, N*sizeof(float2), cudaMemcpyDeviceToHost, streams[1])); gpuErrchk(cudaMemcpyAsync(h_out3, d_out3, N*sizeof(float2), cudaMemcpyDeviceToHost, streams[2])); for(int i = 0; i < NUM_STREAMS; i++) gpuErrchk(cudaStreamSynchronize(streams[i])); // --- Releases resources gpuErrchk(cudaHostUnregister(h_in1)); gpuErrchk(cudaHostUnregister(h_in2)); gpuErrchk(cudaHostUnregister(h_in3)); gpuErrchk(cudaHostUnregister(h_out1)); gpuErrchk(cudaHostUnregister(h_out2)); gpuErrchk(cudaHostUnregister(h_out3)); gpuErrchk(cudaFree(d_in1)); gpuErrchk(cudaFree(d_in2)); gpuErrchk(cudaFree(d_in3)); gpuErrchk(cudaFree(d_out1)); gpuErrchk(cudaFree(d_out2)); gpuErrchk(cudaFree(d_out3)); for(int i = 0; i < NUM_STREAMS; i++) gpuErrchk(cudaStreamDestroy(streams[i])); delete[] h_in1; delete[] h_in2; delete[] h_in3; delete[] h_out1; delete[] 
h_out2; delete[] h_out3; cudaDeviceReset(); return 0; } 

Please add cuFFT error checking according to the cuFFT error-handling documentation.

Below is some profiling information from testing the above algorithm on a Kepler K20c card. As you can see, you only achieve true overlap between computation and memory transfers when N is sufficiently large.

N = 5000

enter image description here

N = 50000

enter image description here

N = 500000

enter image description here

+6
source

The problem is the equipment you are using.

All CUDA-capable GPUs can run a kernel and copy data in both directions concurrently. However, only devices with Compute Capability 3.5 have the Hyper-Q feature.

In short, those GPUs implement several (16, presumably) hardware work queues. Earlier GPUs have a single hardware queue.

This means that cudaStreams are only virtual on older hardware, and using them there only helps for overlapping computation with memory copies. Of course, this applies not only to cuFFT but also to your own kernels!

Please look carefully at the output of the visual profiler. You may unintentionally read the timeline as showing the exact GPU execution data. However, it is not that simple. There are several rows in which the displayed interval may refer to the point in time at which the kernel launch call was issued by the host (usually orange), while another row corresponds to the actual execution of the kernel on the GPU (blue rectangles). The same goes for memory transfers (the exact times are shown as light brown rectangles).

Hope I helped you solve your problem.

+2
source

Here's a riff on @JackOLantern's code that makes it easy to vary the number of FFTs, the FFT length, and the number of streams, to experiment with GPU utilization in nvvp.

 // Compile with: // nvcc --std=c++11 stream_parallel.cu -o stream_parallel -lcufft #include <iostream> #include <cuda.h> #include <cuda_runtime.h> #include <cufft.h> // Print file name, line number, and error code when a CUDA error occurs. #define check_cuda_errors(val) __check_cuda_errors__ ( (val), #val, __FILE__, __LINE__ ) template <typename T> inline void __check_cuda_errors__(T code, const char *func, const char *file, int line) { if (code) { std::cout << "CUDA error at " << file << ":" << line << std::endl << "error code: " << (unsigned int) code << " type: \"" << cudaGetErrorString(cudaGetLastError()) << "\"" << std::endl << "func: \"" << func << "\"" << std::endl; cudaDeviceReset(); exit(EXIT_FAILURE); } } int main(int argc, char *argv[]) { // Number of FFTs to compute. const int NUM_DATA = 64; // Length of each FFT. const int N = 1048576; // Number of GPU streams across which to distribute the FFTs. const int NUM_STREAMS = 4; // Allocate and initialize host input data. float2 **h_in = new float2 *[NUM_STREAMS]; for (int ii = 0; ii < NUM_STREAMS; ii++) { h_in[ii] = new float2[N]; for (int jj = 0; jj < N; ++jj) { h_in[ii][jj].x = (float) 1.f; h_in[ii][jj].y = (float) 0.f; } } // Allocate and initialize host output data. float2 **h_out = new float2 *[NUM_STREAMS]; for (int ii = 0; ii < NUM_STREAMS; ii++) { h_out[ii] = new float2[N]; for (int jj = 0; jj < N; ++jj) { h_out[ii][jj].x = 0.f; h_out[ii][jj].y = 0.f; } } // Pin host input and output memory for cudaMemcpyAsync. for (int ii = 0; ii < NUM_STREAMS; ii++) { check_cuda_errors(cudaHostRegister(h_in[ii], N*sizeof(float2), cudaHostRegisterPortable)); check_cuda_errors(cudaHostRegister(h_out[ii], N*sizeof(float2), cudaHostRegisterPortable)); } // Allocate pointers to device input and output arrays. float2 **d_in = new float2 *[NUM_STREAMS]; float2 **d_out = new float2 *[NUM_STREAMS]; // Allocate intput and output arrays on device. 
for (int ii = 0; ii < NUM_STREAMS; ii++) { check_cuda_errors(cudaMalloc((void**)&d_in[ii], N*sizeof(float2))); check_cuda_errors(cudaMalloc((void**)&d_out[ii], N*sizeof(float2))); } // Create CUDA streams. cudaStream_t streams[NUM_STREAMS]; for (int ii = 0; ii < NUM_STREAMS; ii++) { check_cuda_errors(cudaStreamCreate(&streams[ii])); } // Creates cuFFT plans and sets them in streams cufftHandle* plans = (cufftHandle*) malloc(sizeof(cufftHandle)*NUM_STREAMS); for (int ii = 0; ii < NUM_STREAMS; ii++) { cufftPlan1d(&plans[ii], N, CUFFT_C2C, 1); cufftSetStream(plans[ii], streams[ii]); } // Fill streams with async memcopies and FFTs. for (int ii = 0; ii < NUM_DATA; ii++) { int jj = ii % NUM_STREAMS; check_cuda_errors(cudaMemcpyAsync(d_in[jj], h_in[jj], N*sizeof(float2), cudaMemcpyHostToDevice, streams[jj])); cufftExecC2C(plans[jj], (cufftComplex*)d_in[jj], (cufftComplex*)d_out[jj], CUFFT_FORWARD); check_cuda_errors(cudaMemcpyAsync(h_out[jj], d_out[jj], N*sizeof(float2), cudaMemcpyDeviceToHost, streams[jj])); } // Wait for calculations to complete. for(int ii = 0; ii < NUM_STREAMS; ii++) { check_cuda_errors(cudaStreamSynchronize(streams[ii])); } // Free memory and streams. for (int ii = 0; ii < NUM_STREAMS; ii++) { check_cuda_errors(cudaHostUnregister(h_in[ii])); check_cuda_errors(cudaHostUnregister(h_out[ii])); check_cuda_errors(cudaFree(d_in[ii])); check_cuda_errors(cudaFree(d_out[ii])); delete[] h_in[ii]; delete[] h_out[ii]; check_cuda_errors(cudaStreamDestroy(streams[ii])); } delete plans; cudaDeviceReset(); return 0; } 
0
source

All Articles