With the introduction of the cuFFT callback function, the normalization required by the inverse FFT performed by cuFFT can be built directly into the cufftExecC2C call, defining the normalization operation as the __device__ function.
In addition to the cuFFT user guide, for cuFFT callback functions, see
CUDA Pro Tip: Use cuFFT Callbacks for Custom Data Processing
The following is an example of implementing IFFT normalization using the cuFFT callback.
#include <stdio.h> #include <assert.h> #include "cuda_runtime.h" #include "device_launch_parameters.h" #include <cufft.h> #include <cufftXt.h> /********************/ /* CUDA ERROR CHECK */ /********************/ #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); } inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true) { if (code != cudaSuccess) { fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); if (abort) exit(code); } } /*********************/ /* CUFFT ERROR CHECK */ /*********************/ // See http://stackoverflow.com/questions/16267149/cufft-error-handling #ifdef _CUFFT_H_ static const char *_cudaGetErrorEnum(cufftResult error) { switch (error) { case CUFFT_SUCCESS: return "CUFFT_SUCCESS"; case CUFFT_INVALID_PLAN: return "CUFFT_INVALID_PLAN"; case CUFFT_ALLOC_FAILED: return "CUFFT_ALLOC_FAILED"; case CUFFT_INVALID_TYPE: return "CUFFT_INVALID_TYPE"; case CUFFT_INVALID_VALUE: return "CUFFT_INVALID_VALUE"; case CUFFT_INTERNAL_ERROR: return "CUFFT_INTERNAL_ERROR"; case CUFFT_EXEC_FAILED: return "CUFFT_EXEC_FAILED"; case CUFFT_SETUP_FAILED: return "CUFFT_SETUP_FAILED"; case CUFFT_INVALID_SIZE: return "CUFFT_INVALID_SIZE"; case CUFFT_UNALIGNED_DATA: return "CUFFT_UNALIGNED_DATA"; } return "<unknown>"; } #endif #define cufftSafeCall(err) __cufftSafeCall(err, __FILE__, __LINE__) inline void __cufftSafeCall(cufftResult err, const char *file, const int line) { if( CUFFT_SUCCESS != err) { fprintf(stderr, "CUFFT error in file '%s', line %d\n %s\nerror %d: %s\nterminating!\n",__FILE__, __LINE__,err, \ _cudaGetErrorEnum(err)); \ cudaDeviceReset(); assert(0); \ } } __device__ void IFFT_Scaling(void *dataOut, size_t offset, cufftComplex element, void *callerInfo, void *sharedPtr) { float *scaling_factor = (float*)callerInfo; float2 output; output.x = cuCrealf(element); output.y = cuCimagf(element); output.x = output.x / scaling_factor[0]; output.y = output.y / scaling_factor[0]; ((float2*)dataOut)[offset] = output; } __device__ cufftCallbackStoreC d_storeCallbackPtr = IFFT_Scaling; /********/ /* MAIN */ /********/ int main() { const int N = 16; cufftHandle plan; float2 *h_input = (float2*)malloc(N*sizeof(float2)); float2 *h_output1 = (float2*)malloc(N*sizeof(float2)); float2 *h_output2 = (float2*)malloc(N*sizeof(float2)); float2 *d_input; gpuErrchk(cudaMalloc((void**)&d_input, N*sizeof(float2))); float2 *d_output1; gpuErrchk(cudaMalloc((void**)&d_output1, N*sizeof(float2))); float2 *d_output2; gpuErrchk(cudaMalloc((void**)&d_output2, N*sizeof(float2))); float *h_scaling_factor = (float*)malloc(sizeof(float)); h_scaling_factor[0] = 16.0f; float *d_scaling_factor; gpuErrchk(cudaMalloc((void**)&d_scaling_factor, sizeof(float))); gpuErrchk(cudaMemcpy(d_scaling_factor, h_scaling_factor, sizeof(float), cudaMemcpyHostToDevice)); for (int i=0; i<N; i++) { h_input[i].x = 1.0f; h_input[i].y = 0.f; } gpuErrchk(cudaMemcpy(d_input, h_input, N*sizeof(float2), cudaMemcpyHostToDevice)); cufftSafeCall(cufftPlan1d(&plan, N, CUFFT_C2C, 1)); cufftSafeCall(cufftExecC2C(plan, d_input, d_output1, CUFFT_FORWARD)); gpuErrchk(cudaMemcpy(h_output1, d_output1, N*sizeof(float2), cudaMemcpyDeviceToHost)); for (int i=0; i<N; i++) printf("Direct transform - %d - (%f, %f)\n", i, h_output1[i].x, h_output1[i].y); cufftCallbackStoreC h_storeCallbackPtr; gpuErrchk(cudaMemcpyFromSymbol(&h_storeCallbackPtr, d_storeCallbackPtr, sizeof(h_storeCallbackPtr))); cufftSafeCall(cufftXtSetCallback(plan, (void **)&h_storeCallbackPtr, CUFFT_CB_ST_COMPLEX, (void **)&d_scaling_factor)); cufftSafeCall(cufftExecC2C(plan, d_output1, d_output2, CUFFT_INVERSE)); gpuErrchk(cudaMemcpy(h_output2, d_output2, N*sizeof(float2), cudaMemcpyDeviceToHost)); for (int i=0; i<N; i++) printf("Inverse transform - %d - (%f, %f)\n", i, h_output2[i].x, h_output2[i].y); cufftSafeCall(cufftDestroy(plan)); gpuErrchk(cudaFree(d_input)); gpuErrchk(cudaFree(d_output1)); gpuErrchk(cudaFree(d_output2)); return 0; }
EDIT
The “moment” performed by the callback operation is specified in CUFFT_CB_ST_COMPLEX when calling cufftXtSetCallback . Please note that after this you can load and save callbacks with the same cuFFT plan.