OpenCL: clAmdFft (OpenCL FFT lib from AMD) on NVIDIA GPUs

Has anyone had experience running the OpenCL FFT library from AMD (http://developer.amd.com/libraries/appmathlibs/pages/default.aspx) on NVIDIA GPUs?

I am trying to migrate an existing algorithm from CUDA (with the latest CUFFT) to OpenCL. The new code works fine with the AMD GPU, but not with my NVIDIA GPU. The NVIDIA GPU is recognized properly, but the resulting array is all zeros, and no errors are reported. By the way, the code also works on the Intel Core i3 processor, so my code itself appears to be correct.

AMD, as well as NVIDIA, seem to be giving up support for this topic.

Any ideas?

EDIT:

My environment is Windows 7 Professional x64, and I use the Visual Studio C++ Professional IDE with its built-in x86 compiler. The NVIDIA GPU is the GeForce GTX 560 Ti (MSI N560GTX-Ti Twin Frozr II / OC 1GB). The devices on which the code works correctly are the Intel Core i3-2100 processor (2x3.1 GHz) and the Radeon HD 6850 (Sapphire Radeon HD 6850 1GB). I tried compiling the code against the latest OpenCL SDK versions from AMD, NVIDIA and Intel, and updated to the latest developer drivers, always with the same results.

Here is my pretty simple code example ...

#include <stdio.h> #include <stdlib.h> #include <complex> #include <clAmdFft.h> #if defined (__APPLE__) || defined(MACOSX) #include <OpenCL/opencl.h> #else #include <CL/opencl.h> #endif // Typedef for complex field objects using namespace std; typedef std::complex<float> cl_compl_flt; int main(int argc, char* argv[]) { cl_uint width = 1024, height = 1024; // Field dimensions cl_uint cl_platformsN = 0; // Platform count cl_platform_id *cl_platformIDs = NULL; // IDs of OpenCL platforms cl_uint cl_deviceCount = 0; // Device count cl_device_id *cl_devices = NULL; // Device IDs cl_int cl_err = 0; // Buffer for error informations cl_context cl_dev_context; // Context cl_command_queue cl_queue; // Queue clAmdFftSetupData fftSetupData; // FFT setup data clAmdFftPlanHandle fftPlan; // FFT plan clAmdFftDim fftDim = CLFFT_2D; // FFT dimension size_t fftSize[2]; // FFT size fftSize[0] = width; fftSize[1] = height; cl_mem d_data; // Device level data cl_compl_flt* h_src; // Host level input data cl_compl_flt* h_res; // Host level output data // Allocate host memory h_src = (cl_compl_flt*)malloc(width*height*sizeof(cl_compl_flt)); h_res = (cl_compl_flt*)malloc(width*height*sizeof(cl_compl_flt)); // Get source field createPinholeField( h_src, width, height, 5 ); // Get FFT version checkCL( clAmdFftInitSetupData(&fftSetupData) ); printf("Using clAmdFft %u.%u.%u\n",fftSetupData.major,fftSetupData.minor,fftSetupData.patch); // Get available platforms checkCL( clGetPlatformIDs ( 0, NULL, &cl_platformsN)); cl_platformIDs = (cl_platform_id*) malloc( cl_platformsN * sizeof(cl_platform_id)); checkCL( clGetPlatformIDs( cl_platformsN, cl_platformIDs, NULL) ); // Loop over platforms for( cl_uint i = 0; i < cl_platformsN; i++) { // Get number of available devices for this platform checkCL( clGetDeviceIDs( cl_platformIDs[i], CL_DEVICE_TYPE_ALL, NULL, NULL, &cl_deviceCount)); // Skip platform if no device available if(cl_deviceCount < 1) continue; // Get available device IDs for this platform 
cl_devices = (cl_device_id*) malloc( cl_deviceCount * sizeof(cl_device_id)); checkCL( clGetDeviceIDs( cl_platformIDs[i], CL_DEVICE_TYPE_ALL, cl_deviceCount, cl_devices, NULL)); // Print platform name char platform_name[1024]; checkCL( clGetPlatformInfo( cl_platformIDs[i], CL_PLATFORM_NAME, 1024, &platform_name, NULL) ); printf("\nCompute using OpenCl platfrom #%i [ %s ]\n", i,platform_name); // Loop over devices for( cl_uint j = 0; j < cl_deviceCount; j++) { // Print device name and type cl_device_type device_type; char device_name[1024]; checkCL( clGetDeviceInfo( cl_devices[j], CL_DEVICE_NAME, 1024, &device_name, NULL) ); checkCL( clGetDeviceInfo( cl_devices[j],CL_DEVICE_TYPE, sizeof(cl_device_type), &device_type, NULL) ); printf("\n\tUsing OpenCl device #%i [ %s -- %s ]\n", j, device_name, getDevTypeString(device_type)); // Create OpenCL context cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)cl_platformIDs[i], 0 }; cl_dev_context = clCreateContext( cps, cl_deviceCount, cl_devices, NULL, NULL, &cl_err); checkCL( cl_err); // Create command queue cl_queue = clCreateCommandQueue( cl_dev_context, cl_devices[j], CL_QUEUE_PROFILING_ENABLE, &cl_err); checkCL( cl_err); // Create device buffer d_data = clCreateBuffer( cl_dev_context, CL_MEM_READ_WRITE, width*height*sizeof(cl_compl_flt), NULL, &cl_err); checkCL( cl_err); // Setup FFT checkCL( clAmdFftSetup(&fftSetupData) ); // Create FFT plan checkCL( clAmdFftCreateDefaultPlan( &fftPlan, cl_dev_context, fftDim, fftSize) ); // Copy data from host to device clEnqueueWriteBuffer( cl_queue, d_data, CL_TRUE, 0, width*height*sizeof(cl_compl_flt), h_src, 0, NULL, NULL); // Execute FFT checkCL( clAmdFftEnqueueTransform( fftPlan, CLFFT_FORWARD, 1, &cl_queue, 0, NULL, NULL, &d_data, NULL, NULL) ); clFinish( cl_queue); // Copy result from device to host checkCL( clEnqueueReadBuffer(cl_queue, d_data, CL_TRUE, 0, width*height*sizeof(cl_compl_flt), h_res, 0, NULL, NULL) ); clFinish( cl_queue); // Save result 
char filename[512]; sprintf( filename, "raw/result_%u_%u_in.raw",i,j); printf("\tSave result to \"%s\" ", filename); saveRawData( h_res, filename, width, height, true); printf("\n"); // Free FFT plan checkCL( clAmdFftDestroyPlan( &fftPlan) ); // Free FFT checkCL( clAmdFftTeardown() ); // Free device memory checkCL( clReleaseMemObject(d_data) ); // Release OpenCL context and queue checkCL( clReleaseCommandQueue( cl_queue ) ); checkCL( clReleaseContext( cl_dev_context) ); } // Free OpenCL devices free( cl_devices); } free( h_src); free( h_res); printf("\n\nPress any key ..."); getchar(); return 0; } 

and additional features used ...

 // Generate a pinhole void createPinholeField( cl_compl_flt* data, cl_uint width, cl_uint height, cl_uint radius) { if(data==NULL) data = (cl_compl_flt*)malloc(width*height*sizeof(cl_compl_flt)); if(radius < 1) radius = (width>height)?height/2:width/2; cl_float min_val = 0.0f; cl_float max_val = 255.0f; for(cl_uint y = 0; y < height; y++) for(cl_uint x = 0; x < width; x++) { if ( ceil( sqrt( pow(x-width/2., 2.) + pow(y-height/2., 2.) )) <= radius ) { data[x+y*width].real(max_val); data[x+y*width].imag(0.f); } else { data[x+y*width].real(min_val); data[x+y*width].imag(0.f); } } } // Save a cl_compl_flt array as an unsigned char raw image file void saveRawData( cl_compl_flt* char_array, const char* filepath, cl_uint width, cl_uint height, bool print_minmax ) { cl_float* abs_v = (cl_float*) malloc(width*height*sizeof(cl_float)); for( cl_uint i = 0; i < width*height; i++) abs_v[i] = abs(char_array[i]); cl_float min = abs_v[0]; cl_float max = abs_v[0]; for( cl_uint i = 1; i < width*height; i++) { if( abs_v[i] < min) min = abs_v[i]; if( abs_v[i] > max) max = abs_v[i]; } if( print_minmax) printf(" [min=%f , max=%f] ",min,max); max *= .01f; cl_uchar* temp = (cl_uchar*) malloc(width*height*sizeof(cl_uchar)); for( cl_uint i = 0; i < width*height; i++) temp[i] = 255*(cl_uchar)(( (cl_float)abs_v[i] - min) / ( max-min )); FILE *pFile = NULL; pFile=fopen(filepath,"wb"); fwrite(temp,1,width*height,pFile); fclose(pFile); free(abs_v); free(temp); } // Check functions that return OpenCL error IDs. 
bool checkCL( cl_int oclErrorCode) { if( oclErrorCode == CL_SUCCESS) return true; else { printf("\n\nAn OpenCL related error occured!\nError ID #%d\nPress ENTER to exit the program...\n\n", oclErrorCode); getchar(); exit( oclErrorCode); return false; } } // Get device type as string char* getDevTypeString(cl_device_type type) { switch(type) { case CL_DEVICE_TYPE_CPU: return "CPU"; break; case CL_DEVICE_TYPE_GPU: return "GPU"; break; case CL_DEVICE_TYPE_ACCELERATOR: return "ACCELERATOR"; break; default: return "DEFAULT"; break; } } 

Hope this helps narrow down the problem.

PS: Images can be seen here: http://devgurus.amd.com/thread/159149

+4
source share
1 answer

In the interest of moving this question from the unanswered list ...

According to the question "OpenCL FFT on Nvidia and AMD devices?", the AMD OpenCL FFT library should run on NVIDIA hardware. However, NVIDIA does not officially support this library, and I doubt that AMD supports this combination either, so I am not surprised that you are not getting correct results.

As mentioned in a comment, ArrayFire is probably a good cross-platform solution. There is a free version for uniprocessor, non-commercial use, so you can evaluate it.

+3
source

Source: https://habr.com/ru/post/1411074/


All Articles