Can CUDA calculate the value of such a function fast enough?
Many cost functionals can be expressed as a sum of a number of terms. Examples are:

Sphere function
Rosenbrock function
Styblinski-Tang function
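In their standard forms (textbook definitions, restated here for reference), these read:

f_{Sphere}(x) = \sum_{i=1}^{N} x_i^2
f_{Rosenbrock}(x) = \sum_{i=1}^{N-1} \left[ 100 (x_{i+1} - x_i^2)^2 + (x_i - 1)^2 \right]
f_{Styblinski-Tang}(x) = \frac{1}{2} \sum_{i=1}^{N} ( x_i^4 - 16 x_i^2 + 5 x_i )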
In all these cases, the evaluation of the cost functional can be carried out by a reduction or, better, by a transformation followed by a reduction.
CUDA Thrust provides thrust::transform_reduce, which can certainly serve the purpose, but you can also set up your own customized transformation + reduction routines.
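As a warm-up illustration of the Thrust route (this snippet is not part of the code below; the names square and sphereFunctional are mine), the Sphere function maps directly onto thrust::transform_reduce:

// --- Minimal sketch: evaluating the Sphere function f(x) = sum_i x_i^2
//     with thrust::transform_reduce (transformation fused with the summation).
#include <thrust/device_vector.h>
#include <thrust/transform_reduce.h>
#include <thrust/functional.h>

struct square {
    __host__ __device__ float operator()(const float x) const { return x * x; }
};

float sphereFunctional(const thrust::device_vector<float> &d_x) {
    // --- Applies x -> x^2 to each element and sums the results in a single pass
    return thrust::transform_reduce(d_x.begin(), d_x.end(), square(), 0.f, thrust::plus<float>());
}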
Below I will discuss how to compute the Rosenbrock functional using either CUDA Thrust or a customized version of the reduction routine offered by the CUDA samples. In the latter case, a pointer to a user-defined __device__ transformation function is passed to the customized transform_reduce function if the EXTERNAL keyword is defined; otherwise, the transformation function is defined and compiled within the compilation unit of the customized transform_reduce.
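The EXTERNAL path relies on copying a __device__ function pointer to the host and passing it to the kernel as an ordinary argument. A self-contained sketch of just that mechanism (the names funcPtr_t, squarePlusOne and applyKernel are illustrative only):

#include <cstdio>

typedef float (*funcPtr_t)(float);

__device__ float squarePlusOne(float x) { return x * x + 1.f; }

// --- Device-side symbol holding a pointer to the __device__ function
__device__ funcPtr_t dev_funcPtr = squarePlusOne;

// --- The kernel receives the function pointer as a plain argument and calls it
__global__ void applyKernel(float *d_out, float x, funcPtr_t f) { *d_out = (*f)(x); }

int main() {
    funcPtr_t h_funcPtr;
    // --- Retrieve the device-side function pointer on the host
    cudaMemcpyFromSymbol(&h_funcPtr, dev_funcPtr, sizeof(funcPtr_t));
    float *d_out; cudaMalloc(&d_out, sizeof(float));
    applyKernel<<<1, 1>>>(d_out, 3.f, h_funcPtr);
    float h_out; cudaMemcpy(&h_out, d_out, sizeof(float), cudaMemcpyDeviceToHost);
    printf("%f\n", h_out);   // expected: 10.0
    return 0;
}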
Some timing results on a Kepler K20c card for the non-EXTERNAL case:
N =   90000    Thrust = 0.055 ms    Customized = 0.059 ms
N =  900000    Thrust = 0.67 ms     Customized = 0.14 ms
N = 9000000    Thrust = 0.85 ms     Customized = 0.87 ms
Here is the code. For the timing routines, see the OrangeOwlSolutions/Timing and OrangeOwlSolutions/CUDA_Utilities GitHub projects.
Please note that separate compilation (relocatable device code, e.g. compiling with nvcc -rdc=true) is required.
kernel.cu
// --- Requires separate compilation
#include <stdio.h>
#include <thrust/device_vector.h>

#include "transform_reduce.cuh"
#include "Utilities.cuh"
#include "TimingCPU.h"
#include "TimingGPU.cuh"

/***************************************/
/* COST FUNCTION - GPU/CUSTOMIZED CASE */
/***************************************/
// --- Transformation function
__device__ __forceinline__ float transformation(const float * __restrict__ x, const int i) { return (100.f * (x[i+1] - x[i] * x[i]) * (x[i+1] - x[i] * x[i]) + (x[i] - 1.f) * (x[i] - 1.f)); }

// --- Device-side function pointer
__device__ pointFunction_t dev_pfunc = transformation;

/***********************************/
/* COST FUNCTION - GPU/THRUST CASE */
/***********************************/
struct CostFunctionStructGPU {
    template <typename Tuple>
    __host__ __device__ float operator()(Tuple a) {

        float temp1 = (thrust::get<1>(a) - thrust::get<0>(a) * thrust::get<0>(a));
        float temp2 = (thrust::get<0>(a) - 1.f);

        return 100.f * temp1 * temp1 + temp2 * temp2;
    }
};

/********/
/* MAIN */
/********/
int main()
{
    const int N = 90000000;

    float *x = (float *)malloc(N * sizeof(float));
    for (int i=0; i<N; i++) x[i] = 3.f;

    float *d_x; gpuErrchk(cudaMalloc((void**)&d_x, N * sizeof(float)));
    gpuErrchk(cudaMemcpy(d_x, x, N * sizeof(float), cudaMemcpyHostToDevice));

    /************************************************/
    /* "CUSTOMIZED" DEVICE-SIDE TRANSFORM REDUCTION */
    /************************************************/
    // --- Warm-up call, so that the timed call below does not include one-off initialization overhead
    float customizedDeviceResult = transform_reduce(d_x, N - 1, &dev_pfunc);

    TimingGPU timerGPU;
    timerGPU.StartCounter();
    customizedDeviceResult = transform_reduce(d_x, N - 1, &dev_pfunc);
    printf("Timing for 'customized', device-side transform reduction = %f\n", timerGPU.GetCounter());
    printf("Result for 'customized', device-side transform reduction = %f\n", customizedDeviceResult);
    printf("\n\n");

    /************************************************/
    /* THRUST-BASED DEVICE-SIDE TRANSFORM REDUCTION */
    /************************************************/
    thrust::device_vector<float> d_vec(N, 3.f);

    timerGPU.StartCounter();
    float ThrustResult = thrust::transform_reduce(thrust::make_zip_iterator(thrust::make_tuple(d_vec.begin(), d_vec.begin() + 1)),
                                                  thrust::make_zip_iterator(thrust::make_tuple(d_vec.begin() + N - 1, d_vec.begin() + N)),
                                                  CostFunctionStructGPU(), 0.f, thrust::plus<float>());
    printf("Timing for Thrust-based, device-side transform reduction = %f\n", timerGPU.GetCounter());
    printf("Result for Thrust-based, device-side transform reduction = %f\n", ThrustResult);
    printf("\n\n");

    /*********************************/
    /* HOST-SIDE TRANSFORM REDUCTION */
    /*********************************/
    // thrust::host_vector<float> h_vec(d_vec);
    //sum_host = sum_host + transformation(thrust::raw_pointer_cast(h_vec.data()), i);
    TimingCPU timerCPU;
    timerCPU.StartCounter();
    float sum_host = 0.f;
    for (int i=0; i<N-1; i++) {
        float temp = (100.f * (x[i+1] - x[i] * x[i]) * (x[i+1] - x[i] * x[i]) + (x[i] - 1.f) * (x[i] - 1.f));
        sum_host = sum_host + temp;
        //printf("%i %f %f\n", i, temp, sum_host);
    }
    printf("Timing for host-side transform reduction = %f\n", timerCPU.GetCounter());
    printf("Result for host-side transform reduction = %f\n", sum_host);
    printf("\n\n");

    // --- Same host-side reduction, now using Kahan (compensated) summation to reduce round-off error
    sum_host = 0.f;
    float c = 0.f;
    for (int i=0; i<N-1; i++) {
        float temp = (100.f * (x[i+1] - x[i] * x[i]) * (x[i+1] - x[i] * x[i]) + (x[i] - 1.f) * (x[i] - 1.f)) - c;
        float t = sum_host + temp;
        c = (t - sum_host) - temp;
        sum_host = t;
    }
    printf("Result for host-side transform reduction = %f\n", sum_host);

    cudaDeviceReset();
}
transform_reduce.cuh
#ifndef TRANSFORM_REDUCE_CUH
#define TRANSFORM_REDUCE_CUH

// --- Type of the transformation function; the signature matches the
//     __device__ transformation defined in kernel.cu
typedef float(*pointFunction_t)(const float * __restrict__, const int);

// --- Customized transform + reduce entry point (instantiated in transform_reduce.cu)
template <class T> T transform_reduce(T *g_idata, unsigned int N, pointFunction_t *dev_pfunc);

#endif
transform_reduce.cu
#include <stdio.h>

#include "Utilities.cuh"
#include "transform_reduce.cuh"

#define BLOCKSIZE 512
#define warpSize 32

// --- Host-side function pointer
pointFunction_t h_pfunc;

// --- Uncomment if you want to apply CUDA error checking to the kernel launches
//#define DEBUG

//#define EXTERNAL

/*******************************************************/
/* CALCULATING THE NEXT POWER OF 2 OF A CERTAIN NUMBER */
/*******************************************************/
unsigned int nextPow2(unsigned int x)
{
    --x;
    x |= x >> 1;
    x |= x >> 2;
    x |= x >> 4;
    x |= x >> 8;
    x |= x >> 16;
    return ++x;
}

/*************************************/
/* CHECK IF A NUMBER IS A POWER OF 2 */
/*************************************/
// --- Note: although x = 1 is a power of 2 (1 = 2^0), this routine returns 0 for x == 1.
bool isPow2(unsigned int x) {
    if (x == 1) return 0;
    else        return ((x&(x-1))==0);
}

/***************************/
/* TRANSFORMATION FUNCTION */
/***************************/
template <class T>
__host__ __device__ __forceinline__ T transformation(const T * __restrict__ x, const int i) { return ((T)100 * (x[i+1] - x[i] * x[i]) * (x[i+1] - x[i] * x[i]) + (x[i] - (T)1) * (x[i] - (T)1)); }

/********************/
/* REDUCTION KERNEL */
/********************/
/*
    This version adds multiple elements per thread sequentially. This reduces the overall
    cost of the algorithm while keeping the work complexity O(n) and the step complexity O(log n).
    (Brent's Theorem optimization)

    Note, this kernel needs a minimum of 64*sizeof(T) bytes of shared memory.
    In other words if blockSize <= 32, allocate 64*sizeof(T) bytes.
    If blockSize > 32, allocate blockSize*sizeof(T) bytes.
*/
template <class T, unsigned int blockSize, bool nIsPow2>
__global__ void reductionKernel(T *g_idata, T *g_odata, unsigned int N, pointFunction_t pPointTransformation)
{
    extern __shared__ T sdata[];

    unsigned int tid      = threadIdx.x;                              // Local thread index
    unsigned int i        = blockIdx.x*(blockDim.x*2) + threadIdx.x;  // Global thread index - Fictitiously double the block dimension
    unsigned int gridSize = blockSize*2*gridDim.x;

    // --- Performs the first level of reduction in registers when reading from global memory on multiple elements per thread.
    //     More blocks will result in a larger gridSize and therefore fewer elements per thread
    T mySum = 0;

    while (i < N) {
#ifdef EXTERNAL
        mySum += (*pPointTransformation)(g_idata, i);
#else
        mySum += transformation(g_idata, i);
#endif
        // --- Ensure we don't read out of bounds -- this is optimized away for powerOf2 sized arrays
        if (nIsPow2 || i + blockSize < N)
#ifdef EXTERNAL
            mySum += (*pPointTransformation)(g_idata, i+blockSize);
#else
            mySum += transformation(g_idata, i+blockSize);
#endif
        i += gridSize;
    }

    // --- Each thread puts its local sum into shared memory
    sdata[tid] = mySum;
    __syncthreads();

    // --- Reduction in shared memory. Fully unrolled loop.
    if ((blockSize >= 512) && (tid < 256)) sdata[tid] = mySum = mySum + sdata[tid + 256];
    __syncthreads();

    if ((blockSize >= 256) && (tid < 128)) sdata[tid] = mySum = mySum + sdata[tid + 128];
    __syncthreads();

    if ((blockSize >= 128) && (tid <  64)) sdata[tid] = mySum = mySum + sdata[tid +  64];
    __syncthreads();

#if (__CUDA_ARCH__ >= 300)
    // --- Single warp reduction by shuffle operations
    if (tid < 32) {
        // --- Last iteration removed from the for loop, but needed for shuffle reduction
        mySum += sdata[tid + 32];
        // --- Reduce final warp using shuffle
        for (int offset = warpSize/2; offset > 0; offset /= 2) mySum += __shfl_down(mySum, offset);
        //for (int offset=1; offset < warpSize; offset *= 2) mySum += __shfl_xor(mySum, i);
    }
#else
    // --- Reduction within a single warp. Fully unrolled loop.
    if ((blockSize >= 64) && (tid < 32)) sdata[tid] = mySum = mySum + sdata[tid + 32]; __syncthreads();
    if ((blockSize >= 32) && (tid < 16)) sdata[tid] = mySum = mySum + sdata[tid + 16]; __syncthreads();
    if ((blockSize >= 16) && (tid <  8)) sdata[tid] = mySum = mySum + sdata[tid +  8]; __syncthreads();
    if ((blockSize >=  8) && (tid <  4)) sdata[tid] = mySum = mySum + sdata[tid +  4]; __syncthreads();
    if ((blockSize >=  4) && (tid <  2)) sdata[tid] = mySum = mySum + sdata[tid +  2]; __syncthreads();
    if ((blockSize >=  2) && (tid <  1)) sdata[tid] = mySum = mySum + sdata[tid +  1]; __syncthreads();
#endif

    // --- Write result for this block to global memory. At the end of the kernel, global memory will contain the results for the summations of
    //     individual blocks
    if (tid == 0) g_odata[blockIdx.x] = mySum;
}

/******************************/
/* REDUCTION WRAPPER FUNCTION */
/******************************/
template <class T>
T transform_reduce_inner(T *g_idata, unsigned int N, pointFunction_t h_pfunc) {

    // --- Reduction parameters
    const int NumThreads = (N < BLOCKSIZE) ? nextPow2(N) : BLOCKSIZE;
    const int NumBlocks  = (N + NumThreads - 1) / NumThreads;
    const int smemSize   = (NumThreads <= 32) ? 2 * NumThreads * sizeof(T) : NumThreads * sizeof(T);

    // --- Device memory space where storing the partial reduction results
    T *g_odata; gpuErrchk(cudaMalloc((void**)&g_odata, NumBlocks * sizeof(T)));

    if (isPow2(N)) {
        switch (NumThreads) {
            case 512: reductionKernel<T, 512, true><<< NumBlocks, NumThreads, smemSize>>>(g_idata, g_odata, N, h_pfunc); break;
            case 256: reductionKernel<T, 256, true><<< NumBlocks, NumThreads, smemSize>>>(g_idata, g_odata, N, h_pfunc); break;
            case 128: reductionKernel<T, 128, true><<< NumBlocks, NumThreads, smemSize>>>(g_idata, g_odata, N, h_pfunc); break;
            case  64: reductionKernel<T,  64, true><<< NumBlocks, NumThreads, smemSize>>>(g_idata, g_odata, N, h_pfunc); break;
            case  32: reductionKernel<T,  32, true><<< NumBlocks, NumThreads, smemSize>>>(g_idata, g_odata, N, h_pfunc); break;
            case  16: reductionKernel<T,  16, true><<< NumBlocks, NumThreads, smemSize>>>(g_idata, g_odata, N, h_pfunc); break;
            case   8: reductionKernel<T,   8, true><<< NumBlocks, NumThreads, smemSize>>>(g_idata, g_odata, N, h_pfunc); break;
            case   4: reductionKernel<T,   4, true><<< NumBlocks, NumThreads, smemSize>>>(g_idata, g_odata, N, h_pfunc); break;
            case   2: reductionKernel<T,   2, true><<< NumBlocks, NumThreads, smemSize>>>(g_idata, g_odata, N, h_pfunc); break;
            case   1: reductionKernel<T,   1, true><<< NumBlocks, NumThreads, smemSize>>>(g_idata, g_odata, N, h_pfunc); break;
        }
#ifdef DEBUG
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());
#endif
    }
    else {
        switch (NumThreads) {
            case 512: reductionKernel<T, 512, false><<< NumBlocks, NumThreads, smemSize>>>(g_idata, g_odata, N, h_pfunc); break;
            case 256: reductionKernel<T, 256, false><<< NumBlocks, NumThreads, smemSize>>>(g_idata, g_odata, N, h_pfunc); break;
            case 128: reductionKernel<T, 128, false><<< NumBlocks, NumThreads, smemSize>>>(g_idata, g_odata, N, h_pfunc); break;
            case  64: reductionKernel<T,  64, false><<< NumBlocks, NumThreads, smemSize>>>(g_idata, g_odata, N, h_pfunc); break;
            case  32: reductionKernel<T,  32, false><<< NumBlocks, NumThreads, smemSize>>>(g_idata, g_odata, N, h_pfunc); break;
            case  16: reductionKernel<T,  16, false><<< NumBlocks, NumThreads, smemSize>>>(g_idata, g_odata, N, h_pfunc); break;
            case   8: reductionKernel<T,   8, false><<< NumBlocks, NumThreads, smemSize>>>(g_idata, g_odata, N, h_pfunc); break;
            case   4: reductionKernel<T,   4, false><<< NumBlocks, NumThreads, smemSize>>>(g_idata, g_odata, N, h_pfunc); break;
            case   2: reductionKernel<T,   2, false><<< NumBlocks, NumThreads, smemSize>>>(g_idata, g_odata, N, h_pfunc); break;
            case   1: reductionKernel<T,   1, false><<< NumBlocks, NumThreads, smemSize>>>(g_idata, g_odata, N, h_pfunc); break;
        }
#ifdef DEBUG
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());
#endif
    }

    // --- The last part of the reduction, which would be expensive to perform on the device, is executed on the host
    T *host_vector = (T *)malloc(NumBlocks * sizeof(T));
    gpuErrchk(cudaMemcpy(host_vector, g_odata, NumBlocks * sizeof(T), cudaMemcpyDeviceToHost));

    T sum_transformReduce = (T)0;
    for (int i=0; i<NumBlocks; i++) sum_transformReduce = sum_transformReduce + host_vector[i];

    return sum_transformReduce;
}

template <class T>
T transform_reduce(T *g_idata, unsigned int N, pointFunction_t *dev_pfunc) {
#ifdef EXTERNAL
    gpuErrchk(cudaMemcpyFromSymbol(&h_pfunc, *dev_pfunc, sizeof(pointFunction_t)));
#endif
    T customizedDeviceResult = transform_reduce_inner(g_idata, N, h_pfunc);
    return customizedDeviceResult;
}

// --- Complete with your own favourite instantiations
template float transform_reduce(float *, unsigned int, pointFunction_t *);
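The same machinery can be reused for other cost functionals simply by swapping the transformation. A minimal sketch for the Sphere function, assuming the EXTERNAL path and the declarations in transform_reduce.cuh above (the identifiers sphereTransformation and dev_spherePfunc are mine, and the snippet is meant to be added to kernel.cu rather than compiled on its own):

// --- Hypothetical additional transformation for the Sphere function f(x) = sum_i x_i^2,
//     to be placed in kernel.cu next to the Rosenbrock transformation
__device__ __forceinline__ float sphereTransformation(const float * __restrict__ x, const int i) { return x[i] * x[i]; }

// --- Device-side function pointer fetched inside transform_reduce via cudaMemcpyFromSymbol
__device__ pointFunction_t dev_spherePfunc = sphereTransformation;

// --- Usage in main(): note N (not N - 1) terms, since x[i+1] is never accessed
// float sphereResult = transform_reduce(d_x, N, &dev_spherePfunc);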