Implement this problem in 2D and launch your kernel using two-dimensional blocks. The number of threads along each of the x and y dimensions should cover `total` (one thread per index in each dimension). The kernel code should look like this:
// For every (n, i) index pair, computes the squared difference between
// values[i] and values[n] and stores it in newvalues[i] when it is below 10.
//
// values    - input array; elements n and i (both < total) are read
// newvalues - output array; element i is written when the condition holds
// total     - exclusive upper bound applied to both indices
//
// NOTE(review): all threads that share the same i (but differ in n) write to
// the same newvalues[i], so the stored value depends on which write lands
// last -- confirm this is the intended behavior.
__global__ void calc(float *values, float *newvalues, int total)
{
    // n comes from the y dimension of the launch, i from the x dimension.
    int n = blockIdx.y * blockDim.y + threadIdx.y;
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads whose indices fall outside [0, total) must not touch the arrays.
    if (n >= total || i >= total) return;

    float a = values[n];
    float b = values[i] - a;
    float c = b * b;
    if (c < 10)
        newvalues[i] = c;
}
Update:
This is how you should call the kernel
// 16x16 threads per block; the grid size is rounded up so that at least
// `total` threads exist along each of the x and y dimensions.
dim3 block(16, 16);
dim3 grid((total + 15) / 16, (total + 15) / 16);
// val and newval are the float arrays passed as the kernel's
// values / newvalues arguments; total is the same bound used to size the grid.
calc<<<grid, block>>>(val, newval, total);
Also make sure you add this bounds check to the kernel (see the updated kernel above):
if (n>=total || i>=total) return;
jwdmsd
source share