OpenCL Total Reduction

I would like to apply a reduction to this part of the kernel code (1 dimensional data):

__local float sum = 0;
int i;
for(i = 0; i < length; i++)
  sum += //some operation depending on i here;

Instead of having only one thread perform this operation, I would like to have n threads (with n = length) and, at the end, one thread that computes the total.

In pseudo code, I would like to write something like this:

int i = get_global_id(0);
__local float sum = 0;
sum += //some operation depending on i here;
barrier(CLK_LOCAL_MEM_FENCE);
if(i == 0)
  res = sum;

Is there any way?

As written, I have a race condition on the sum.

4 answers

To get started, you can do something like the example below (see Scarpino). Here we also take advantage of vector processing by using the OpenCL float4 data type.

Keep in mind that the kernel below returns a number of partial sums to the host, one per work-group. This means you still have to carry out the final sum by adding up the partial sums on the host (a minimal host-side sketch follows the kernel below). This is because, at least with OpenCL 1.2, there is no barrier that synchronizes work-items across different work-groups.

If summing the partial sums on the host is undesirable, you can get around it by launching the kernel multiple times. This introduces some kernel-call overhead, but in many applications the extra cost is acceptable or insignificant. To do that with the example below, modify your host code to call the kernel repeatedly and add logic to stop once the number of output values falls below the local work-group size (details left to you, or see the Scarpino reference).

EDIT: Added an extra kernel argument for the output. Added a dot product to sum the four components of the float4 partial result.

__kernel void reduction_vector(__global float4* data, __local float4* partial_sums, __global float* output)
{
    int lid = get_local_id(0);
    int group_size = get_local_size(0);

    // Each work-item loads one float4 element into local memory.
    partial_sums[lid] = data[get_global_id(0)];
    barrier(CLK_LOCAL_MEM_FENCE);

    // Tree reduction in local memory: halve the number of active work-items each pass.
    for(int i = group_size/2; i > 0; i >>= 1) {
        if(lid < i) {
            partial_sums[lid] += partial_sums[lid + i];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    // Work-item 0 collapses the remaining float4 into one scalar partial sum
    // for this work-group (the dot product with (1,1,1,1) adds the four components).
    if(lid == 0) {
        output[get_group_id(0)] = dot(partial_sums[0], (float4)(1.0f));
    }
}
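
For completeness, a minimal host-side sketch of the final step mentioned above: read back the per-work-group partial sums and add them on the CPU. The names queue, output_buf and num_groups are hypothetical and stand for objects created during the usual OpenCL setup.

// Host-side fragment (assumes <stdlib.h> and the OpenCL headers are included,
// and that the kernel above has already been enqueued and finished).
float *partial = (float *)malloc(num_groups * sizeof(float));
clEnqueueReadBuffer(queue, output_buf, CL_TRUE, 0,
                    num_groups * sizeof(float), partial, 0, NULL, NULL);

float total = 0.0f;
for (size_t g = 0; g < num_groups; ++g)
    total += partial[g];   // final sum of the per-group results
free(partial);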

Of everything I tried, the simpler answers either did not work for me or were inefficient, because they go through global memory on every pass or pay a kernel-launch overhead per pass.

The straightforward halve-and-add approach also breaks down on iterations where the number of remaining elements is not even, yet essentially that code appears in many search results.

I scratched my head over this for several days, partly because my language of choice is not C/C++ based, and partly because it is tricky, if not impossible, to debug on the GPU. Eventually I found an answer that worked.

The kernel below copies the source data from global memory into local memory and then reduces by repeatedly folding the top half of the block onto the bottom half, until only one value is left.

The result is a buffer containing one value per work-group used (so that very large reductions can be broken down); these per-group values must then be summed by the CPU, or by another kernel that performs this last step on the GPU.

Note the extra branch that handles blocks whose size does not split evenly: when the block is odd, work-item 0 also folds in the last element so that nothing is dropped.

Here is the kernel. The "AOffset" parameter lets the summation start at an offset into the source data, which is useful when the reduction has to be processed in pieces.

__kernel void Sum(__global float* A, __global float* output, ulong AOffset, __local float* target) {
    const size_t globalId = get_global_id(0);
    const size_t localId = get_local_id(0);

    // Each work-item copies one element from global into local memory.
    target[localId] = A[globalId + AOffset];
    barrier(CLK_LOCAL_MEM_FENCE);

    size_t blockSize = get_local_size(0);
    size_t halfBlockSize = blockSize / 2;
    while (halfBlockSize > 0) {
        if (localId < halfBlockSize) {
            target[localId] += target[localId + halfBlockSize];
            if ((halfBlockSize * 2) < blockSize) { // uneven block division
                if (localId == 0) { // fold the leftover last element into element 0
                    target[localId] += target[localId + (blockSize - 1)];
                }
            }
        }
        barrier(CLK_LOCAL_MEM_FENCE);
        blockSize = halfBlockSize;
        halfBlockSize = blockSize / 2;
    }
    if (localId == 0) {
        output[get_group_id(0)] = target[0];
    }
}

A full working example is here: https://pastebin.com/xN4yQ28N
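
One host-side detail that is easy to miss with kernels like this one, which take a __local pointer argument: the host passes only the required size and a NULL pointer for that argument. A minimal sketch, where sum_kernel, input_buf, output_buf, local_size and a_offset are all hypothetical names for objects set up elsewhere:

// Hypothetical host-side argument setup for the Sum kernel above.
size_t local_size = 256;       // work-items per work-group (assumption)
cl_ulong a_offset = 0;         // start of the slice to sum

clSetKernelArg(sum_kernel, 0, sizeof(cl_mem), &input_buf);
clSetKernelArg(sum_kernel, 1, sizeof(cl_mem), &output_buf);
clSetKernelArg(sum_kernel, 2, sizeof(cl_ulong), &a_offset);
// __local buffers have no host-side allocation: give the size, pass NULL.
clSetKernelArg(sum_kernel, 3, local_size * sizeof(cl_float), NULL);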


A simple and fast way to reduce data is to repeatedly fold the top half of the array onto the bottom half.

For example, with the following very simple CL kernel:

__kernel void foldKernel(__global float *arVal, int offset) {
    int gid = get_global_id(0);
    arVal[gid] = arVal[gid]+arVal[gid+offset];
}

driven by Java/JOCL host code like this (it would look much the same in C++ etc.):

    int t = totalDataSize;
    while (t > 1) {
        int m = t / 2;
        int n = (t + 1) / 2;
        clSetKernelArg(kernelFold, 0, Sizeof.cl_mem, Pointer.to(arVal));
        clSetKernelArg(kernelFold, 1, Sizeof.cl_int, Pointer.to(new int[]{n}));
        cl_event evFold = new cl_event();
        clEnqueueNDRangeKernel(commandQueue, kernelFold, 1, null, new long[]{m}, null, 0, null, evFold);
        clWaitForEvents(1, new cl_event[]{evFold});
        t = n;
    }

This loop makes log2(n) passes over the data, so it finishes quickly even for large arrays. The juggling with "m" and "n" handles array lengths that are not a power of two: for example, with t = 7 we launch m = 3 work-items that each add the element n = 4 positions above them, and the middle element simply waits for the next pass.

  • OpenCL parallelizes this well on any GPU platform (the main point).
  • The kernel is only a few lines and needs no local memory.
  • It works with data sizes that are not a power of two.
  • It is easy to adapt to other reductions, e.g. use "min" instead of "+" (a sketch follows this list).

If you can target OpenCL C 2.0, there is a built-in work_group_reduce_add() that does the per-work-group summation for you.
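
A minimal sketch of how that could look, assuming the program is built with -cl-std=CL2.0; like the kernels above, it leaves one partial result per work-group for the host (or a follow-up kernel) to combine:

__kernel void reduce_wg(__global const float* data, __global float* partial) {
    // Every work-item contributes one element; the built-in returns the
    // work-group-wide sum to all work-items in the group.
    float sum = work_group_reduce_add(data[get_global_id(0)]);
    if (get_local_id(0) == 0)
        partial[get_group_id(0)] = sum;
}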
