Summing a Summing Array Using OpenCL

I calculate the Euclidean distance between n-dimensional points using OpenCL. I get two lists of n-dimensional points, and I have to return an array containing only the distances from each point in the first table to each point in the second table.

My approach is to make a normal doble cycle (for each point in table 1 {for each point in table 2 {...}}, and then do the calculation for each pair of points in parallel.

The Euclidean distance is then divided into 3 parts: 1. accept the difference between each measurement at points 2. the square of this difference (still for each measurement) 3. summarize all the values ​​obtained in 2. 4. Take the square root of the value obtained in 3 . (this step was omitted in this example.)

Everything works like a charm until I try to accumulate the sum of all the differences (namely, by completing step 3 of the above procedure, line 49 below).

As test data I use DescriptorLists for 2 points: DescriptorList1: 001,002,003, ..., 127,128; (P1) 129130131, ..., 255256; (P2)

DescriptorList2: 000,001,002, ..., 126,127; (P1) 128129130, ..., 254255; (P2)

Thus, the resulting vector must have values: 128, 2064512, 2130048, 128 Now I get random numbers that change with each run.

I appreciate any help or leads to what I'm doing wrong. Hopefully everything is clear about the scenario in which I work.

#define BLOCK_SIZE 128

typedef struct
{
    //How large each point is
    int length;
    //How many points in every list
    int num_elements;
    //Pointer to the elements of the descriptor (stored as a raw array)
    __global float *elements;
} DescriptorList;

__kernel void CompareDescriptors_deb(__global float *C, DescriptorList A, DescriptorList B, int elements, __local float As[BLOCK_SIZE])
{

    int gpidA = get_global_id(0);

    int featA = get_local_id(0);

    //temporary array  to store the difference between each dimension of 2 points
    float dif_acum[BLOCK_SIZE];

    //counter to track the iterations of the inner loop
    int loop = 0;

    //loop over all descriptors in A
    for (int i = 0; i < A.num_elements/BLOCK_SIZE; i++){

        //take the i-th descriptor. Returns a DescriptorList with just the i-th
        //descriptor in DescriptorList A
        DescriptorList tmpA = GetDescriptor(A, i);

        //copy the current descriptor to local memory.
        //returns one element of the only descriptor in DescriptorList tmpA
        //and index featA
        As[featA] = GetElement(tmpA, 0, featA);
        //wait for all the threads to finish copying before continuing
        barrier(CLK_LOCAL_MEM_FENCE);

        //loop over all the descriptors in B
        for (int k = 0; k < B.num_elements/BLOCK_SIZE; k++){
            //take the difference of both current points
            dif_acum[featA] = As[featA]-B.elements[k*BLOCK_SIZE + featA];
            //wait again
            barrier(CLK_LOCAL_MEM_FENCE);
            //square value of the difference in dif_acum and store in C
            //which is where the results should be stored at the end.
            C[loop] = 0;
            C[loop] += dif_acum[featA]*dif_acum[featA];
            loop += 1;
            barrier(CLK_LOCAL_MEM_FENCE);
        }
    }
}
+5
source share
2 answers

:

C[loop] = 0;
C[loop] += dif_acum[featA]*dif_acum[featA];

(, , , ) - . :

  • , , , C [loop] = 0 ,
  • , , C [loop], . , ( , , , ), .

: , , :

local float* accum;
...
accum[featA] = dif_acum[featA]*dif_acum[featA];
barrier(CLK_LOCAL_MEM_FENCE);
for(unsigned int i = 1; i < BLOCKSIZE; i *= 2)
{
    if ((featA % (2*i)) == 0)
        accum[featA] += accum[featA + i];
    barrier(CLK_LOCAL_MEM_FENCE);
}
if(featA == 0)
    C[loop] = accum[0];

, , , (btw: , dif_acum , , - , , A ).

:

  • , , ( , , , ), .
  • , , get_local_size(0) workgroupsize, Define ( , , opencl-)
  • , , . .

, :

float As = GetElement(tmpA, 0, featA);
...
float dif_acum = As-B.elements[k*BLOCK_SIZE + featA];

( ):

__kernel void CompareDescriptors_deb(__global float *C, DescriptorList A, DescriptorList B, int elements, __local float accum[BLOCK_SIZE])
{
   int gpidA = get_global_id(0);
   int featA = get_local_id(0);
   int loop = 0;
   for (int i = 0; i < A.num_elements/BLOCK_SIZE; i++){
       DescriptorList tmpA = GetDescriptor(A, i);
       float As = GetElement(tmpA, 0, featA);
       for (int k = 0; k < B.num_elements/BLOCK_SIZE; k++){
           float dif_acum = As-B.elements[k*BLOCK_SIZE + featA];

           accum[featA] = dif_acum[featA]*dif_acum[featA];
           barrier(CLK_LOCAL_MEM_FENCE);
           for(unsigned int i = 1; i < BLOCKSIZE; i *= 2)
           {
              if ((featA % (2*i)) == 0)
                 accum[featA] += accum[featA + i];
              barrier(CLK_LOCAL_MEM_FENCE);
           }
           if(featA == 0)
              C[loop] = accum[0];
           barrier(CLK_LOCAL_MEM_FENCE);

           loop += 1;
        }
    }
}
+7

Grizzly . , , :

IF , , , .

if(featA > BLOCK_SIZE){return;}

() (, B) , GetElement ( ).

Bs[featA] = GetElement(tmpA, 0, featA);

SCAN , , , . "" dif_acum .

dif_acum[featA] = accum[featA];

SCAN , , .

if (featA >= j && next_addend >= 0 && next_addend < BLOCK_SIZE){

, IF, .

if(featA == 0){
    C[loop] = accum[BLOCK_SIZE-1];
    loop += 1;
}

. , group_size BLOCK_SIZE , .

, , , :

__kernel void CompareDescriptors(__global float *C, DescriptorList A, DescriptorList B, int elements, __local float accum[BLOCK_SIZE], __local float Bs[BLOCK_SIZE])
{

    int gpidA = get_global_id(0);
    int featA = get_local_id(0);

    //global counter to store final differences
    int loop = 0;

    //auxiliary buffer to store temporary data
    local float dif_acum[BLOCK_SIZE];

    //discard the threads that are not going to be used.
    if(featA > BLOCK_SIZE){
        return;
    }

    //loop over all descriptors in A
    for (int i = 0; i < A.num_elements/BLOCK_SIZE; i++){

        //take the gpidA-th descriptor
        DescriptorList tmpA = GetDescriptor(A, i);

        //copy the current descriptor to local memory
        Bs[featA] = GetElement(tmpA, 0, featA);

        //loop over all the descriptors in B
        for (int k = 0; k < B.num_elements/BLOCK_SIZE; k++){
            //take the difference of both current descriptors
            dif_acum[featA] = Bs[featA]-B.elements[k*BLOCK_SIZE + featA];

            //square the values in dif_acum
            accum[featA] = dif_acum[featA]*dif_acum[featA];
            barrier(CLK_LOCAL_MEM_FENCE);

            //copy the values of accum to keep consistency once the scan procedure starts. Mostly important for the first element. Two buffers are necesarry because the scan procedure would override values that are then further read if one buffer is being used instead.
            dif_acum[featA] = accum[featA];

            //Compute the accumulated sum (a.k.a. scan)
            for(int j = 1; j < BLOCK_SIZE; j *= 2){
                int next_addend = featA-(j/2);
                if (featA >= j && next_addend >= 0 && next_addend < BLOCK_SIZE){
                    dif_acum[featA] = accum[featA] + accum[next_addend];
                }
                barrier(CLK_LOCAL_MEM_FENCE);

                //copy As to accum
                accum[featA] = GetElementArray(dif_acum, BLOCK_SIZE, featA); 
                barrier(CLK_LOCAL_MEM_FENCE);
            }

            //tell one of the threads to write the result of the scan in the array containing the results.
            if(featA == 0){
                C[loop] = accum[BLOCK_SIZE-1];
                loop += 1;
            }
            barrier(CLK_LOCAL_MEM_FENCE);

        }
    }
}
+3

All Articles