NEON XOR implementation optimization

Question

NEON XOR implementation optimization

Having tried xor a huge array uint32, I decided to use the NEON coprocessor.

I implemented two versions c:

version 1:

uint32_t xor_array_ver_1(uint32_t *array, int size)
{
    uint32x2_t acc = vmov_n_u32(0);
    uint32_t acc1 = 0;
    for (; size != 0; size -= 2) {
        uint32x2_t vec;
        vec = vld1_u32(array);
        array += 2;
        acc = veor_u32(acc, vec);
    }
    acc1 = vget_lane_u32(acc,0) ^ vget_lane_u32(acc,1);
    return acc1;
}

version 2:

uint32_t xor_array_ver_2(uint32_t *array, int size)
{
    uint32x4_t acc = vmovq_n_u32(0);
    uint32_t acc1 = 0;

    for (; size != 0; size -= 4) {
        uint32x4_t vec;
        vec = vld1q_u32(array);
        array += 4;
        acc = veorq_u32(acc, vec);
    }

    acc1 ^= vgetq_lane_u32(acc,0);
    acc1 ^= vgetq_lane_u32(acc,1);
    acc1 ^= vgetq_lane_u32(acc,2);
    acc1 ^= vgetq_lane_u32(acc,3);

    return acc1;
}

Comparing the above versions with the traditional xor implementation:

for (i=0; i<arr_size; i++)
        val ^= my_array[i];

I noticed 2 questions:

Version 1 has the same performance.
Version 2 has more bits of 30% .

Can I rewrite it even better? where is my_arraydeclared as uint32_t my_array[BIG_LENGTH];
Is there a non-NEON way that I can improve the performance of regular xoring code? expanding the loop does not give any improvement.

+4

optimization c arm cpu-cache neon

0x90 Oct 3 '13 at 15:21

source share

4

, gcc . , , asm , 30% c. , . intrinsics asm - armcc ( ), .

, c (-):

for (i=arr_size; i<arr_size; i -= 4)
{
    val1 ^= my_array[0];
    val2 ^= my_array[1];
    val1 ^= my_array[2];
    val2 ^= my_array[3];
    my_array += 4;
}

- . , neon asm, ( , ).

NEON asm ( , , , )

//data has to be suitably aligned (it has to be 8 or 16 byte aligned, not sure).
//dataSize in bytes has to be multiple of 64 and has to be at least 128.
//function does xor of uint32_t values and returns the result.
unsigned xor_array_64(const void *data, int dataSize);

xor_array_64:
      vldm r0!,{d0-d7}
      subs r1,r1,#0x40
0:
      pld [r0, #0xC0]
      vldm r0!,{d16-d23}
      veor q0, q0, q8
      veor q1, q1, q9
      veor q2, q2, q10
      veor q3, q3, q11
      subs r1,r1,#0x40
      bge 0b

      veor q0, q0, q1
      veor q2, q2, q3
      veor q0, q0, q2
      veor d0, d0, d1

      vtrn.32 d1, d0
      veor d0, d0, d1

      vmov r0, s0
      bx lr

+2

Pavel 03 . '13 15:50

- .

, ? ? ? , , . , , , . , L1, L2 /, , .

Compiler

? , , ? , . ? (gcc: -O3), (gcc: -ftree-vectorize -ftree-vectorizer-verbose = 1)? (-mcpu -mfpu)?

, ? .

Tweaks

, ?

? (, , , 2 4, , , % 30.)

?

, ? , Cortex-A9 "" . ?

, - "" " ".

+2

auselen 04 . '13 8:59

ARM, NEON, , ARM NEON, , , ....

Paul R , , , , .....

uint32_t xor_array_ver_2(uint32_t *array, int size)
{
  // Caveat:  'size' must be a positive multiple of 4, otherwise this
  //          code will loop for a very long time... and almost certainly
  //          segfault (or whatever term your system uses).

  uint32x4_t acc = vmovq_n_u32(0);
  uint32x4_t next_vec = vld1q_u32(array);
  uint32_t acc1 = 0;

  for (size-=4, array+=4; size != 0; size-=4) {
     uint32x4_t vec = next_vec;
     array += 4;
     next_vec = vld1q_u32(array);
     acc = veorq_u32(acc, vec);
  }
  acc = veorq_u32(acc, next_vec);

  acc1 ^= vgetq_lane_u32(acc,0);
  acc1 ^= vgetq_lane_u32(acc,1);
  acc1 ^= vgetq_lane_u32(acc,2);
  acc1 ^= vgetq_lane_u32(acc,3);

  return acc1;
}

.... , .

, :

uint32_t xor_array_ver_2(uint32_t *array, int size)
{
  // Caveat:  'size' must be a positive multiple of 4, otherwise this
  //          code will loop for a very long time... and almost certainly
  //          segfault (or whatever term your system uses).

  uint32x4_t acc = vmovq_n_u32(0);
  uint32x4_t next_vec = vld1q_u32(&array[size-4]);
  uint32_t acc1 = 0;

  for (size-=8; size>=0; size-=4) {
     uint32x4_t vec = next_vec;
     next_vec = vld1q_u32(&array[size]);
     acc = veorq_u32(acc, vec);
  }
  acc = veorq_u32(acc, next_vec);

  acc1 ^= vgetq_lane_u32(acc,0);
  acc1 ^= vgetq_lane_u32(acc,1);
  acc1 ^= vgetq_lane_u32(acc,2);
  acc1 ^= vgetq_lane_u32(acc,3);

  return acc1;
}

+1

phonetagger 03 . '13 18:52

Paul R · Accepted Answer · 2013-10-03T16:18:17+0000

, - DRAM, ALU , .

XOR , - .

NEON XOR implementation optimization

Compiler

Tweaks

More articles: