AVX2 byte integrates with uint16 indices in __m256i

I am trying to pack a __m256i variable with 32 characters from an array and specify indexes. here is my code:

char array[];         // different array every time.
uint16_t offset[32];  // same offset reused many times


_mm256_set_epi8(array[offset[0]], array[offset[1]], array[offset[2]], array[offset[3]], array[offset[4]], array[offset[5]], array[offset[6]], array[offset[7]],
      array[offset[8]],array[offset[9]],array[offset[10]],array[offset[11]], array[offset[12]], array[offset[13]], array[offset[14]], array[offset[15]], 
      array[offset[16]],array[offset[17]], array[offset[18]], array[offset[19]], array[offset[20]], array[offset[21]], array[offset[22]], array[offset[23]], 
      array[offset[24]],array[offset[25]],array[offset[26]], array[offset[27]], array[offset[28]], array[offset[29]], array[offset[30]],array[offset[31]])

This function will be called many times with the same offsets and different arrays. But I do not think that this is optimal according to my test. Is there any idea to improve it?

+6
source share
1 answer

, offset, ( ), , , offset ( array ).

, , gcc ( ) :

  lea r10, [rsp+8]
  and rsp, -32
  push QWORD PTR [r10-8]
  push rbp
  mov rbp, rsp
  push r15
  push r14
  push r13
  push r12
  push r10
  push rbx
  sub rsp, 40
  movzx eax, WORD PTR [rsi+40]
  movzx r14d, WORD PTR [rsi+60]
  movzx r12d, WORD PTR [rsi+56]
  movzx ecx, WORD PTR [rsi+44]
  movzx r15d, WORD PTR [rsi+62]
  movzx r13d, WORD PTR [rsi+58]
  mov QWORD PTR [rbp-56], rax
  movzx eax, WORD PTR [rsi+38]
  movzx ebx, WORD PTR [rsi+54]
  movzx r11d, WORD PTR [rsi+52]
  movzx r10d, WORD PTR [rsi+50]
  movzx r9d, WORD PTR [rsi+48]
  movzx r8d, WORD PTR [rsi+46]
  mov QWORD PTR [rbp-64], rax
  movzx eax, WORD PTR [rsi+36]
  movzx edx, WORD PTR [rsi+42]
  mov QWORD PTR [rbp-72], rax
  movzx eax, WORD PTR [rsi+34]
  mov QWORD PTR [rbp-80], rax
  movzx eax, WORD PTR [rsi+32]
  mov QWORD PTR [rbp-88], rax
  movzx eax, WORD PTR [rsi+30]
  movzx r15d, BYTE PTR [rdi+r15]
  mov QWORD PTR [rbp-96], rax
  movzx eax, WORD PTR [rsi+28]
  vmovd xmm2, r15d
  vpinsrb xmm2, xmm2, BYTE PTR [rdi+r14], 1
  mov QWORD PTR [rbp-104], rax
  movzx eax, WORD PTR [rsi+26]
  mov QWORD PTR [rbp-112], rax
  movzx eax, WORD PTR [rsi+24]
  mov QWORD PTR [rbp-120], rax
  movzx eax, WORD PTR [rsi+22]
  mov QWORD PTR [rbp-128], rax
  movzx eax, WORD PTR [rsi+20]
  mov QWORD PTR [rbp-136], rax
  movzx eax, WORD PTR [rsi+18]
  mov QWORD PTR [rbp-144], rax
  movzx eax, WORD PTR [rsi+16]
  mov QWORD PTR [rbp-152], rax
  movzx eax, WORD PTR [rsi+14]
  mov QWORD PTR [rbp-160], rax
  movzx eax, WORD PTR [rsi+12]
  mov QWORD PTR [rbp-168], rax
  movzx eax, WORD PTR [rsi+10]
  mov QWORD PTR [rbp-176], rax
  movzx eax, WORD PTR [rsi+8]
  mov QWORD PTR [rbp-184], rax
  movzx eax, WORD PTR [rsi+6]
  mov QWORD PTR [rbp-192], rax
  movzx eax, WORD PTR [rsi+4]
  mov QWORD PTR [rbp-200], rax
  movzx eax, WORD PTR [rsi+2]
  movzx esi, WORD PTR [rsi]
  movzx r13d, BYTE PTR [rdi+r13]
  movzx r8d, BYTE PTR [rdi+r8]
  movzx edx, BYTE PTR [rdi+rdx]
  movzx ebx, BYTE PTR [rdi+rbx]
  movzx r10d, BYTE PTR [rdi+r10]
  vmovd xmm7, r13d
  vmovd xmm1, r8d
  vpinsrb xmm1, xmm1, BYTE PTR [rdi+rcx], 1
  mov rcx, QWORD PTR [rbp-56]
  vmovd xmm5, edx
  vmovd xmm3, ebx
  mov rbx, QWORD PTR [rbp-72]
  vmovd xmm6, r10d
  vpinsrb xmm7, xmm7, BYTE PTR [rdi+r12], 1
  vpinsrb xmm5, xmm5, BYTE PTR [rdi+rcx], 1
  mov rcx, QWORD PTR [rbp-64]
  vpinsrb xmm6, xmm6, BYTE PTR [rdi+r9], 1
  vpinsrb xmm3, xmm3, BYTE PTR [rdi+r11], 1
  vpunpcklwd xmm2, xmm2, xmm7
  movzx edx, BYTE PTR [rdi+rcx]
  mov rcx, QWORD PTR [rbp-80]
  vpunpcklwd xmm1, xmm1, xmm5
  vpunpcklwd xmm3, xmm3, xmm6
  vmovd xmm0, edx
  movzx edx, BYTE PTR [rdi+rcx]
  mov rcx, QWORD PTR [rbp-96]
  vpunpckldq xmm2, xmm2, xmm3
  vpinsrb xmm0, xmm0, BYTE PTR [rdi+rbx], 1
  mov rbx, QWORD PTR [rbp-88]
  vmovd xmm4, edx
  movzx edx, BYTE PTR [rdi+rcx]
  mov rcx, QWORD PTR [rbp-112]
  vpinsrb xmm4, xmm4, BYTE PTR [rdi+rbx], 1
  mov rbx, QWORD PTR [rbp-104]
  vpunpcklwd xmm0, xmm0, xmm4
  vpunpckldq xmm0, xmm1, xmm0
  vmovd xmm1, edx
  movzx edx, BYTE PTR [rdi+rcx]
  vpinsrb xmm1, xmm1, BYTE PTR [rdi+rbx], 1
  mov rcx, QWORD PTR [rbp-128]
  mov rbx, QWORD PTR [rbp-120]
  vpunpcklqdq xmm0, xmm2, xmm0
  vmovd xmm8, edx
  movzx edx, BYTE PTR [rdi+rcx]
  vpinsrb xmm8, xmm8, BYTE PTR [rdi+rbx], 1
  mov rcx, QWORD PTR [rbp-144]
  mov rbx, QWORD PTR [rbp-136]
  vmovd xmm4, edx
  vpunpcklwd xmm1, xmm1, xmm8
  vpinsrb xmm4, xmm4, BYTE PTR [rdi+rbx], 1
  movzx edx, BYTE PTR [rdi+rcx]
  mov rbx, QWORD PTR [rbp-152]
  mov rcx, QWORD PTR [rbp-160]
  vmovd xmm7, edx
  movzx eax, BYTE PTR [rdi+rax]
  movzx edx, BYTE PTR [rdi+rcx]
  vpinsrb xmm7, xmm7, BYTE PTR [rdi+rbx], 1
  mov rcx, QWORD PTR [rbp-176]
  mov rbx, QWORD PTR [rbp-168]
  vmovd xmm5, eax
  vmovd xmm2, edx
  vpinsrb xmm5, xmm5, BYTE PTR [rdi+rsi], 1
  vpunpcklwd xmm4, xmm4, xmm7
  movzx edx, BYTE PTR [rdi+rcx]
  vpinsrb xmm2, xmm2, BYTE PTR [rdi+rbx], 1
  vpunpckldq xmm1, xmm1, xmm4
  mov rbx, QWORD PTR [rbp-184]
  mov rcx, QWORD PTR [rbp-192]
  vmovd xmm6, edx
  movzx edx, BYTE PTR [rdi+rcx]
  vpinsrb xmm6, xmm6, BYTE PTR [rdi+rbx], 1
  mov rbx, QWORD PTR [rbp-200]
  vmovd xmm3, edx
  vpunpcklwd xmm2, xmm2, xmm6
  vpinsrb xmm3, xmm3, BYTE PTR [rdi+rbx], 1
  add rsp, 40
  vpunpcklwd xmm3, xmm3, xmm5
  vpunpckldq xmm2, xmm2, xmm3
  pop rbx
  pop r10
  vpunpcklqdq xmm1, xmm1, xmm2
  pop r12
  pop r13
  vinserti128 ymm0, ymm0, xmm1, 0x1
  pop r14
  pop r15
  pop rbp
  lea rsp, [r10-8]
  ret

, offset , , , , 16- offset, ( 64- ) . , offset ( 64 ) : , , , offset.

4.9.2, , 7.2.


icc clang - , offset movzx, vpinsrb offset :

gather256(char*, unsigned short*): # @gather256(char*, unsigned short*)
  movzx eax, word ptr [rsi + 30]
  movzx eax, byte ptr [rdi + rax]
  vmovd xmm0, eax
  movzx eax, word ptr [rsi + 28]
  vpinsrb xmm0, xmm0, byte ptr [rdi + rax], 1
  movzx eax, word ptr [rsi + 26]
  vpinsrb xmm0, xmm0, byte ptr [rdi + rax], 2
  movzx eax, word ptr [rsi + 24]
  ...
  vpinsrb xmm0, xmm0, byte ptr [rdi + rax], 14
  movzx eax, word ptr [rsi]
  vpinsrb xmm0, xmm0, byte ptr [rdi + rax], 15
  movzx eax, word ptr [rsi + 62]
  movzx eax, byte ptr [rdi + rax]
  vmovd xmm1, eax
  movzx eax, word ptr [rsi + 60]
  vpinsrb xmm1, xmm1, byte ptr [rdi + rax], 1
  movzx eax, word ptr [rsi + 58]
  vpinsrb xmm1, xmm1, byte ptr [rdi + rax], 2
  movzx eax, word ptr [rsi + 56]
  vpinsrb xmm1, xmm1, byte ptr [rdi + rax], 3
  movzx eax, word ptr [rsi + 54]
  vpinsrb xmm1, xmm1, byte ptr [rdi + rax], 4
  movzx eax, word ptr [rsi + 52]
  ...
  movzx eax, word ptr [rsi + 32]
  vpinsrb xmm1, xmm1, byte ptr [rdi + rax], 15
  vinserti128 ymm0, ymm1, xmm0, 1
  ret

. vinserti128 xmm , -, , vpinserb 128 . , , , , 2 ​​ 5 () 1 . , , , 1 32 , 32 ( xmm , pinsrb, 1 1.

32 gcc? . :

uint64_t gather64(char *array, uint16_t *offset) {
  uint64_t ret;
  char *p = (char *)&ret;
  p[0] = array[offset[0]];
  p[1] = array[offset[1]];
  p[2] = array[offset[2]];
  p[3] = array[offset[3]];
  p[4] = array[offset[4]];
  p[5] = array[offset[5]];
  p[6] = array[offset[6]];
  p[7] = array[offset[7]];
  return ret;
}

__m256i gather256_gcc(char *array, uint16_t *offset) {

  return _mm256_set_epi64x(
    gather64(array, offset),
    gather64(array +  8, offset + 8),
    gather64(array + 16, offset + 16),
    gather64(array + 24, offset + 24)
  );
}

8 array , _mm256_set_epi64x. 2 1 8- 64- , 1 2.

"" inline code gcc:

gather256_gcc(char*, unsigned short*):
  lea r10, [rsp+8]
  and rsp, -32
  push QWORD PTR [r10-8]
  push rbp
  mov rbp, rsp
  push r10
  movzx eax, WORD PTR [rsi+48]
  movzx eax, BYTE PTR [rdi+24+rax]
  mov BYTE PTR [rbp-24], al
  movzx eax, WORD PTR [rsi+50]
  movzx eax, BYTE PTR [rdi+24+rax]
  mov BYTE PTR [rbp-23], al
  movzx eax, WORD PTR [rsi+52]
  movzx eax, BYTE PTR [rdi+24+rax]
  mov BYTE PTR [rbp-22], al
  ...
  movzx eax, WORD PTR [rsi+62]
  movzx eax, BYTE PTR [rdi+24+rax]
  mov BYTE PTR [rbp-17], al
  movzx eax, WORD PTR [rsi+32]
  vmovq xmm0, QWORD PTR [rbp-24]
  movzx eax, BYTE PTR [rdi+16+rax]
  movzx edx, WORD PTR [rsi+16]
  mov BYTE PTR [rbp-24], al
  movzx eax, WORD PTR [rsi+34]
  movzx edx, BYTE PTR [rdi+8+rdx]
  movzx eax, BYTE PTR [rdi+16+rax]
  mov BYTE PTR [rbp-23], al
  ...
  movzx eax, WORD PTR [rsi+46]
  movzx eax, BYTE PTR [rdi+16+rax]
  mov BYTE PTR [rbp-17], al
  mov rax, QWORD PTR [rbp-24]
  mov BYTE PTR [rbp-24], dl
  movzx edx, WORD PTR [rsi+18]
  vpinsrq xmm0, xmm0, rax, 1
  movzx edx, BYTE PTR [rdi+8+rdx]
  mov BYTE PTR [rbp-23], dl
  movzx edx, WORD PTR [rsi+20]
  movzx edx, BYTE PTR [rdi+8+rdx]
  mov BYTE PTR [rbp-22], dl
  movzx edx, WORD PTR [rsi+22]
  movzx edx, BYTE PTR [rdi+8+rdx]
  mov BYTE PTR [rbp-21], dl
  movzx edx, WORD PTR [rsi+24]
  movzx edx, BYTE PTR [rdi+8+rdx]
  mov BYTE PTR [rbp-20], dl
  movzx edx, WORD PTR [rsi+26]
  movzx edx, BYTE PTR [rdi+8+rdx]
  mov BYTE PTR [rbp-19], dl
  movzx edx, WORD PTR [rsi+28]
  movzx edx, BYTE PTR [rdi+8+rdx]
  mov BYTE PTR [rbp-18], dl
  movzx edx, WORD PTR [rsi+30]
  movzx edx, BYTE PTR [rdi+8+rdx]
  mov BYTE PTR [rbp-17], dl
  movzx edx, WORD PTR [rsi]
  vmovq xmm1, QWORD PTR [rbp-24]
  movzx edx, BYTE PTR [rdi+rdx]
  mov BYTE PTR [rbp-24], dl
  movzx edx, WORD PTR [rsi+2]
  movzx edx, BYTE PTR [rdi+rdx]
  mov BYTE PTR [rbp-23], dl
  movzx edx, WORD PTR [rsi+4]
  movzx edx, BYTE PTR [rdi+rdx]
  mov BYTE PTR [rbp-22], dl
  ...
  movzx edx, WORD PTR [rsi+12]
  movzx edx, BYTE PTR [rdi+rdx]
  mov BYTE PTR [rbp-18], dl
  movzx edx, WORD PTR [rsi+14]
  movzx edx, BYTE PTR [rdi+rdx]
  mov BYTE PTR [rbp-17], dl
  vpinsrq xmm1, xmm1, QWORD PTR [rbp-24], 1
  vinserti128 ymm0, ymm0, xmm1, 0x1
  pop r10
  pop rbp
  lea rsp, [r10-8]
  ret

4 () , , 32 , , 40- ( , , , ). gather64 32- ​​. 64- , , , -, .

, , . , " " vpinsrb, clang icc. , gcc .


, offset array?

offset, .

vgatherdd 5 . 256- . , vpgather vpgatherdd, 8 32- , 32- . , 4 32 , - .

, "" 32- . , 4 256- , 8 , 24 , . vpblendw , vpblendb , 3 5 uops ( ?).

, - :

  • 4 movups, 4 vpgatherdd reg regs ( )
  • 4 vpgatherdd
  • 2 vpblendw (4 → 2)
  • 1 movups vpblendb ( )
  • 1 vpblendb (2 → 1)

vpgatherdd, 9 , 3 5, 3 2.25 , ( vpgatherdd 5). Broadwell vpgather , 0,9 vpgatherdd, 29 . , , , 32 .

, :

  • 0,9 vpgatherdd. , , 29 (, movups ).
  • vpgatherdd Skylake, 0.6 , Skylake. ( vpinsrb AVX512BW, k -register, vpgatherdd zmm , ymm (InstLatx64).)
  • , array. . , offset , 16 , pshufb . "" , ( ), pshufb , .

: , , 1, 2, 3 4 . , ( , ) . .

- , 4 , : 3 offset, . , , pshufb , .


1 SSE/AVX, , reg-reg: reg-reg 2 uops 5, 0,5 2. , /, 5. vpbroadcastd/q .

2 : L1, : , L2.

+3

All Articles