, offset, ( ), , , offset ( array ).
, , gcc ( ) :
lea r10, [rsp+8]
and rsp, -32
push QWORD PTR [r10-8]
push rbp
mov rbp, rsp
push r15
push r14
push r13
push r12
push r10
push rbx
sub rsp, 40
movzx eax, WORD PTR [rsi+40]
movzx r14d, WORD PTR [rsi+60]
movzx r12d, WORD PTR [rsi+56]
movzx ecx, WORD PTR [rsi+44]
movzx r15d, WORD PTR [rsi+62]
movzx r13d, WORD PTR [rsi+58]
mov QWORD PTR [rbp-56], rax
movzx eax, WORD PTR [rsi+38]
movzx ebx, WORD PTR [rsi+54]
movzx r11d, WORD PTR [rsi+52]
movzx r10d, WORD PTR [rsi+50]
movzx r9d, WORD PTR [rsi+48]
movzx r8d, WORD PTR [rsi+46]
mov QWORD PTR [rbp-64], rax
movzx eax, WORD PTR [rsi+36]
movzx edx, WORD PTR [rsi+42]
mov QWORD PTR [rbp-72], rax
movzx eax, WORD PTR [rsi+34]
mov QWORD PTR [rbp-80], rax
movzx eax, WORD PTR [rsi+32]
mov QWORD PTR [rbp-88], rax
movzx eax, WORD PTR [rsi+30]
movzx r15d, BYTE PTR [rdi+r15]
mov QWORD PTR [rbp-96], rax
movzx eax, WORD PTR [rsi+28]
vmovd xmm2, r15d
vpinsrb xmm2, xmm2, BYTE PTR [rdi+r14], 1
mov QWORD PTR [rbp-104], rax
movzx eax, WORD PTR [rsi+26]
mov QWORD PTR [rbp-112], rax
movzx eax, WORD PTR [rsi+24]
mov QWORD PTR [rbp-120], rax
movzx eax, WORD PTR [rsi+22]
mov QWORD PTR [rbp-128], rax
movzx eax, WORD PTR [rsi+20]
mov QWORD PTR [rbp-136], rax
movzx eax, WORD PTR [rsi+18]
mov QWORD PTR [rbp-144], rax
movzx eax, WORD PTR [rsi+16]
mov QWORD PTR [rbp-152], rax
movzx eax, WORD PTR [rsi+14]
mov QWORD PTR [rbp-160], rax
movzx eax, WORD PTR [rsi+12]
mov QWORD PTR [rbp-168], rax
movzx eax, WORD PTR [rsi+10]
mov QWORD PTR [rbp-176], rax
movzx eax, WORD PTR [rsi+8]
mov QWORD PTR [rbp-184], rax
movzx eax, WORD PTR [rsi+6]
mov QWORD PTR [rbp-192], rax
movzx eax, WORD PTR [rsi+4]
mov QWORD PTR [rbp-200], rax
movzx eax, WORD PTR [rsi+2]
movzx esi, WORD PTR [rsi]
movzx r13d, BYTE PTR [rdi+r13]
movzx r8d, BYTE PTR [rdi+r8]
movzx edx, BYTE PTR [rdi+rdx]
movzx ebx, BYTE PTR [rdi+rbx]
movzx r10d, BYTE PTR [rdi+r10]
vmovd xmm7, r13d
vmovd xmm1, r8d
vpinsrb xmm1, xmm1, BYTE PTR [rdi+rcx], 1
mov rcx, QWORD PTR [rbp-56]
vmovd xmm5, edx
vmovd xmm3, ebx
mov rbx, QWORD PTR [rbp-72]
vmovd xmm6, r10d
vpinsrb xmm7, xmm7, BYTE PTR [rdi+r12], 1
vpinsrb xmm5, xmm5, BYTE PTR [rdi+rcx], 1
mov rcx, QWORD PTR [rbp-64]
vpinsrb xmm6, xmm6, BYTE PTR [rdi+r9], 1
vpinsrb xmm3, xmm3, BYTE PTR [rdi+r11], 1
vpunpcklwd xmm2, xmm2, xmm7
movzx edx, BYTE PTR [rdi+rcx]
mov rcx, QWORD PTR [rbp-80]
vpunpcklwd xmm1, xmm1, xmm5
vpunpcklwd xmm3, xmm3, xmm6
vmovd xmm0, edx
movzx edx, BYTE PTR [rdi+rcx]
mov rcx, QWORD PTR [rbp-96]
vpunpckldq xmm2, xmm2, xmm3
vpinsrb xmm0, xmm0, BYTE PTR [rdi+rbx], 1
mov rbx, QWORD PTR [rbp-88]
vmovd xmm4, edx
movzx edx, BYTE PTR [rdi+rcx]
mov rcx, QWORD PTR [rbp-112]
vpinsrb xmm4, xmm4, BYTE PTR [rdi+rbx], 1
mov rbx, QWORD PTR [rbp-104]
vpunpcklwd xmm0, xmm0, xmm4
vpunpckldq xmm0, xmm1, xmm0
vmovd xmm1, edx
movzx edx, BYTE PTR [rdi+rcx]
vpinsrb xmm1, xmm1, BYTE PTR [rdi+rbx], 1
mov rcx, QWORD PTR [rbp-128]
mov rbx, QWORD PTR [rbp-120]
vpunpcklqdq xmm0, xmm2, xmm0
vmovd xmm8, edx
movzx edx, BYTE PTR [rdi+rcx]
vpinsrb xmm8, xmm8, BYTE PTR [rdi+rbx], 1
mov rcx, QWORD PTR [rbp-144]
mov rbx, QWORD PTR [rbp-136]
vmovd xmm4, edx
vpunpcklwd xmm1, xmm1, xmm8
vpinsrb xmm4, xmm4, BYTE PTR [rdi+rbx], 1
movzx edx, BYTE PTR [rdi+rcx]
mov rbx, QWORD PTR [rbp-152]
mov rcx, QWORD PTR [rbp-160]
vmovd xmm7, edx
movzx eax, BYTE PTR [rdi+rax]
movzx edx, BYTE PTR [rdi+rcx]
vpinsrb xmm7, xmm7, BYTE PTR [rdi+rbx], 1
mov rcx, QWORD PTR [rbp-176]
mov rbx, QWORD PTR [rbp-168]
vmovd xmm5, eax
vmovd xmm2, edx
vpinsrb xmm5, xmm5, BYTE PTR [rdi+rsi], 1
vpunpcklwd xmm4, xmm4, xmm7
movzx edx, BYTE PTR [rdi+rcx]
vpinsrb xmm2, xmm2, BYTE PTR [rdi+rbx], 1
vpunpckldq xmm1, xmm1, xmm4
mov rbx, QWORD PTR [rbp-184]
mov rcx, QWORD PTR [rbp-192]
vmovd xmm6, edx
movzx edx, BYTE PTR [rdi+rcx]
vpinsrb xmm6, xmm6, BYTE PTR [rdi+rbx], 1
mov rbx, QWORD PTR [rbp-200]
vmovd xmm3, edx
vpunpcklwd xmm2, xmm2, xmm6
vpinsrb xmm3, xmm3, BYTE PTR [rdi+rbx], 1
add rsp, 40
vpunpcklwd xmm3, xmm3, xmm5
vpunpckldq xmm2, xmm2, xmm3
pop rbx
pop r10
vpunpcklqdq xmm1, xmm1, xmm2
pop r12
pop r13
vinserti128 ymm0, ymm0, xmm1, 0x1
pop r14
pop r15
pop rbp
lea rsp, [r10-8]
ret
, offset , , , , 16- offset, ( 64- ) . , offset ( 64 ) : , , , offset.
4.9.2, , 7.2.
icc clang - , offset movzx, vpinsrb offset :
gather256(char*, unsigned short*): # @gather256(char*, unsigned short*)
movzx eax, word ptr [rsi + 30]
movzx eax, byte ptr [rdi + rax]
vmovd xmm0, eax
movzx eax, word ptr [rsi + 28]
vpinsrb xmm0, xmm0, byte ptr [rdi + rax], 1
movzx eax, word ptr [rsi + 26]
vpinsrb xmm0, xmm0, byte ptr [rdi + rax], 2
movzx eax, word ptr [rsi + 24]
...
vpinsrb xmm0, xmm0, byte ptr [rdi + rax], 14
movzx eax, word ptr [rsi]
vpinsrb xmm0, xmm0, byte ptr [rdi + rax], 15
movzx eax, word ptr [rsi + 62]
movzx eax, byte ptr [rdi + rax]
vmovd xmm1, eax
movzx eax, word ptr [rsi + 60]
vpinsrb xmm1, xmm1, byte ptr [rdi + rax], 1
movzx eax, word ptr [rsi + 58]
vpinsrb xmm1, xmm1, byte ptr [rdi + rax], 2
movzx eax, word ptr [rsi + 56]
vpinsrb xmm1, xmm1, byte ptr [rdi + rax], 3
movzx eax, word ptr [rsi + 54]
vpinsrb xmm1, xmm1, byte ptr [rdi + rax], 4
movzx eax, word ptr [rsi + 52]
...
movzx eax, word ptr [rsi + 32]
vpinsrb xmm1, xmm1, byte ptr [rdi + rax], 15
vinserti128 ymm0, ymm1, xmm0, 1
ret
. vinserti128 xmm , -, , vpinserb 128 . , , , , 2 5 () 1 . , , , 1 32 , 32 ( xmm , pinsrb, 1 1.
32 gcc? . :
uint64_t gather64(char *array, uint16_t *offset) {
uint64_t ret;
char *p = (char *)&ret;
p[0] = array[offset[0]];
p[1] = array[offset[1]];
p[2] = array[offset[2]];
p[3] = array[offset[3]];
p[4] = array[offset[4]];
p[5] = array[offset[5]];
p[6] = array[offset[6]];
p[7] = array[offset[7]];
return ret;
}
__m256i gather256_gcc(char *array, uint16_t *offset) {
return _mm256_set_epi64x(
gather64(array, offset),
gather64(array + 8, offset + 8),
gather64(array + 16, offset + 16),
gather64(array + 24, offset + 24)
);
}
8 array , _mm256_set_epi64x. 2 1 8- 64- , 1 2.
"" inline code gcc:
gather256_gcc(char*, unsigned short*):
lea r10, [rsp+8]
and rsp, -32
push QWORD PTR [r10-8]
push rbp
mov rbp, rsp
push r10
movzx eax, WORD PTR [rsi+48]
movzx eax, BYTE PTR [rdi+24+rax]
mov BYTE PTR [rbp-24], al
movzx eax, WORD PTR [rsi+50]
movzx eax, BYTE PTR [rdi+24+rax]
mov BYTE PTR [rbp-23], al
movzx eax, WORD PTR [rsi+52]
movzx eax, BYTE PTR [rdi+24+rax]
mov BYTE PTR [rbp-22], al
...
movzx eax, WORD PTR [rsi+62]
movzx eax, BYTE PTR [rdi+24+rax]
mov BYTE PTR [rbp-17], al
movzx eax, WORD PTR [rsi+32]
vmovq xmm0, QWORD PTR [rbp-24]
movzx eax, BYTE PTR [rdi+16+rax]
movzx edx, WORD PTR [rsi+16]
mov BYTE PTR [rbp-24], al
movzx eax, WORD PTR [rsi+34]
movzx edx, BYTE PTR [rdi+8+rdx]
movzx eax, BYTE PTR [rdi+16+rax]
mov BYTE PTR [rbp-23], al
...
movzx eax, WORD PTR [rsi+46]
movzx eax, BYTE PTR [rdi+16+rax]
mov BYTE PTR [rbp-17], al
mov rax, QWORD PTR [rbp-24]
mov BYTE PTR [rbp-24], dl
movzx edx, WORD PTR [rsi+18]
vpinsrq xmm0, xmm0, rax, 1
movzx edx, BYTE PTR [rdi+8+rdx]
mov BYTE PTR [rbp-23], dl
movzx edx, WORD PTR [rsi+20]
movzx edx, BYTE PTR [rdi+8+rdx]
mov BYTE PTR [rbp-22], dl
movzx edx, WORD PTR [rsi+22]
movzx edx, BYTE PTR [rdi+8+rdx]
mov BYTE PTR [rbp-21], dl
movzx edx, WORD PTR [rsi+24]
movzx edx, BYTE PTR [rdi+8+rdx]
mov BYTE PTR [rbp-20], dl
movzx edx, WORD PTR [rsi+26]
movzx edx, BYTE PTR [rdi+8+rdx]
mov BYTE PTR [rbp-19], dl
movzx edx, WORD PTR [rsi+28]
movzx edx, BYTE PTR [rdi+8+rdx]
mov BYTE PTR [rbp-18], dl
movzx edx, WORD PTR [rsi+30]
movzx edx, BYTE PTR [rdi+8+rdx]
mov BYTE PTR [rbp-17], dl
movzx edx, WORD PTR [rsi]
vmovq xmm1, QWORD PTR [rbp-24]
movzx edx, BYTE PTR [rdi+rdx]
mov BYTE PTR [rbp-24], dl
movzx edx, WORD PTR [rsi+2]
movzx edx, BYTE PTR [rdi+rdx]
mov BYTE PTR [rbp-23], dl
movzx edx, WORD PTR [rsi+4]
movzx edx, BYTE PTR [rdi+rdx]
mov BYTE PTR [rbp-22], dl
...
movzx edx, WORD PTR [rsi+12]
movzx edx, BYTE PTR [rdi+rdx]
mov BYTE PTR [rbp-18], dl
movzx edx, WORD PTR [rsi+14]
movzx edx, BYTE PTR [rdi+rdx]
mov BYTE PTR [rbp-17], dl
vpinsrq xmm1, xmm1, QWORD PTR [rbp-24], 1
vinserti128 ymm0, ymm0, xmm1, 0x1
pop r10
pop rbp
lea rsp, [r10-8]
ret
4 () , , 32 , , 40- ( , , , ). gather64 32- . 64- , , , -, .
, , . , " " vpinsrb, clang icc. , gcc .
, offset array?
offset, .
vgatherdd 5 . 256- . , vpgather vpgatherdd, 8 32- , 32- . , 4 32 , - .
, "" 32- . , 4 256- , 8 , 24 , . vpblendw , vpblendb , 3 5 uops ( ?).
, - :
- 4
movups, 4 vpgatherdd reg regs ( ) - 4
vpgatherdd - 2
vpblendw (4 → 2) - 1
movups vpblendb ( ) - 1
vpblendb (2 → 1)
vpgatherdd, 9 , 3 5, 3 2.25 , ( vpgatherdd 5). Broadwell vpgather , 0,9 vpgatherdd, 29 . , , , 32 .
, :
- 0,9
vpgatherdd. , , 29 (, movups ). vpgatherdd Skylake, 0.6 , Skylake. ( vpinsrb AVX512BW, k -register, vpgatherdd zmm , ymm (InstLatx64).)- ,
array. . , offset , 16 , pshufb . "" , ( ), pshufb , .
: , , 1, 2, 3 4 . , ( , ) . .
- , 4 , : 3 offset, . , , pshufb , .
1 SSE/AVX, , reg-reg: reg-reg 2 uops 5, 0,5 2. , /, 5. vpbroadcastd/q .
2 : L1, : , L2.