AVX1 doesn't have 256b integer operations, only FP. So I assume you're really looking for an alternative to __m128i _mm_srlv_epi32(). Using extractf128 / insertf128 you could do this for 256b vectors, but it's better to just use more 128b loads/stores, especially if you have an AVX2 version that can run on CPUs that support AVX2. (The existing AVX1-only CPUs all happen to have 128b load/store data paths, so 256b loads/stores are barely an advantage anyway.)
Bouncing each element through integer registers (movd/pextrd/pinsrd) is expensive, especially for all four elements, so it's worth staying in vector registers if you can.
Without AVX2, the best in-register strategy I can see is 4 shifts (one per count) plus 3 blends to merge the results. The key point: psrld with a vector count (psrld xmm1, xmm2) shifts every 32-bit element by the same amount, taken from the low 64 bits of xmm2. So we need a separate count vector for each element, with that element's count in the low dword and zeros in the 32 bits above it (anything nonzero there would be treated as part of a huge 64-bit count).
That means a few shuffles/blends to spread the counts over 4 xmm registers, then four shifts of the data, then blends to keep the one valid element from each shifted copy.
This is written with AVX (3-operand, non-destructive) encodings, which AVX1-only CPUs like Intel SnB/IvB support; without VEX you'd need a couple of extra movdqa instructions. Everything here is 128b:
vpsrlq xmm2, xmm0, 32 ; xmm2 = [ 0 D 0 B ]
vpunpckhqdq xmm4, xmm2, xmm0 ; xmm4 = [ D C 0 D ]
vpshufd xmm3, xmm4, 0b01010110 ; xmm3 = [ 0 0 0 C ]
vblendps xmm1, xmm2, xmm0, 0b0001 ; xmm1 = [ 0 D 0 A ]
; or
vpblendw xmm1, xmm2, xmm0, 0b00000011 ; xmm1 = [ 0 D 0 A ]
vblendps runs on p0/p5 on SnB/IvB. vpblendw runs on p1/p5 on SnB/IvB. On Haswell/SKL the word blend is p5-only, while blendps can run on p015 (the same port choices as PAND). Using an FP blend between integer instructions could in theory cost a bypass delay, but on SnB-family it's not normally a problem. Since the variable shifts themselves need port 0, pblendw (p1/p5) keeps the blends off p0 if you're tuning for SnB/IvB; otherwise use blendps.
Alternatively, if you can keep a constant [ 0 -1 0 -1 ] mask in a spare register, you can set up all four count vectors with just and / shift / unpack, no blend or pshufd needed for xmm3:
vpcmpeqw xmm5, xmm5,xmm5 ; all-ones
vpsrlq xmm5, xmm5, 32 ; [ 0 -1 0 -1 ]: generate the mask on the fly if desired
vpand xmm1, xmm5, xmm0 ; [ 0 C 0 A ]
vpsrlq xmm2, xmm0, 32 ; [ 0 D 0 B ]
vpunpckhqdq xmm3, xmm1,xmm1 ; [ 0 C 0 C ] ; saves 1B vs. the equivalent pshufd: no imm8 byte
vpunpckhqdq xmm4, xmm2,xmm2 ; [ 0 D 0 D ]
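In intrinsics, the whole thing (count setup, four shifts, three blends) looks roughly like the sketch below. It follows the [ 0 -1 0 -1 ] mask variant just above; the function name is made up, and I've used _mm_blend_epi16 for the merges, but the blendps forms discussed earlier work the same way.
#include <immintrin.h>
// Emulate _mm_srlv_epi32 with SSE4.1/AVX1: shift each 32-bit element of v right
// by the corresponding element of counts.  _mm_srl_epi32 (psrld) shifts every
// element by the count in the low 64 bits of its second operand, so build four
// count vectors with zeros above each count, shift four times, and blend.
static inline __m128i srlv_epi32_sse41(__m128i v, __m128i counts)
{
    const __m128i mask = _mm_set_epi32(0, -1, 0, -1);   // [ 0 -1 0 -1 ]
    __m128i cA = _mm_and_si128(counts, mask);           // [ 0 C 0 A ]
    __m128i cB = _mm_srli_epi64(counts, 32);            // [ 0 D 0 B ]
    __m128i cC = _mm_unpackhi_epi64(cA, cA);            // [ 0 C 0 C ]
    __m128i cD = _mm_unpackhi_epi64(cB, cB);            // [ 0 D 0 D ]

    __m128i rA = _mm_srl_epi32(v, cA);                  // element 0 correct
    __m128i rB = _mm_srl_epi32(v, cB);                  // element 1 correct
    __m128i rC = _mm_srl_epi32(v, cC);                  // element 2 correct
    __m128i rD = _mm_srl_epi32(v, cD);                  // element 3 correct

    __m128i rAB = _mm_blend_epi16(rA, rB, 0x0C);        // take dword 1 from rB
    __m128i rCD = _mm_blend_epi16(rC, rD, 0xC0);        // take dword 3 from rD
    return _mm_blend_epi16(rAB, rCD, 0xF0);             // dwords 2,3 from rCD
}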
Fun fact: on Skylake, VPSRLVD ymm,ymm,ymm is cheaper (1 uop) than PSRLD xmm,xmm,xmm (2 uops). Immediate-count PSRLD is still only 1 uop, though. (Numbers from Agner Fog's insn tables.)
@BeeOnRope reports that Agner's latency numbers only apply from one of the inputs: 2c (xmm) and 4c (ymm) from that operand, but only 1c and 3c from the other.
uop counts:
Storing to memory, doing the shifts with scalar memory-destination shifts, and reloading:
movaps [rsp - 16], xmm0
shr [rsp - 16], 3 ; 3 uops with a memory-destination. 5 uops for variable count with a memory destination
shr [rsp - 12], 1
shr [rsp - 8], 4
shr [rsp - 4], 1
movaps xmm0, [rsp - 16] ; store-forwarding stall here from the 4x 32b stores to the 128b load
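For reference, here is the same store / scalar shift / reload idea written in C with intrinsics (the function name and the local arrays are mine; a compiler may or may not use memory-destination shifts like the asm above):
#include <stdint.h>
#include <immintrin.h>
// Store both vectors, shift each element with a scalar shift, reload.
// The four 32b stores followed by one 128b load hit a store-forwarding stall.
static inline __m128i srlv_epi32_via_mem(__m128i v, __m128i counts)
{
    uint32_t va[4], ca[4];
    _mm_storeu_si128((__m128i *)va, v);
    _mm_storeu_si128((__m128i *)ca, counts);
    for (int i = 0; i < 4; i++)
        va[i] >>= ca[i];
    return _mm_loadu_si128((__m128i *)va);
}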
Or doing it all in registers, with runtime-variable counts:
vmovd eax, xmm0 ; 1 uop
vmovd ecx, xmm1 ; 1 uop
shr eax, cl ; 3 uops because of CISC stupidity
vmovd xmm2, eax ; 1 uop
vpextrd eax, xmm0, 1 ; 2 uops
vpextrd ecx, xmm1, 1 ; 2 uops
shr eax, cl ; 3 uops because of CISC stupidity
vpinsrd xmm2, eax, 1 ; 2 uops
... repeat twice more, for indices 2 and 3
So the all-registers version is 6 uops + 9 uops * 3 = 33 uops total.
The memory-destination version is 14 fused-domain uops, but that's with compile-time-constant shift counts: with runtime counts each shr [mem], cl becomes 5 uops, and pextr-ing the counts into ecx adds 2 uops apiece.
So the best SSE/AVX in-register version (the shift/blend sequence above) costs:
- 4 uops of shuffles/blends to set up the four count vectors,
- 8 uops for the four variable-count shifts (psrld xmm,xmm is 2 uops each),
- 3 uops of vpblendw / vblendps to merge the results of the four vpsrld xmm,xmm insns.
Total = 15 fused-domain uops with AVX1.
With compile-time-constant counts the store / scalar shift / reload route is competitive on uop count, but it eats a store-forwarding stall, so it only makes sense when that latency can be hidden. In general, fewer fused-domain uops is what helps the surrounding code, and out-of-order execution can hide the latency if there's enough independent work in flight.
For comparison, the real AVX2 instruction:
- Skylake: vpsrlvd ymm, ymm, ymm is 1 uop, 1c latency, one per 0.5c throughput.
- Haswell/BDW: vpsrlvd ymm, ymm, ymm is 3 uops, 2c latency, one per 2c throughput.
And that handles a whole 256b vector. With only AVX1 (Intel SnB/IvB; Haswell has AVX2 anyway) you're limited to the 128b emulation, whose cost can be partly hidden if the surrounding code has enough insn parallelism for out-of-order execution to work on.
A different strategy: use SSE4.1 pmulld to multiply by a power of 2, which gives a variable left shift.
On SnB/IvB, SSE4.1 pmulld is 1 uop, 5c latency, one per 1c throughput.
On Haswell it's 2 uops, 10c latency, one per 2c throughput. (On Skylake the 2 uops can run on p1 or p0.)
The latency is ugly, but a single pmulld by a vector of 2^count values does all four shifts at once, so the uop count is far lower than the shift/blend emulation.
If your shift counts are small (say, 0..7), you can use an SSSE3 pshufb as a LUT to turn each count into 2^count. A count of 0 has to produce 1 (2^0), and the three bytes above the low byte of each element need to come out 0, which is what the pand below is for.
movdqa xmm2, xmm5 ; avoid this with AVX
pshufb xmm2, xmm5 ; 2^count
pand xmm2, xmm4 ; zero all but the low byte in each element
pmulld xmm0, xmm2 ; data * 2^count
Total for Intel SnB/IvB: 3 uops (4 counting the movdqa, which AVX avoids). Latency from the count input to the result: 7c. From the data input: 5c. Throughput: one per 1c (the uops need different ports).
It's worse on Haswell, where pmulld's 10c latency adds 5c to those figures. Penryn/Nehalem also run pmulld as multiple uops, unlike SnB, so SnB/IvB is the sweet spot for this trick; it loses some of its appeal on Haswell.
The LUT only needs to be 64b (powers of 2 for counts 0..7), so you can load it with a movq if you want to keep the constant small.
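In intrinsics, that pshufb + pand + pmulld version might look like this (function name and the exact constant layout are my choices); note it computes a left shift, data << count, for counts 0..7:
#include <immintrin.h>
// Variable left shift for per-element counts 0..7: look up 2^count with pshufb,
// zero the upper bytes of each element (they indexed LUT byte 0, which is 1),
// then multiply.
static inline __m128i sllv_epi32_0to7(__m128i v, __m128i counts)
{
    const __m128i lut = _mm_setr_epi8(1, 2, 4, 8, 16, 32, 64, -128,
                                      0, 0, 0, 0, 0, 0, 0, 0);
    __m128i pow2 = _mm_shuffle_epi8(lut, counts);        // byte 0 of each element = 2^count
    pow2 = _mm_and_si128(pow2, _mm_set1_epi32(0xff));    // pand: keep only the low byte
    return _mm_mullo_epi32(v, pow2);                     // v * 2^count == v << count
}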
If the counts can be larger than 7, you'd need to handle 8..15 separately, e.g. with a second pshufb using [ D-8 C-8 B-8 A-8 ] as the indices (negative indices give 0, i.e. wherever C<8) and a BLENDVB or similar to combine the two multiplier vectors. At that point you've spent enough extra instructions that the shift/blend variant starts to look good again.
Note that every byte of each count element indexes the LUT in the pshufb, and the bytes above the low one would pick up LUT byte 0. Instead of masking with pand, you can add set1_epi32(1) to the counts and put the powers of 2 in LUT bytes 1..8 with LUT byte 0 = 0, so those extra bytes select the zero entry (and a count of 0 still maps to 1). The "increment the count" version:
pcmpeqw xmm4,xmm4 ; all-ones
psubd xmm1, xmm4 ; shift_counts -= -1
movdqa xmm2, xmm5
pshufb xmm2, xmm1 ; 2^count
pmulld xmm0, xmm2 ; data * 2^count
That replaces the pand and its mask with a psubd by all-ones. (If you were going to generate set1_epi32(0xff) on the fly, that's a pcmpeqw / psrld xmm, 24 pair, so the "increment" version saves an insn in that case.)
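And the "increment the count" variant in intrinsics (again a sketch, names are mine):
#include <immintrin.h>
// Same idea without the pand: counts += 1, and LUT byte 0 is 0, so the upper
// bytes of each element come out zero on their own.
static inline __m128i sllv_epi32_0to7_inc(__m128i v, __m128i counts)
{
    const __m128i lut = _mm_setr_epi8(0, 1, 2, 4, 8, 16, 32, 64, -128,
                                      0, 0, 0, 0, 0, 0, 0);
    __m128i idx  = _mm_sub_epi32(counts, _mm_set1_epi32(-1));  // counts += 1, like the psubd above
    __m128i pow2 = _mm_shuffle_epi8(lut, idx);                 // 2^count per element
    return _mm_mullo_epi32(v, pow2);
}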
Shift counts up to 15:
If the counts can be larger than 7, mask them with 0xF and give each 32b element a second LUT index to cover counts of 8 and above. With the counts already incremented as above (and the unused upper LUT bytes zeroed), a pshufb control of [ 0 0 D+8 D | 0 0 C+8 C | ... ] makes each element pick up its power of 2 in the low byte for counts 0..7 or in the next byte up for counts 8..15, while every other byte reads a zero LUT entry, so no pand is needed before the pmulld.
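Here is one way to wire that up with intrinsics, assuming the count+1 LUT layout; the helper name, the packing of the two indices into the control bytes, and the exact LUT contents are my own choices:
#include <immintrin.h>
// Variable left shift for per-element counts 0..15 using one pshufb + pmulld.
// Control bytes per element are [count+1, count+9, 0, 0]; LUT byte 0 and bytes
// 9..15 are zero and bytes 1..8 hold 2^0..2^7, so exactly one byte of each
// element ends up holding the right power of 2 and the rest are zero.
static inline __m128i sllv_epi32_0to15(__m128i v, __m128i counts)
{
    const __m128i lut = _mm_setr_epi8(0, 1, 2, 4, 8, 16, 32, 64, -128,
                                      0, 0, 0, 0, 0, 0, 0);
    __m128i idx  = _mm_add_epi32(_mm_add_epi32(counts, _mm_slli_epi32(counts, 8)),
                                 _mm_set1_epi32(0x0901));      // bytes [c+1, c+9, 0, 0]
    __m128i pow2 = _mm_shuffle_epi8(lut, idx);                 // 32-bit 2^c per element
    return _mm_mullo_epi32(v, pow2);                           // v << c
}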