, asm SSE2/SSE4.1 ( blendps)/AVX/AVX-512, , , gcc7. 2/clang5.0/ICC18 .
Skylake-AVX512 (. ), 64- 1,25 ( , ). , , 1,33 1,5 , L1D. L2, 2x 64B 64B .
C , gcc, clang ICC - , : . + asm Godbolt.
-ffast-math gcc . IDK, , -, FP.
, Clang tmp*tmp tmp*tmp*tmp .
gcc
, .
ICC KNOTW , , .
(**3 **2) if else 3 . ( gcc, ICC clang , , .)
ICC - 256b. , , ? , ? gcc 8.0 snapshot , gcc7.2 ZMM-.
AVX-512 , , - SIMD ( SIMD) . , , .
0 : x + 0 = x. , x + (y&mask) no-op, x+y, -. . if condition in intrinsics. (Fun trick: -1 0, , ).
, 1 , .
, , ?
, . , , , , .
, , . ( , all-zero all-one ).
if (c(i) > 0)
, c . AVX512 16 float .
; with zmm0 = 0.0 in all elements, from vxorps xmm0,xmm0,xmm0 outside the loop.
vcmpps k1, zmm0, [rdx], _CMP_NLT_UQ ; !(0 < c(i))
( ), k1 , c(i) > 0 . 2- , , not-more-than. ( >= <, ( NaN) . FP 4 : ///, , , ( , , ) 4 . -ffast-math, NaN.
, AVX512 .
vcmpltps k1, zmm1, zmm2 ; k1 = zmm1<zmm2
vcmpltps k2{k1}{z}, zmm3, zmm4 ; k2 = (zmm3<zmm4) & (zmm1<zmm2)
k2 0, zmm3k1 , k1 .
if (c(i) > 0) then
a(i) = b(i) ** 2
else
a(i) = b(i) ** 3
end if
b(i) * b(i). b(i)**3 b(i) .
vmovups zmm1, [rsi] ; load a vector from b(i)
vmulps zmm2, zmm1, zmm1 ; zmm2 = zmm1*zmm1 = b(i)**2
AVX-512 () .
vmulps zmm2{k1}, zmm2, zmm1 ; zmm2 *= zmm1 for elements where k1 is true
vmovups [rdi], zmm2 ; store all 16 elements into a(i)
BTW, AVX512 . SIMD [rdi], blend, [rdi]. , ( a(i) ) , AVX1/AVX2.
: ( NASM)
; x86-64 System V calling convention
; args: rdi = a() output array.
; rsi = b() input array
; rdx = c() array to be tested for positive numbers
; rcx = count (in elements)
; preferably all 64-byte aligned, but will work slowly if some aren't
; rcx must be >= 16, and a multiple of 16, because I didn't write any cleanup code
global square_or_cube
square_or_cube:
vxorps xmm0, xmm0,xmm0
.loop: ; do {
vcmpps k1, zmm0, [rdx], 21 ; _CMP_NLT_UQ ; !(0 < c(i))
vmovups zmm1, [rsi] ; load a vector from b(i)
vmulps zmm2, zmm1, zmm1 ; zmm2 = zmm1*zmm1 = b(i)**2
vmulps zmm2{k1}, zmm2, zmm1 ; zmm2 *= zmm1 for elements where k1 is true, otherwise unmodified.
vmovups [rdi], zmm2 ; store all 16 elements into a(i)
; TODO: unroll some and/or use indexed addressing mode tricks to save instructions
add rdi, 64 ; pointer increments
add rsi, 64
add rdx, 64
sub rcx, 16 ; count -= 16
ja .loop ; } while(count>0);
IACA ( -, asm). IACA, vmulps uop, - - uop front-end. ( .) , , IACA , SKL-SP, .
$ iaca.sh -arch SKX avx512-conditional
Intel(R) Architecture Code Analyzer Version - 2.3 build:246dfea (Thu, 6 Jul 2017 13:38:05 +0300)
Analyzed File - avx512-conditional
Binary Format - 64Bit
Architecture - SKX
Analysis Type - Throughput
Throughput Analysis Report
--------------------------
Block Throughput: 1.50 Cycles Throughput Bottleneck: FrontEnd
Port Binding In Cycles Per Iteration:
---------------------------------------------------------------------------------------
| Port | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
---------------------------------------------------------------------------------------
| Cycles | 1.5 0.0 | 0.0 | 1.0 1.0 | 1.0 1.0 | 1.0 | 1.5 | 1.0 | 1.0 |
---------------------------------------------------------------------------------------
N - port number or number of cycles resource conflict caused delay, DV - Divider pipe (on port 0)
D - Data fetch pipe (on ports 2 and 3), CP - on a critical path
F - Macro Fusion with the previous instruction occurred
* - instruction micro-ops not bound to a port
^ - Micro Fusion happened
@ - SSE instruction followed an AVX256/AVX512 instruction, dozens of cycles penalty is expected
X - instruction not supported, was not accounted in Analysis
| Num Of | Ports pressure in cycles | |
| Uops | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 | |
---------------------------------------------------------------------------------
| 2^ | | | 1.0 1.0 | | | 1.0 | | | CP | vcmpps k1, zmm0, zmmword ptr [rdx], 0x15
| 1 | | | | 1.0 1.0 | | | | | | vmovups zmm1, zmmword ptr [rsi]
| 1 | 1.0 | | | | | | | | CP | vmulps zmm2, zmm1, zmm1
| 1 | 0.5 | | | | | 0.5 | | | CP | vmulps zmm2{k1}, zmm2, zmm1
| 2^ | | | | | 1.0 | | | 1.0 | | vmovups zmmword ptr [rdi], zmm2
| 1 | | | | | | | 1.0 | | | sub rcx, 0x10
| 0F | | | | | | | | | | jnbe 0xffffffffffffffdd
Total Num Of Uops: 8
AVX-512 vfpclassps (C/++ intrinsic [_mm512_fpclass_ps_mask] 4, asm vfpclasspd ( )), FP . , , .
( , IACA, . 3- InstLatx64. Agner Fog AVX2 cmpps Skylake-S (-AVX512) 4 , , AVX512 .
, , , vfpclassps , , -Inf, , NaN, -0.0 +0.0.
vfpclassps k1, [rdx], 0x1 | 0x2 | 0x4 | 0x10 | 0x40 | 0x80 ; QNaN | -0.0 | +0.0 | -Infinity | Negative (finite) | SNaN
; k1 = a 16-bit bitmap of which elements (from memory at [rdx]) need an extra multiply
vpfclassps , +0.0 -0.0, , (, AVX2 vblendps, - , ).
, , all-zeros.
related: AVX512 2**floor(x) (vscalefpd), ( ). Xeon Phi AVX512ER, 2**x ( x), , SKL-SP AVX512ER.
NASM IACA_start/end:
iaca_marks.h C/++.
%if 1
%macro IACA_start 0
mov ebx, 111
db 0x64, 0x67, 0x90
%endmacro
%macro IACA_end 0
mov ebx, 222
db 0x64, 0x67, 0x90
%endmacro
%else
%define IACA_start
%define IACA_end
%endif
, .
-
. IDK, - , , , .
; rdi = destination
; rsi = source
; edx = condition
; rcx = element count
global square_or_cube
square_or_cube:
.loop: ; do {
vmovups zmm1, [rsi] ; load a vector from b(i)
vmulps zmm2, zmm1, zmm1 ; zmm2 = zmm1*zmm1 = b(i)**2
test edx,edx
jz .only_square ; test-and-branch to conditionally skip the 2nd multiply
vmulps zmm2, zmm2, zmm1 ; zmm2 *= zmm1
.only_square:
vmovups [rdi], zmm2 ; store all 16 elements into a(i)
add rdi, 64 ; pointer increments
add rsi, 64
sub rcx, 16 ; count -= 16
ja .loop ; } while(count>0);