AVX-512 and Branching

I am confused by what masking can do in theory regarding branches. Say I have Skylake-SP (ha, I wish ..), and we ignore the compiler's capabilities as much as possible in theory:

If a conditional branch depends on a loop-invariant (static) flag, and every branch assigns the array element the result of a calculation, can the loop vectorize, assuming the compiler does not somehow optimize it into two separate loops?

! Case 1: my_flag is not indexed by i, so the same branch is selected on
! every iteration, and both branches store a result to a(i).
do i = 1, nx
  if (my_flag .eq. 0) then
    a(i) = b(i) ** 2
  else
    a(i) = b(i) ** 3
  end if
end do

If only a subset of the branches sets the value in question, can it vectorize?

! Case 2: my_flag is not indexed by i; a(i) is stored only when the flag is 0.
! There is no else branch, so otherwise a(i) is left unmodified.
do i = 1, nx
  if (my_flag .eq. 0) then
    a(i) = b(i) ** 2
  end if
end do

If a conditional branch itself depends on vector data, can it vectorize?

! Case 3: the condition is evaluated per element (it reads c(i)), so
! different iterations can take different branches; both branches store a(i).
do i = 1, nx
  if (c(i) > 0) then
    a(i) = b(i) ** 2
  else
    a(i) = b(i) ** 3
  end if
end do
+6
source share
2

.. , , - , . , Fortran, , , , , .

, , . , AVX-512.


, , , , , , . , , , .

, , .

, , , "", ! , .

, , , (1) (3) " ", (2) , (2) a[i] b[i] if body, if . , myflag == false, .

, . -, , - bool. , a, b c, - f :

! Generalized form: a per-element condition f(i) selects which of two
! computations, g or h (both taking b(i) and c(i)), is stored to a(i).
! NOTE(review): f, g and h are not defined in this snippet; they appear to be
! placeholders for arbitrary expressions.
do i = 1, nx
  if (f(i) > 0) then
    a(i) = g(b(i), c(i));
  else
    a(i) = h(b(i), c(i));
  end if
end do

f(i), , g, h b(i) c(i). , g h b c.

:

// Element-wise select: a[i] receives b[i] where f[i] is true, else c[i].
// Only the selected source element is read on a given iteration; the other
// array is not accessed for that index.
void example1(bool* f, int* __restrict__ a, int* __restrict__ b, int* __restrict__ c, size_t n) {
    for (size_t idx = 0; idx != n; ++idx) {
        a[idx] = f[idx] ? b[idx] : c[idx];
    }
}

// Element-wise choice between two arithmetic results. Both alternatives use
// b[i] and c[i], so both inputs are read on every iteration regardless of f[i]:
//   f[i] true  -> a[i] = b[i] + c[i]
//   f[i] false -> a[i] = b[i] - c[i] * 2 + 1
void example2(bool* f, int* __restrict__ a, int* __restrict__ b, int* __restrict__ c, size_t n) {
    size_t idx = 0;
    while (idx < n) {
        const int bv = b[idx];
        const int cv = c[idx];
        a[idx] = f[idx] ? bv + cv : bv - cv * 2 + 1;
        ++idx;
    }
}

, ? - b[i], c[i] . - b[i], c[i], .

, -, , b[i] c[i]. , gcc - . clang . , icc - , vpmaskmovd, , .

godbolt.

, , , icc. , icc . , , 2.

, , , , , b c [0, n), . , b[i] = b[i]; c[i] = c[i]; ... + c[i] * 0, , , , , . , "", : . , , , .

, , ? , . , 4K x86 , . , , "", .

, : , 0 1, , , . , , , 3 , , .


2 , AVX : icc - , AVX, vpmaskmovd/q vmaskmovps/pd.

3 , , , , if / - 0 - 1. , , : all-zeros, all-ones , .

+3

, asm SSE2/SSE4.1 ( blendps)/AVX/AVX-512, , , gcc7. 2/clang5.0/ICC18 .

Skylake-AVX512 (. ), 64- 1,25 ( , ). , , 1,33 1,5 , L1D. L2, 2x 64B 64B .

C , gcc, clang ICC - , : . + asm Godbolt.

-ffast-math gcc . IDK, , -, FP.

, Clang tmp*tmp tmp*tmp*tmp .

gcc   , .

ICC KNOTW , , .

(**3 **2) if else 3 . ( gcc, ICC clang , , .)

ICC - 256b. , , ? , ? gcc 8.0 snapshot , gcc7.2 ZMM-.


AVX-512 , , - SIMD ( SIMD) . , , .

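0 is the additive identity: x + 0 = x. So x + (y&mask) is a no-op where the mask is all-zeros, and is x+y where the mask is all-ones. See "How to use if condition in intrinsics". (Fun trick: a packed-compare result is an integer -1 or 0, so you can count matches by subtracting the compare mask.)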

, 1 , .

, , ?

, . , , , , .


, , . ( , all-zero all-one ).

if (c(i) > 0)

, c . AVX512 16 float .

; with zmm0 = 0.0 in all elements, from vxorps xmm0,xmm0,xmm0 outside the loop.
vcmpps    k1, zmm0, [rdx],  _CMP_NLT_UQ     ; !(0 < c(i))

( ), k1 , c(i) > 0 . 2- , , not-more-than. ( >= <, ( NaN) . FP 4 : ///, , , ( , , ) 4 . -ffast-math, NaN.

, AVX512 .

vcmpltps    k1,        zmm1, zmm2       ; k1 = zmm1<zmm2
vcmpltps    k2{k1}{z}, zmm3, zmm4       ; k2 = (zmm3<zmm4) & (zmm1<zmm2)

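k2 is 0 wherever zmm3<zmm4 is false, and also wherever k1 is 0, because k1 is used as a zeroing mask for the second compare.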


  if (c(i) > 0) then
    a(i) = b(i) ** 2
  else
    a(i) = b(i) ** 3
  end if

Both branches share the common subexpression b(i) * b(i). b(i)**3 is just one more multiply by b(i).

vmovups    zmm1, [rsi]       ; load a vector from b(i)
vmulps     zmm2, zmm1, zmm1  ; zmm2 = zmm1*zmm1 = b(i)**2

AVX-512 can merge under a mask as part of (almost) any other instruction.

vmulps     zmm2{k1}, zmm2, zmm1  ; zmm2 *= zmm1   for elements where k1 is true

vmovups    [rdi], zmm2           ; store all 16 elements into a(i)

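BTW, AVX512 also has masking for stores. Earlier SIMD instruction sets would typically load from [rdi], blend, and then store back into [rdi]. This means a loop with a per-element condition that sometimes leaves a(i) unmodified can be implemented more efficiently than with AVX1/AVX2.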


Putting it all together: (NASM syntax)

 ; x86-64 System V calling convention, NASM syntax.  Requires AVX-512F.
 ;
 ; C-equivalent (single-precision, per the ps instructions used):
 ;   void square_or_cube(float *a, const float *b, const float *c, size_t count);
 ;   a[i] = (c[i] > 0) ? b[i]*b[i] : b[i]*b[i]*b[i];
 ;   (a NaN in c[i] is "not > 0", so it selects the cube)
 ;
 ; args: rdi = a() output array.
 ;       rsi = b() input array
 ;       rdx = c() array to be tested for positive numbers
 ;       rcx = count (in elements)
 ; preferably all 64-byte aligned, but will work slowly if some aren't
 ; rcx must be a multiple of 16, because there is no cleanup code for a partial
 ; final vector (a non-multiple would make the last iteration run past the end
 ; of all three arrays).  rcx == 0 is allowed and returns without touching memory.
 ;
 ; clobbers: rdi, rsi, rdx, rcx, flags, zmm0-zmm2, k1

global square_or_cube
square_or_cube: 

    test        rcx, rcx
    jz         .done                  ; count == 0: the do{}while loop below would still run once

    vxorps     xmm0,  xmm0,xmm0       ; zmm0 = 0.0 in all elements (VEX-encoded write zeroes the upper bits)

 .loop:                          ; do {
    vcmpps     k1, zmm0, [rdx], 21    ; _CMP_NLT_UQ  ; k1[i] = !(0 < c(i)), i.e. c(i) is not positive

    vmovups    zmm1, [rsi]            ; load a vector from b(i)
    vmulps     zmm2,     zmm1, zmm1   ; zmm2 = zmm1*zmm1 = b(i)**2

    vmulps     zmm2{k1}, zmm2, zmm1   ; zmm2 *= zmm1   for elements where k1 is true, otherwise unmodified.
    vmovups    [rdi], zmm2            ; store all 16 elements into a(i)

    ; TODO: unroll some and/or use indexed addressing mode tricks to save instructions
    add         rdi, 64      ; pointer increments
    add         rsi, 64
    add         rdx, 64

    sub         rcx, 16         ;  count -= 16 
    ja        .loop             ; } while(count>0);   (unsigned: exits on zero or on borrow)

    vzeroupper                  ; leave the upper vector state clean for the caller
 .done:
    ret

IACA ( -, asm). IACA, vmulps uop, - - uop front-end. ( .) , , IACA , SKL-SP, .

$ iaca.sh -arch SKX avx512-conditional
Intel(R) Architecture Code Analyzer Version - 2.3 build:246dfea (Thu, 6 Jul 2017 13:38:05 +0300)
Analyzed File - avx512-conditional
Binary Format - 64Bit
Architecture  - SKX
Analysis Type - Throughput

Throughput Analysis Report
--------------------------
Block Throughput: 1.50 Cycles       Throughput Bottleneck: FrontEnd

Port Binding In Cycles Per Iteration:
---------------------------------------------------------------------------------------
|  Port  |  0   -  DV  |  1   |  2   -  D   |  3   -  D   |  4   |  5   |  6   |  7   |
---------------------------------------------------------------------------------------
| Cycles | 1.5    0.0  | 0.0  | 1.0    1.0  | 1.0    1.0  | 1.0  | 1.5  | 1.0  | 1.0  |
---------------------------------------------------------------------------------------

N - port number or number of cycles resource conflict caused delay, DV - Divider pipe (on port 0)
D - Data fetch pipe (on ports 2 and 3), CP - on a critical path
F - Macro Fusion with the previous instruction occurred
* - instruction micro-ops not bound to a port
^ - Micro Fusion happened
# - ESP Tracking sync uop was issued
@ - SSE instruction followed an AVX256/AVX512 instruction, dozens of cycles penalty is expected
X - instruction not supported, was not accounted in Analysis

| Num Of |                    Ports pressure in cycles                     |    |
|  Uops  |  0  - DV  |  1  |  2  -  D  |  3  -  D  |  4  |  5  |  6  |  7  |    |
---------------------------------------------------------------------------------
|   2^   |           |     | 1.0   1.0 |           |     | 1.0 |     |     | CP | vcmpps k1, zmm0, zmmword ptr [rdx], 0x15
|   1    |           |     |           | 1.0   1.0 |     |     |     |     |    | vmovups zmm1, zmmword ptr [rsi]
|   1    | 1.0       |     |           |           |     |     |     |     | CP | vmulps zmm2, zmm1, zmm1
|   1    | 0.5       |     |           |           |     | 0.5 |     |     | CP | vmulps zmm2{k1}, zmm2, zmm1
|   2^   |           |     |           |           | 1.0 |     |     | 1.0 |    | vmovups zmmword ptr [rdi], zmm2
|   1    |           |     |           |           |     |     | 1.0 |     |    | sub rcx, 0x10
|   0F   |           |     |           |           |     |     |     |     |    | jnbe 0xffffffffffffffdd
Total Num Of Uops: 8

AVX-512 vfpclassps (C/C++ intrinsic [_mm512_fpclass_ps_mask] 4, asm vfpclasspd ( )), FP . , , .
 ( , IACA, . 3- InstLatx64. Agner Fog AVX2 cmpps Skylake-S (-AVX512) 4 , , AVX512 .

, , , vfpclassps , , -Inf, , NaN, -0.0 +0.0.

; imm8 class bits: 0x01 QNaN, 0x02 +0.0, 0x04 -0.0, 0x08 +Inf, 0x10 -Inf, 0x20 denormal, 0x40 negative finite, 0x80 SNaN
; The destination is a mask register, so the vector width must come from the memory operand: zword = 512 bits.
vfpclassps    k1, zword [rdx], 0x1 | 0x2 | 0x4 | 0x10 | 0x40 | 0x80     ; QNaN | +0.0 | -0.0 | -Infinity | Negative (finite) | SNaN
; k1 = a 16-bit bitmap of which elements (from memory at [rdx]) need an extra multiply

vfpclassps , +0.0 -0.0, , (, AVX2 vblendps, - , ).

, , all-zeros.


related: AVX512 2**floor(x) (vscalefpd), ( ). Xeon Phi AVX512ER, 2**x ( x), , SKL-SP ​​ AVX512ER.


NASM IACA_start/end:

Adapted from iaca_marks.h for C/C++.

; IACA_start / IACA_end: marker macros to place around the code to be analyzed.
; Each expands to "mov ebx, <id>" followed by the bytes 0x64 0x67 0x90
; (an FS-prefixed, address-size-prefixed NOP).  Start uses id 111, end uses 222.
; Note that the markers overwrite ebx, which is callee-saved in the SysV ABI.
; Change "%if 1" to "%if 0" to make both macros expand to nothing.
%if 1
%macro  IACA_marker 1                ; %1 = marker id loaded into ebx
     mov ebx, %1
     db 0x64, 0x67, 0x90
%endmacro
%macro  IACA_start 0
     IACA_marker 111
%endmacro
%macro  IACA_end 0
     IACA_marker 222
%endmacro
%else
%define IACA_start
%define IACA_end
%endif

, .


-

. IDK, - , , , .

; Variant with a single scalar (loop-invariant) condition, tested with a
; branch inside the loop.  x86-64 System V, NASM syntax, requires AVX-512F.
;
; C-equivalent (single-precision, per the ps instructions used):
;   void square_or_cube(float *dst, const float *src, int condition, size_t count);
;   dst[i] = condition ? src[i]*src[i]*src[i] : src[i]*src[i];
;
; rdi = destination
; rsi = source
; edx = condition   (0 => square only, non-zero => cube)
; rcx = element count; must be a multiple of 16 (no cleanup code for a partial
;       vector).  rcx == 0 is allowed and returns without touching memory.
;
; clobbers: rdi, rsi, rcx, flags, zmm1, zmm2
global square_or_cube
square_or_cube: 

    test        rcx, rcx
    jz         .done                  ; count == 0: the do{}while loop below would still run once

 .loop:                          ; do {
    vmovups    zmm1, [rsi]            ; load a vector from b(i)
    vmulps     zmm2, zmm1, zmm1   ; zmm2 = zmm1*zmm1 = b(i)**2

    test       edx,edx
    jz        .only_square        ; test-and-branch to conditionally skip the 2nd multiply
    vmulps     zmm2, zmm2, zmm1   ; zmm2 *= zmm1
   .only_square:

    vmovups    [rdi], zmm2        ; store all 16 elements into a(i)

    add         rdi, 64      ; pointer increments
    add         rsi, 64

    sub         rcx, 16         ;  count -= 16 
    ja        .loop             ; } while(count>0);   (unsigned: exits on zero or on borrow)

    vzeroupper                  ; leave the upper vector state clean for the caller
 .done:
    ret
+5

All Articles