I want to vectorize the Fortran loop below using SIMD directives:
DO IELEM = 1 , NELEM
X(IKLE(IELEM)) = X(IKLE(IELEM)) + W(IELEM)
ENDDO
I am targeting the AVX2 instruction set. The program is compiled with:
ifort main_vec.f -simd -g -pg -O2 -vec-report6 -o vec.out -xcore-avx2 -align array32byte
I then tried adding a VECTORLENGTH(n) clause after the SIMD directive. With no such clause, or with n = 2 or 4, the vectorization report gives no information about the unroll factor;
with n = 8 or 16, it reports "vectorization support: unroll factor set to 2".
I read Intel's article on the report message "vectorization support: unroll factor set to xxxx". Based on it, I think the loop is unrolled into something like:
DO IELEM = 1 , NELEM, 2
X(IKLE(IELEM)) = X(IKLE(IELEM)) + W(IELEM)
X(IKLE(IELEM+1)) = X(IKLE(IELEM+1)) + W(IELEM+1)
ENDDO
Then two elements of X go into one vector register, two elements of W go into another, and the two registers are added. But how does the VECTORLENGTH value come into play? Or maybe I simply do not understand what the vector length means.
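With AVX2 and DOUBLE PRECISION X, what vector length should I expect?
For example, with SSE2 and VL = 8, the report says the unroll factor is 2, yet the generated assembly below appears to use 4 registers of 2 elements each: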
# Excerpt of compiler-generated x86-64 code (GAS AT&T syntax: op src, dst),
# using legacy (non-VEX) SSE2 instructions on 128-bit %xmm registers.
# The .loc directives tag instructions with source lines 114 and 113 of file 1;
# line 113 carries the index update and compare (presumably the DO statement).
#
# One pass through this excerpt processes 8 elements starting at index %rdx:
#   1. eight 32-bit IKLE entries are sign-extended into
#      %rsi, %rdi, %r8, %r9, %r10, %r11, %r14, %r15;
#   2. the X values they select are gathered pairwise into %xmm0..%xmm3
#      (movsd fills the low 64 bits, movhpd the high 64 bits);
#   3. addpd adds two consecutive doubles of W to each register;
#   4. the two halves of each register are scattered back to the same X
#      addresses (movsd stores the low half, psrldq $8 moves the high half down).
# X is addressed as X + 8*index - 8, i.e. element (index - 1) of an 8-byte array.
#
# The instructions of steps 1-3 are interleaved by the compiler's scheduler.
# NOTE(review): every load of X precedes every store, so if two of the eight
# indices are equal, the later store overwrites the earlier one and one
# addition is lost.
# NOTE(review): the loop's entry label and the branch that uses the cmpq flags
# are not part of this excerpt.
.loc 1 114 is_stmt 1
# Indices for elements rdx+0 .. rdx+3 (IKLE entries are 4 bytes wide).
movslq main_vec_$IKLE.0.1(,%rdx,4), %rsi
..LN202:
movslq 4+main_vec_$IKLE.0.1(,%rdx,4), %rdi
..LN203:
movslq 8+main_vec_$IKLE.0.1(,%rdx,4), %r8
..LN204:
movslq 12+main_vec_$IKLE.0.1(,%rdx,4), %r9
..LN205:
# xmm0 = { X[idx0], X[idx1] }, gathered with two scalar loads.
movsd -8+main_vec_$X.0.1(,%rsi,8), %xmm0
..LN206:
movslq 16+main_vec_$IKLE.0.1(,%rdx,4), %r10
..LN207:
movhpd -8+main_vec_$X.0.1(,%rdi,8), %xmm0
..LN208:
movslq 20+main_vec_$IKLE.0.1(,%rdx,4), %r11
..LN209:
# xmm1 = { X[idx2], X[idx3] }.
movsd -8+main_vec_$X.0.1(,%r8,8), %xmm1
..LN210:
movslq 24+main_vec_$IKLE.0.1(,%rdx,4), %r14
..LN211:
# xmm0 += { W[rdx+0], W[rdx+1] } (packed add of two doubles).
addpd main_vec_$W.0.1(,%rdx,8), %xmm0
..LN212:
movhpd -8+main_vec_$X.0.1(,%r9,8), %xmm1
..LN213:
..LN214:
movslq 28+main_vec_$IKLE.0.1(,%rdx,4), %r15
..LN215:
# xmm2 = { X[idx4], X[idx5] }.
movsd -8+main_vec_$X.0.1(,%r10,8), %xmm2
..LN216:
# xmm1 += { W[rdx+2], W[rdx+3] }.
addpd 16+main_vec_$W.0.1(,%rdx,8), %xmm1
..LN217:
movhpd -8+main_vec_$X.0.1(,%r11,8), %xmm2
..LN218:
..LN219:
# xmm3 = { X[idx6], X[idx7] }.
movsd -8+main_vec_$X.0.1(,%r14,8), %xmm3
..LN220:
# xmm2 += { W[rdx+4], W[rdx+5] }.
addpd 32+main_vec_$W.0.1(,%rdx,8), %xmm2
..LN221:
movhpd -8+main_vec_$X.0.1(,%r15,8), %xmm3
..LN222:
..LN223:
# xmm3 += { W[rdx+6], W[rdx+7] }.
addpd 48+main_vec_$W.0.1(,%rdx,8), %xmm3
..LN224:
# Scatter phase: write each result back to the X slot it was loaded from.
movsd %xmm0, -8+main_vec_$X.0.1(,%rsi,8)
..LN225:
.loc 1 113 is_stmt 1
# Advance the element index by 8; the last use of %rdx as a W/IKLE index is above.
addq $8, %rdx
..LN226:
.loc 1 114 is_stmt 1
# Byte-shift right by 8: the high double of xmm0 becomes the low double.
psrldq $8, %xmm0
..LN227:
.loc 1 113 is_stmt 1
# Compare the index with the constant 26000 (presumably the loop bound).
cmpq $26000, %rdx
..LN228:
.loc 1 114 is_stmt 1
movsd %xmm0, -8+main_vec_$X.0.1(,%rdi,8)
..LN229:
movsd %xmm1, -8+main_vec_$X.0.1(,%r8,8)
..LN230:
psrldq $8, %xmm1
..LN231:
movsd %xmm1, -8+main_vec_$X.0.1(,%r9,8)
..LN232:
movsd %xmm2, -8+main_vec_$X.0.1(,%r10,8)
..LN233:
psrldq $8, %xmm2
..LN234:
movsd %xmm2, -8+main_vec_$X.0.1(,%r11,8)
..LN235:
movsd %xmm3, -8+main_vec_$X.0.1(,%r14,8)
..LN236:
psrldq $8, %xmm3
..LN237:
movsd %xmm3, -8+main_vec_$X.0.1(,%r15,8)
..LN238: