Adding two vectors in x86_64 assembly with AVX2, plus technical clarifications

What am I doing wrong here? I get four zeros instead of the expected output:

2
4
6
8

I would also like to modify my .asm function so that it can loop over longer vectors. To keep things simple here, I used a four-element vector so that it can be summed without a loop using the 256-bit SIMD registers.

.cpp

#include <iostream>
#include <chrono>

extern "C" double *addVec(double *C, double *A, double *B, size_t &N);

/// Test driver for the external `addVec` routine.
///
/// Allocates three 32-byte-aligned arrays of N doubles, fills A and B with
/// 1..N, calls `addVec(C, A, B, reductions)` once while timing it, prints the
/// returned vector element by element, then prints the elapsed nanoseconds.
///
/// @return 0 on success, 1 if any aligned allocation fails.
int main()
{
    const size_t N = 1 << 2;
    // Number of 4-double (256-bit) groups in N. Kept non-const because
    // addVec takes it by non-const reference.
    size_t reductions = N / 4;

    // 32-byte alignment: the assembly routine uses aligned 256-bit moves.
    double *A = static_cast<double*>(_aligned_malloc(N * sizeof(double), 32));
    double *B = static_cast<double*>(_aligned_malloc(N * sizeof(double), 32));
    double *C = static_cast<double*>(_aligned_malloc(N * sizeof(double), 32));

    // Bail out instead of dereferencing a null pointer when allocation fails.
    if (A == nullptr || B == nullptr || C == nullptr)
    {
        std::cerr << "aligned allocation failed\n";
        _aligned_free(A);   // _aligned_free ignores null pointers
        _aligned_free(B);
        _aligned_free(C);
        return 1;
    }

    for (size_t i = 0; i < N; i++)
    {
        A[i] = double(i + 1);
        B[i] = double(i + 1);
    }

    auto start = std::chrono::high_resolution_clock::now();

        double *out = addVec(C, A, B, reductions);

    auto finish = std::chrono::high_resolution_clock::now();

    // '\n' instead of std::endl: no need to flush after every element.
    for (size_t i = 0; i < N; i++)
    {
        std::cout << out[i] << '\n';
    }

    std::cout << "\n\n";

    std::cout << std::chrono::duration_cast<std::chrono::nanoseconds>(finish - start).count() << " ns\n";

    // std::cin is tied to std::cout, so pending output is flushed before
    // this blocks waiting for a key press.
    std::cin.get();

    _aligned_free(A);
    _aligned_free(B);
    _aligned_free(C);

    return 0;
}

.asm

.data
; C -> RCX
; A -> RDX
; B -> r8
; N -> r9
.code
    addVec proc
        ; double *addVec(double *C, double *A, double *B, size_t &N)
        ;   rcx = C (output), rdx = A, r8 = B, r9 = address of N
        ; Adds the four doubles at [rdx] to the four doubles at [r8], stores
        ; the four sums at [rcx] and returns rcx in rax.
        ; r9 is not read: exactly one 256-bit group is processed per call.
        ; vmovapd is the aligned move, so A, B and C must be 32-byte aligned.
        ;
        ; NOTE(review): the commented-out loop scaffolding below is kept as
        ; found. Before enabling it, confirm that rbx is saved/restored (it is
        ; a nonvolatile register in the Microsoft x64 convention) and that the
        ; index is scaled to bytes (32 per ymm group).
        ;xor rbx, rbx
        align 16
        ;aIn:
            vmovapd ymm0, ymmword ptr [rdx]
            ;vmovapd ymm1, ymmword ptr [rdx + rbx + 4]
            vmovapd ymm2, ymmword ptr [r8]
            ;vmovapd ymm3, ymmword ptr [r8 + rbx + 4]

            ; Intel syntax puts the destination first: ymm3 = ymm2 + ymm0.
            ; The previous operand order wrote the sum to ymm0 and then
            ; stored the never-written ymm3.
            vaddpd ymm3, ymm2, ymm0

            vmovapd ymmword ptr [rcx], ymm3
        ;inc rbx
        ;cmp rbx, qword ptr [r9]
        ;jl aIn
        vzeroupper      ; clear upper ymm halves before returning to non-AVX code
        mov rax, rcx    ; return the address of the output vector
    ret
    addVec endp
end

I would also like to get some other clarifications:

  • Are there eight 256-bit registers (ymm0-ymm7) for each core of my processor, or only eight in total?
  • Do all the other registers, such as rax, rbx, etc., exist once in total or once per core?
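  • Since I can process 4 doubles per cycle with the SIMD unit of a single core, can I execute another instruction in the same cycle with the rest of the core? For example, can I add 5 doubles per cycle on one core (4 with SIMD + 1)?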
  • What if I do something like the following, without putting a loop inside my assembly function?:

    #pragma openmp parallel for

    for (size_t i = 0; i < reductions; i++)

    addVec(C + i, A + i, B + i)

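    Would this fork coreNumber + hyperThreading threads, each performing a SIMD add on four doubles? So in total 4 * coreNumber doubles per cycle? I cannot count the hyperThreading threads here, right?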


Update: can I do this?:

.data
;// C -> RCX
;// A -> RDX
;// B -> r8
.code
    addVec proc
        ; Variant posted to ask about throughput: two 256-bit groups per call.
        ;   rcx = C (output), rdx = A, r8 = B
        ; Loads [rdx], [rdx + 32], [r8] and [r8 + 32], applies vfmadd231pd to
        ; each pair, and stores the results at [rcx] and [rcx + 32].
        ; vfmadd231pd dst, s1, s2 computes dst = s1 * s2 + dst, so the stored
        ; values are B * ymm4 + A.
        ; NOTE(review): ymm4 is never loaded in this snippet, so the result
        ; depends on whatever the caller left in it - presumably it is meant
        ; to hold 1.0 in every lane; confirm.
        ; Author's question: can these 8 micro-ops issue in one cycle?
        ; One cycle 8 micro-op
            vmovapd ymm0, ymmword ptr [rdx]     ; 1 port
            vmovapd ymm1, ymmword ptr [rdx + 32]; 1 port
            vmovapd ymm2, ymmword ptr [r8]      ; 1 port
            vmovapd ymm3, ymmword ptr [r8 + 32] ; 1 port
            vfmadd231pd ymm0, ymm2, ymm4        ; 1 port
            vfmadd231pd ymm1, ymm3, ymm4        ; 1 port
            vmovapd ymmword ptr [rcx], ymm0     ; 1 port
            vmovapd ymmword ptr [rcx + 32], ymm1; 1 port

        ; Return the address of the output vector
        mov rax, rcx                            ; 1 port ?
    ret
    addVec endp
end

, , ?

.data
;// C -> RCX
;// A -> RDX
;// B -> r8
.code
    addVec proc
        ; Variant posted to ask about throughput: one 256-bit group per call.
        ;   rcx = C (output), rdx = A, r8 = B
        ; Loads [rdx] into ymm0 and [r8] into ymm1, applies vfmadd231pd and
        ; stores ymm0 at [rcx].
        ; vfmadd231pd dst, s1, s2 computes dst = s1 * s2 + dst, so the stored
        ; value is B * ymm2 + A.
        ; NOTE(review): ymm2 is never loaded in this snippet, so the result
        ; depends on whatever the caller left in it - presumably it is meant
        ; to hold 1.0 in every lane; confirm.
        ;align 16
        ; Author's question: can these 5 micro-ops issue in one cycle?
        ; One cycle 5 micro-op ?
        vmovapd ymm0, ymmword ptr [rdx]     ; 1 port
        vmovapd ymm1, ymmword ptr [r8]      ; 1 port
        vfmadd231pd ymm0, ymm1, ymm2        ; 1 port
        vmovapd ymmword ptr [rcx], ymm0     ; 1 port

        ; Return the address of the output vector
        mov rax, rcx                        ; 1 port ?
    ret
    addVec endp
end
+4
1

, , , .

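The reason your code gets the wrong result is that the operands in your assembly are in the wrong order. You are using Intel syntax, in which the destination comes before the sources. So in your original .asm code you should change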

vaddpd ymm0, ymm2, ymm3

 vaddpd ymm3, ymm2, ymm0

- , .

extern "C" double *addVec(double * __restrict C, double * __restrict A, double * __restrict B, size_t &N) {
    __m256d x = _mm256_load_pd((const double*)A);
    __m256d y = _mm256_load_pd((const double*)B);
    __m256d z = _mm256_add_pd(x,y);
    _mm256_store_pd((double*)C, z);
    return C;
}

Compiling this with GCC on Linux using g++ -S -O3 -mavx -masm=intel -mabi=ms foo.cpp gives:

vmovapd ymm0, YMMWORD PTR [rdx]
mov     rax, rcx
vaddpd  ymm0, ymm0, YMMWORD PTR [r8]
vmovapd YMMWORD PTR [rcx], ymm0
vzeroupper
ret

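The instruction vaddpd ymm0, ymm0, YMMWORD PTR [r8] fuses the load and the addition into one fused micro-op. When I use this function with your code, it prints 2, 4, 6, 8.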

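You can find source code that sums two arrays x and y and writes the result to an array z at l1-memory-bandwidth-50-drop-in-efficiency-using-addresses-which-differ-by-4096. That code uses intrinsics. Disassemble it with gcc -S or objdump -d. Another example that does almost the same thing, written in assembly, is at obtaining-peak-bandwidth-on-haswell-in-the-l1-cache-only-getting-62. In the file triad_fma_asm.asm, change the line pi: dd 3.14159 to pi: dd 1.0. Note that these examples use single-precision floats (dd), so if you want doubles you will have to make the necessary changes.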

The answers to your other questions:

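  • Each core of your processor is a physically distinct unit with its own set of registers. Each core has 16 general-purpose registers (e.g., rax, rbx, r8, r9, ...) and several special-purpose registers (e.g., RFLAGS). In 32-bit mode each core has eight 256-bit registers, and in 64-bit mode it has sixteen 256-bit registers. When AVX-512 is available, there are thirty-two 512-bit registers (but only eight in 32-bit mode).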

, , , .

  1. . .

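  2. Processors from Core2 (2006) through Haswell can process at most four µops per clock cycle. However, using two techniques called micro-op fusion and macro-op fusion, it is possible to reach six micro-ops per clock cycle on Haswell.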

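Micro-op fusion can fuse, for example, a load and an addition into one so-called fused micro-op, but each of the two micro-ops still needs its own port. Macro-op fusion can fuse, for example, a scalar addition and a jump into one micro-op that needs only one port. Macro-op fusion, in other words, is essentially two for one.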

Haswell has eight ports. You can get six micro-ops in one clock cycle, using seven ports, like this:

256-load + 256-FMA    //one fused µop using two ports
256-load + 256-FMA    //one fused µop using two ports
256-store             //one µop using two ports
64-bit add + jump     //one µop using one port

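So, in fact, each Haswell core can perform sixteen double-precision operations (four multiplications and four additions for each FMA), two 256-bit loads, one 256-bit store, and one 64-bit add and branch in one clock cycle. In the question obtaining-peak-bandwidth-on-haswell-in-the-l1-cache-only-getting-62 I obtained (in theory) five micro-ops in one clock cycle using six ports. In practice, however, this is difficult to achieve on Haswell.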

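Your particular operation, which reads two arrays and writes one, is limited by the two loads per clock cycle, so it can issue only one FMA per clock cycle. The best it can do is therefore four doubles per clock cycle.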

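  1. If you parallelize your code correctly and your processor has four physical cores, you could reach 64 double-precision floating-point operations (2FMA * 4cores) in one clock cycle. That is the theoretical best for some operations - but not for the operation in your question.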

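There is one thing Intel does not like people to talk about much, however. Most operations are memory-bandwidth bound and cannot benefit much from parallelization. That includes the operation in your question. So although Intel brings out new technology every few years (e.g., AVX, FMA, AVX512, wider data buses) that doubles peak performance each time, the average benefit in practice is more linear than exponential, and it has been that way for several years.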

+7

All Articles