Effective complex arithmetic in x86 assembly

Consider the following program:

for i=1 to 10000000 do
  z <- z*z + c

where zand care complex numbers.

What are the effective implementations of x86 assembler for this program using x87 against SSE and double precision single arithmetic?

EDIT I know that I can write it in another language and trust the compiler to create the optimal machine code for me, but I do this to learn how to write the optimal x86 assembler myself. I have already reviewed the code generated gcc -O2, and I assume that there are many opportunities for improvement, but I am not good enough at writing the optimal x86 assembler manually, so I ask for help here.

+5
source share
3 answers

- SSE , .

temp.re = z.re * z.re - z.im * z.im;
temp.im = 2.0 * z.re * z.im;
z.re = temp.re + c.re;
z.im = temp.im + c.im;

, (_mm_mul_ps) (_mm_hadd_ps).

, , .

, x86 FPU, SSE - , , .


SSE - , - , , gcc-O3, gcc SSE :

static Complex loop_simd(const Complex z0, const Complex c, const int n)
{
    __m128 vz = _mm_set_ps(z0.im, z0.re, z0.im, z0.re);
    const __m128 vc = _mm_set_ps(0.0f, 0.0f, c.im, c.re);
    const __m128 vs = _mm_set_ps(0.0f, 0.0f, -0.0f, 0.0f);
    Complex z[2];
    int i;

    for (i = 0; i < n; ++i)
    {
        __m128 vtemp;

        vtemp = _mm_shuffle_ps(vz, vz, 0x16); // temp = { z.re, z.im, z.im, z.re }
        vtemp = _mm_xor_ps(vtemp, vs);        // temp = { z.re, -z.im, z.im, z.re }
        vtemp = _mm_mul_ps(vtemp, vz);        // temp = { z.re * z.re, - z.im * z.im, z.re * z.im, z.im * z.re }
        vtemp = _mm_hadd_ps(vtemp, vtemp);    // temp = { z.re * z.re - z.im * z.im, 2 * z.re * z.im, ... }
        vz = _mm_add_ps(vtemp, vc);           // temp = { z.re * z.re - z.im * z.im + c.re, 2 * z.re * z.im + c.im, ... }
    }
    _mm_storeu_ps(&z[0].re, vz);
    return z[0];
}

, - 6 SSE ( 5) + :

L4:
    movaps  %xmm0, %xmm1
    shufps  $22, %xmm0, %xmm1
    xorps   %xmm3, %xmm1
    mulps   %xmm1, %xmm0
    haddps  %xmm0, %xmm0
    addps   %xmm2, %xmm0
    incl    %eax
    cmpl    %edi, %eax
    jne L4
L2:
+5

. z c (, mandelbrot), SSE. Paul R, :

__m128 z_im, z_re, c_im, c_re; //Four z and c values packed
__m128 re = _mm_sub_ps(_mm_mul_ps(z_re, z_re), _mm_mul_ps(z_im, z_im));
__m128 im = _mm_mul_ps(z_re, z_im);
im = _mm_add_ps(im, im); // Multiply by two
z_re = _mm_add_ps(re, c_re);
z_im = _mm_add_ps(im, c_im);
+4

Z = Z * Z + C

.

, . Xaos Fractint.

+1

All Articles