- SSE , .
/* One iteration step z <- z*z + c, written out on real and imaginary parts. */
temp.re = z.re * z.re - z.im * z.im;   /* Re(z*z) */
temp.im = 2.0 * z.re * z.im;           /* Im(z*z) */
z.re = temp.re + c.re;                 /* add c component-wise */
z.im = temp.im + c.im;
, (_mm_mul_ps) (_mm_hadd_ps).
, , .
, x86 FPU, SSE - , , .
SSE - , - , , gcc-O3, gcc SSE :
/*
 * Iterate z <- z*z + c exactly n times, starting from z0, using SSE/SSE3
 * intrinsics, and return the final z.  For n <= 0 the loop body never runs
 * and z0 is returned unchanged.
 *
 * Lane layout (lowest lane first): vz = { re, im, re, im }.  Both halves
 * must hold a full copy of z on entry to every iteration, because the
 * shuffle and the multiply below read all four lanes.
 *
 * NOTE(review): the final 4-float store into Complex[2] assumes Complex is
 * two contiguous floats { re, im } -- confirm against its declaration.
 */
static Complex loop_simd(const Complex z0, const Complex c, const int n)
{
    /* { z.re, z.im, z.re, z.im } */
    __m128 vz = _mm_set_ps(z0.im, z0.re, z0.im, z0.re);

    /*
     * { c.re, c.im, c.re, c.im }.  c has to be present in BOTH halves:
     * with zeros in the upper half, lanes 2-3 of vz would hold z*z without
     * the "+ c" after the first pass, and since shuffle 0x16 picks lane 2
     * and the multiply uses lanes 2-3, every later iteration would be wrong.
     */
    const __m128 vc = _mm_set_ps(c.im, c.re, c.im, c.re);

    /* Sign-bit mask for lane 1 only: XOR with it negates that lane. */
    const __m128 vs = _mm_set_ps(0.0f, 0.0f, -0.0f, 0.0f);

    Complex z[2];   /* room for all four lanes of the final store */
    int i;

    for (i = 0; i < n; ++i)
    {
        __m128 vtemp;

        /* { vz[2], vz[1], vz[1], vz[0] } = { re, im, im, re } */
        vtemp = _mm_shuffle_ps(vz, vz, 0x16);
        /* { re, -im, im, re } */
        vtemp = _mm_xor_ps(vtemp, vs);
        /* { re*re, -im*im, im*re, re*im } */
        vtemp = _mm_mul_ps(vtemp, vz);
        /* { re*re - im*im, 2*re*im, re*re - im*im, 2*re*im } */
        vtemp = _mm_hadd_ps(vtemp, vtemp);
        /* z*z + c in both halves, restoring the { re, im, re, im } layout */
        vz = _mm_add_ps(vtemp, vc);
    }

    /* Unaligned store of all four lanes; only the low pair (z[0]) is returned. */
    _mm_storeu_ps(&z[0].re, vz);
    return z[0];
}
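Note that the inner loop compiles to just 6 SSE instructions (the C source uses 5 intrinsics; the extra one is a register copy) + a little loop bookkeeping: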
# Inner loop of loop_simd as emitted by gcc (AT&T syntax: op src, dst).
# Register setup happens before this listing; presumably xmm0 = vz,
# xmm2 = vc, xmm3 = vs, eax = i, edi = n -- confirm against the full output.
L4:
movaps %xmm0, %xmm1          # xmm1 = copy of xmm0 (shufps overwrites its destination)
shufps $22, %xmm0, %xmm1     # imm 22 = 0x16: xmm1 = { x[2], x[1], x[1], x[0] } of the old xmm0
xorps %xmm3, %xmm1           # flip sign bits selected by the mask in xmm3
mulps %xmm1, %xmm0           # xmm0 = xmm0 * xmm1, lane by lane
haddps %xmm0, %xmm0          # xmm0 = { x0+x1, x2+x3, x0+x1, x2+x3 }
addps %xmm2, %xmm0           # xmm0 += xmm2
incl %eax                    # advance the loop counter
cmpl %edi, %eax              # compare counter with the bound in edi
jne L4                       # repeat until they are equal
L2: