How to multiply two quaternions with minimal instructions?

After some thought, I came up with the following code to multiply two quaternions using SSE:

#include <pmmintrin.h> /* SSE3 intrinsics */ /* multiplication of two quaternions (x, y, z, w) x (a, b, c, d) */ __m128 _mm_cross4_ps(__m128 xyzw, __m128 abcd) { /* The product of two quaternions is: */ /* (X,Y,Z,W) = (xd+yc-zb+wa, -xc+yd+za+wb, xb-ya+zd+wc, -xa-yb-zc+wd) */ __m128 wzyx = _mm_shuffle_ps(xyzw, xyzw, _MM_SHUFFLE(0,1,2,3)); __m128 baba = _mm_shuffle_ps(abcd, abcd, _MM_SHUFFLE(0,1,0,1)); __m128 dcdc = _mm_shuffle_ps(abcd, abcd, _MM_SHUFFLE(2,3,2,3)); /* variable names below are for parts of componens of result (X,Y,Z,W) */ /* nX stands for -X and similarly for the other components */ /* znxwy = (xb - ya, zb - wa, wd - zc, yd - xc) */ __m128 ZnXWY = _mm_hsub_ps(_mm_mul_ps(xyzw, baba), _mm_mul_ps(wzyx, dcdc)); /* xzynw = (xd + yc, zd + wc, wb + za, yb + xa) */ __m128 XZYnW = _mm_hadd_ps(_mm_mul_ps(xyzw, dcdc), _mm_mul_ps(wzyx, baba)); /* _mm_shuffle_ps(XZYnW, ZnXWY, _MM_SHUFFLE(3,2,1,0)) */ /* = (xd + yc, zd + wc, wd - zc, yd - xc) */ /* _mm_shuffle_ps(ZnXWY, XZYnW, _MM_SHUFFLE(2,3,0,1)) */ /* = (zb - wa, xb - ya, yb + xa, wb + za) */ /* _mm_addsub_ps adds elements 1 and 3 and subtracts elements 0 and 2, so we get: */ /* _mm_addsub_ps(*, *) = (xd+yc-zb+wa, xb-ya+zd+wc, wd-zc+yb+xa, yd-xc+wb+za) */ __m128 XZWY = _mm_addsub_ps(_mm_shuffle_ps(XZYnW, ZnXWY, _MM_SHUFFLE(3,2,1,0)), _mm_shuffle_ps(ZnXWY, XZYnW, _MM_SHUFFLE(2,3,0,1))); /* now we only need to shuffle the components in place and return the result */ return _mm_shuffle_ps(XZWY, XZWY, _MM_SHUFFLE(2,1,3,0)); /* operations: 6 shuffles, 4 multiplications, 3 compound additions/subtractions */ } 

I expected the assembly to have a minimal amount of instructions. However, when I compile it to build with gcc -msse3 -S, the resulting function has 67 commands:

  .text .globl __Z13_mm_cross4_psU8__vectorfS_ __Z13_mm_cross4_psU8__vectorfS_: LFB594: pushq %rbp LCFI0: movq %rsp, %rbp LCFI1: subq $232, %rsp movaps %xmm0, -336(%rbp) movaps %xmm1, -352(%rbp) movaps -336(%rbp), %xmm0 movaps -336(%rbp), %xmm1 shufps $27, %xmm1, %xmm0 movaps %xmm0, -16(%rbp) movaps -352(%rbp), %xmm0 movaps -352(%rbp), %xmm1 shufps $17, %xmm1, %xmm0 movaps %xmm0, -32(%rbp) movaps -352(%rbp), %xmm0 movaps -352(%rbp), %xmm1 shufps $187, %xmm1, %xmm0 movaps %xmm0, -48(%rbp) movaps -16(%rbp), %xmm0 movaps %xmm0, -112(%rbp) movaps -48(%rbp), %xmm0 movaps %xmm0, -128(%rbp) movaps -128(%rbp), %xmm0 movaps -112(%rbp), %xmm1 mulps %xmm1, %xmm0 movaps -336(%rbp), %xmm1 movaps %xmm1, -144(%rbp) movaps -32(%rbp), %xmm1 movaps %xmm1, -160(%rbp) movaps -160(%rbp), %xmm1 movaps -144(%rbp), %xmm2 mulps %xmm2, %xmm1 movaps %xmm1, -176(%rbp) movaps %xmm0, -192(%rbp) movaps -176(%rbp), %xmm0 hsubps -192(%rbp), %xmm0 movaps %xmm0, -64(%rbp) movaps -16(%rbp), %xmm0 movaps %xmm0, -208(%rbp) movaps -32(%rbp), %xmm0 movaps %xmm0, -224(%rbp) movaps -224(%rbp), %xmm0 movaps -208(%rbp), %xmm1 mulps %xmm1, %xmm0 movaps -336(%rbp), %xmm1 movaps %xmm1, -240(%rbp) movaps -48(%rbp), %xmm1 movaps %xmm1, -256(%rbp) movaps -256(%rbp), %xmm1 movaps -240(%rbp), %xmm2 mulps %xmm2, %xmm1 movaps %xmm1, -272(%rbp) movaps %xmm0, -288(%rbp) movaps -272(%rbp), %xmm0 haddps -288(%rbp), %xmm0 movaps %xmm0, -80(%rbp) movaps -64(%rbp), %xmm0 movaps -80(%rbp), %xmm1 shufps $177, %xmm1, %xmm0 movaps -80(%rbp), %xmm1 movaps -64(%rbp), %xmm2 shufps $228, %xmm2, %xmm1 movaps %xmm1, -304(%rbp) movaps %xmm0, -320(%rbp) movaps -304(%rbp), %xmm0 addsubps -320(%rbp), %xmm0 movaps %xmm0, -96(%rbp) movaps -96(%rbp), %xmm0 movaps -96(%rbp), %xmm1 shufps $156, %xmm1, %xmm0 leave LCFI2: ret 

What am I doing wrong? There should be a better way to shuffle items around without using so many movaps instructions.

+8
c assembly sse multiplication quaternions
source share
2 answers

Nothing. If I compile the code using gcc -msse3 -O1 -S, I get the following:

  .text .align 4,0x90 .globl __Z13_mm_cross4_psU8__vectorfS_ __Z13_mm_cross4_psU8__vectorfS_: LFB644: movaps %xmm0, %xmm5 movaps %xmm1, %xmm3 movaps %xmm0, %xmm2 shufps $27, %xmm0, %xmm5 movaps %xmm5, %xmm4 shufps $17, %xmm1, %xmm3 shufps $187, %xmm1, %xmm1 mulps %xmm3, %xmm2 mulps %xmm1, %xmm4 mulps %xmm5, %xmm3 mulps %xmm1, %xmm0 hsubps %xmm4, %xmm2 haddps %xmm3, %xmm0 movaps %xmm2, %xmm1 shufps $177, %xmm0, %xmm1 shufps $228, %xmm2, %xmm0 addsubps %xmm1, %xmm0 shufps $156, %xmm0, %xmm0 ret 

Here are just 18 instructions. This is what I expected in the beginning. Unfortunately.

+3
source share

You might be interested in the Agner Fog C ++ Vector Class Library . It provides the Quaternion4f and Quaternion4d classes (including the * and *= operators, of course) implemented using the SSE2 and AVX instruction sets, respectively. The library is an open source project, so you can delve into the code and find a good implementation example to create your function.

In the future, you can refer to the Optimizing routines in assembly language guide and provide an optimized, clean assembly of the implementation of the function or, aware of some low-level tricks, try redesigning the intrinsics approach in C.

+2
source share

All Articles