When I compile this program with `arm-elf-gcc-4.5 -O3 -march=armv7-a -mthumb -mfpu=neon -mfloat-abi=softfp`:
#include <arm_neon.h>

// Cross product of the first three lanes of v1 and v2, built from 64-bit NEON
// operations. Going by the local names, lanes 0..2 hold x, y and z.
// The returned vector is {x, y, z, z}: its last lane repeats the z component.
extern float32x4_t cross(const float32x4_t& v1, const float32x4_t& v2)
{
    // Interleave each input with itself so every component shows up as a
    // duplicated pair; judging by the names, val[0] holds {x, x, y, y} and
    // the low half of val[1] holds {z, z}.
    float32x4x2_t xxyyzz1(vzipq_f32(v1, v1)),
                  xxyyzz2(vzipq_f32(v2, v2));

    // Pull the duplicated pairs out as 64-bit vectors, one per component.
    // The high half of val[1] is never extracted.
    float32x2_t xx1(vget_low_f32(xxyyzz1.val[0])),
                yy1(vget_high_f32(xxyyzz1.val[0])),
                zz1(vget_low_f32(xxyyzz1.val[1])),
                xx2(vget_low_f32(xxyyzz2.val[0])),
                yy2(vget_high_f32(xxyyzz2.val[0])),
                zz2(vget_low_f32(xxyyzz2.val[1]));

    // Each component is a multiply followed by a multiply-subtract, i.e.
    // x = y1*z2 - z1*y2, y = z1*x2 - x1*z2, z = x1*y2 - y1*x2
    // (vmls_f32(a, b, c) yields a - b*c per lane).
    float32x2_t x(vmls_f32(vmul_f32(yy1, zz2), zz1, yy2)),
                y(vmls_f32(vmul_f32(zz1, xx2), xx1, zz2)),
                z(vmls_f32(vmul_f32(xx1, yy2), yy1, xx2));

    // Un-zip x and y to get one {x, y} pair for the low half, then append the
    // duplicated z pair as the high half.
    return vcombine_f32(vuzp_f32(x, y).val[0], z);
}
... this is what I get. Note the two useless instructions marked with `@<<<`:
_Z5crossRK19__simd128_float32_tS1_:
    @ Load the two 128-bit operands through the pointers in r0 and r1.
    vldmia r0, {d16-d17}
    vldmia r1, {d22-d23}
    @ Copy each operand so it can be zipped with itself.
    vmov q10, q8 @ v4sf
    vmov q9, q11 @ v4sf
    vzip.32 q8, q10
    vzip.32 q11, q9
    vmov d24, d17
    vmov d21, d22
    vmov d22, d23
    @ Multiply / multiply-subtract pairs that form the three components.
    vmul.f32 d17, d24, d18
    vmul.f32 d19, d20, d21
    vmls.f32 d19, d16, d18
    vmls.f32 d17, d20, d22
    vmul.f32 d16, d16, d22
    vuzp.32 d17, d19
    vmls.f32 d16, d24, d21
    @ Stack pointer lowered by 80 bytes; no instruction in this listing
    @ addresses memory through sp.
    sub sp, sp, #80 @<<<
    vswp d17, d16
    @ Copy the result held in d16-d17 into core registers r0-r3.
    vmov r0, r1, d16 @ v4sf
    vmov r2, r3, d17
    @ Stack pointer restored by the same 80 bytes.
    add sp, sp, #80 @<<<
    @ NOTE(review): the operand of this bx appears to be missing here
    @ (presumably lr) - confirm against the original compiler output.
    bx
The stack is never accessed, yet the stack pointer is decremented and then incremented by the same amount. Why?
If I change the source code to emit an asm comment at the end of the prologue and at the beginning of the epilogue, for example:
#include <arm_neon.h>

// Same cross-product routine, instrumented with two assembler-comment markers
// so that the start and end of the function body can be located in the
// generated assembly. Going by the local names, lanes 0..2 hold x, y and z;
// the returned vector is {x, y, z, z}.
extern float32x4_t cross(const float32x4_t& v1, const float32x4_t& v2)
{
    // Emits only the text "# End of prologue" into the assembly output;
    // volatile keeps the statement from being dropped.
    asm volatile("# End of prologue");

    // Interleave each input with itself so every component shows up as a
    // duplicated pair; judging by the names, val[0] holds {x, x, y, y} and
    // the low half of val[1] holds {z, z}.
    float32x4x2_t xxyyzz1(vzipq_f32(v1, v1)),
                  xxyyzz2(vzipq_f32(v2, v2));

    // Pull the duplicated pairs out as 64-bit vectors, one per component.
    float32x2_t xx1(vget_low_f32(xxyyzz1.val[0])),
                yy1(vget_high_f32(xxyyzz1.val[0])),
                zz1(vget_low_f32(xxyyzz1.val[1])),
                xx2(vget_low_f32(xxyyzz2.val[0])),
                yy2(vget_high_f32(xxyyzz2.val[0])),
                zz2(vget_low_f32(xxyyzz2.val[1]));

    // x = y1*z2 - z1*y2, y = z1*x2 - x1*z2, z = x1*y2 - y1*x2
    // (vmls_f32(a, b, c) yields a - b*c per lane).
    float32x2_t x(vmls_f32(vmul_f32(yy1, zz2), zz1, yy2)),
                y(vmls_f32(vmul_f32(zz1, xx2), xx1, zz2)),
                z(vmls_f32(vmul_f32(xx1, yy2), yy1, xx2));

    // Low half: one {x, y} pair un-zipped from x and y; high half: the z pair.
    // Stored in a named local so the second marker can sit between the
    // computation and the return.
    float32x4_t res(vcombine_f32(vuzp_f32(x, y).val[0], z));

    // Emits only the text "# Start of epilogue" into the assembly output.
    asm volatile("# Start of epilogue");
    return res;
}
Then I get a slightly different version:
_Z5crossRK19__simd128_float32_tS1_:
    @ Stack pointer lowered by 80 bytes before the first marker; no
    @ instruction in this listing addresses memory through sp.
    sub sp, sp, #80
    # End of prologue
    @ Load the two 128-bit operands through the pointers in r0 and r1.
    vldmia r0, {d16-d17}
    vldmia r1, {d22-d23}
    @ Copy each operand so it can be zipped with itself.
    vmov q10, q8 @ v4sf
    vmov q9, q11 @ v4sf
    vzip.32 q8, q10
    vzip.32 q11, q9
    vmov d24, d17
    vmov d21, d22
    vmov d22, d23
    @ Multiply / multiply-subtract pairs that form the three components.
    vmul.f32 d17, d24, d18
    vmul.f32 d19, d20, d21
    vmls.f32 d19, d16, d18
    vmls.f32 d17, d20, d22
    vmul.f32 d16, d16, d22
    vuzp.32 d17, d19
    vmls.f32 d16, d24, d21
    vswp d17, d16
    # Start of epilogue
    @ Copy the result held in d16-d17 into core registers r0-r3.
    vmov r0, r1, d16 @ v4sf
    vmov r2, r3, d17
    @ Stack pointer restored by the same 80 bytes after the second marker.
    add sp, sp, #80
    bx lr
The decrement and increment of the stack pointer are clearly part of the prologue and epilogue, and they are emitted even though the stack is not used. Is this required in order to conform to some standard, or is it a bug in GCC's optimizer?
EDIT: The compiler is arm-elf-gcc-4.5 (GCC) 4.5.0, configured with: /opt/local/var/macports/build/_opt_local_var_macports_sources_rsync.macports.org_release_ports_cross_arm-elf-gcc/work/gcc-4.5.0/configure --prefix=/opt/local --infodir=/opt/local/share/info --mandir=/opt/local/share/man --target=arm-elf --program-prefix=arm-elf- --program-suffix=-4.5 --with-included-gettext --enable-obsolete --with-newlib --disable-__cxa_atexit --enable-multilib --enable-biendian --disable-libgfortran --with-gxx-include-dir=/opt/local/arm-elf/include/c++/4.5.0/ --enable-languages=c,c++,objc --build=x86_64-apple-darwin10 --enable-fpu
EDIT: I was able to narrow the problem down using the following C source. It only happens when temporaries of vector-array types are used, such as float32x4x2_t, which is declared as struct { float32x4_t val[2]; }, even if these temporaries end up in registers. I believe this is a bug, so I have reported it.
#include <arm_neon.h>

// This one is ok
// Returns the vaddq_f32 sum of the two vectors that v1 and v2 point to.
// It uses no float32x4x2_t temporaries, and the assembly recorded below
// contains no stack-pointer adjustment.
extern float32x4_t add(float32x4_t* v1, float32x4_t* v2)
{
    return vaddq_f32(*v1, *v2);
// Disabled block: the assembly the author reports for this function.
#if 0
    produced assembly:
    add:
        vldmia r0, {d16-d17}
        vldmia r1, {d18-d19}
        vadd.f32 q8, q8, q9
        vmov r0, r1, d16
        vmov r2, r3, d17
        bx lr
#endif
}

// This one uses float32x4x2_t temporaries and has the bug
// Cross product of the first three lanes of *v1 and *v2. Going by the local
// names, lanes 0..2 hold x, y and z; the returned vector is {x, y, z, z}.
extern float32x4_t cross(float32x4_t* v1, float32x4_t* v2)
{
    // Interleave each input with itself so every component shows up as a
    // duplicated pair; judging by the names, val[0] holds {x, x, y, y} and
    // the low half of val[1] holds {z, z}.
    float32x4x2_t xxyyzz1=vzipq_f32(*v1, *v1),
                  xxyyzz2=vzipq_f32(*v2, *v2);

    // Pull the duplicated pairs out as 64-bit vectors, one per component.
    float32x2_t xx1=vget_low_f32(xxyyzz1.val[0]),
                yy1=vget_high_f32(xxyyzz1.val[0]),
                zz1=vget_low_f32(xxyyzz1.val[1]),
                xx2=vget_low_f32(xxyyzz2.val[0]),
                yy2=vget_high_f32(xxyyzz2.val[0]),
                zz2=vget_low_f32(xxyyzz2.val[1]);

    // x = y1*z2 - z1*y2, y = z1*x2 - x1*z2, z = x1*y2 - y1*x2
    // (vmls_f32(a, b, c) yields a - b*c per lane).
    float32x2_t x=vmls_f32(vmul_f32(yy1, zz2), zz1, yy2),
                y=vmls_f32(vmul_f32(zz1, xx2), xx1, zz2),
                z=vmls_f32(vmul_f32(xx1, yy2), yy1, xx2);

    // Low half: one {x, y} pair un-zipped from x and y; high half: the z pair.
    return vcombine_f32(vuzp_f32(x, y).val[0], z);
// Disabled block: the assembly the author reports for this function. The
// two marked lines lower and restore sp by 48 bytes, and nothing in between
// addresses memory through sp.
#if 0
    produced assembly:
    cross:
        vldmia r0, {d18-d19}
        vldmia r1, {d16-d17}
        vmov q10, q9
        vmov q11, q8
        vzip.32 q9, q10
        vzip.32 q8, q11
        vmov d24, d19
        vmov d21, d16
        vmov d16, d17
        vmul.f32 d19, d20, d21
        vmul.f32 d17, d24, d22
        vmls.f32 d17, d20, d16
        vmls.f32 d19, d18, d22
        vmul.f32 d16, d18, d16
        vuzp.32 d17, d19
        vmls.f32 d16, d24, d21
        sub sp, sp, #48 @ here
        vswp d17, d16
        vmov r0, r1, d16
        vmov r2, r3, d17
        add sp, sp, #48 @ and here
        bx lr
#endif
}