Now that I'm awake, here's my answer:
The bottleneck in your source code is almost certainly _mm_set_epi32. This single intrinsic compiles down to this mess in your assembly:
633415EC  xor         edi,edi
633415EE  movd        xmm3,edi
...
633415F6  xor         ebx,ebx
633415F8  movd        xmm4,edi
633415FC  movd        xmm5,ebx
63341600  movd        xmm0,esi
...
6334160B  punpckldq   xmm5,xmm3
6334160F  punpckldq   xmm0,xmm4
...
63341618  punpckldq   xmm0,xmm5
What is that, 9 instructions?!?!?! Pure overhead...
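For reference, this is the kind of construct that produces it. The sketch below is a hypothetical illustration only (e0..e3 are placeholder names, not your variables): building a vector from four separate 32-bit values with _mm_set_epi32 forces the compiler to move each value into an XMM register and merge them with punpckldq chains.

#include <emmintrin.h>

//  Hypothetical illustration only -- not the OP's actual code.
//  Each 32-bit value is moved from a GPR into an XMM register (movd)
//  and then merged via punpckldq, which is where the ~9 instructions come from.
static __m128i gather_four(int e0, int e1, int e2, int e3)
{
    return _mm_set_epi32(e3, e2, e1, e0);   // e0 lands in the lowest lane
}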
Another place that seems odd is that the compiler didn't merge the add and the load:
movdqa      xmm3,xmmword ptr [ecx-10h]
paddq       xmm0,xmm3
which should have been merged into:
paddq xmm0,xmmword ptr [ecx-10h]
I'm not sure whether the compiler has gone crazy or whether it actually has a legitimate reason for doing this... Either way, it's a small issue compared to _mm_set_epi32.
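At the intrinsic level, here is a minimal sketch of the pattern (my own illustration, not your code) that a compiler is normally able to fold into a paddq with a memory operand:

#include <emmintrin.h>

//  Sketch only: the load and the add are separate intrinsics, but the
//  compiler is free to fold the aligned load into paddq's memory operand,
//  i.e. emit "paddq xmm0, xmmword ptr [...]" instead of movdqa + paddq.
static __m128i add_from_memory(__m128i acc, const __m128i *src)
{
    return _mm_add_epi64(acc, _mm_load_si128(src));
}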
Disclaimer: The code I'm presenting here violates strict aliasing. But non-standard methods are often needed to achieve maximum performance.
Solution 1: No vectorization
This solution assumes that allZero really is all zeros.
The loop is actually simpler than it looks. Since there isn't much arithmetic, it may be better not to vectorize it at all:
//  Test Data
unsigned __int32 fragmentCoefficentVector = 1000000000;
__declspec(align(16)) int currentMessageGaloisFieldsArray_[8] = {10,11,12,13,14,15,16,17};
int *currentMessageGaloisFieldsArray = currentMessageGaloisFieldsArray_;

__m128i currentUnModdedGaloisFieldFragments_[8];
__m128i *currentUnModdedGaloisFieldFragments = currentUnModdedGaloisFieldFragments_;
memset(currentUnModdedGaloisFieldFragments,0,8 * sizeof(__m128i));

int elementIterations = 4;

//  The Loop
while (elementIterations > 0){
    elementIterations -= 1;

    //  Default 32 x 32 -> 64-bit multiply code
    unsigned __int64 r0 = currentMessageGaloisFieldsArray[0] * (unsigned __int64)fragmentCoefficentVector;
    unsigned __int64 r1 = currentMessageGaloisFieldsArray[1] * (unsigned __int64)fragmentCoefficentVector;

    //  Use this for Visual Studio. VS doesn't know how to optimize 32 x 32 -> 64-bit multiply
    //  unsigned __int64 r0 = __emulu(currentMessageGaloisFieldsArray[0], fragmentCoefficentVector);
    //  unsigned __int64 r1 = __emulu(currentMessageGaloisFieldsArray[1], fragmentCoefficentVector);

    ((__int64*)currentUnModdedGaloisFieldFragments)[0] += r0 & 0x00000000ffffffff;
    ((__int64*)currentUnModdedGaloisFieldFragments)[1] += r0 >> 32;
    ((__int64*)currentUnModdedGaloisFieldFragments)[2] += r1 & 0x00000000ffffffff;
    ((__int64*)currentUnModdedGaloisFieldFragments)[3] += r1 >> 32;

    currentMessageGaloisFieldsArray += 2;
    currentUnModdedGaloisFieldFragments += 2;
}
This is what it compiles to on x64:
$LL4@main:
        mov     ecx, DWORD PTR [rbx]
        mov     rax, r11
        add     r9, 32                ; 00000020H
        add     rbx, 8
        mul     rcx
        mov     ecx, DWORD PTR [rbx-4]
        mov     r8, rax
        mov     rax, r11
        mul     rcx
        mov     ecx, r8d
        shr     r8, 32                ; 00000020H
        add     QWORD PTR [r9-48], rcx
        add     QWORD PTR [r9-40], r8
        mov     ecx, eax
        shr     rax, 32               ; 00000020H
        add     QWORD PTR [r9-24], rax
        add     QWORD PTR [r9-32], rcx
        dec     r10
        jne     SHORT $LL4@main
and this is on x86:
$LL4@main:
        mov     eax, DWORD PTR [esi]
        mul     DWORD PTR _fragmentCoefficentVector$[esp+224]
        mov     ebx, eax
        mov     eax, DWORD PTR [esi+4]
        mov     DWORD PTR _r0$31463[esp+228], edx
        mul     DWORD PTR _fragmentCoefficentVector$[esp+224]
        add     DWORD PTR [ecx-16], ebx
        mov     ebx, DWORD PTR _r0$31463[esp+228]
        adc     DWORD PTR [ecx-12], edi
        add     DWORD PTR [ecx-8], ebx
        adc     DWORD PTR [ecx-4], edi
        add     DWORD PTR [ecx], eax
        adc     DWORD PTR [ecx+4], edi
        add     DWORD PTR [ecx+8], edx
        adc     DWORD PTR [ecx+12], edi
        add     esi, 8
        add     ecx, 32               ; 00000020H
        dec     DWORD PTR tv150[esp+224]
        jne     SHORT $LL4@main
It's possible that both of these are already faster than your original (SSE) code... On x64, unrolling it further will make it even better.
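To illustrate, here is a minimal sketch of unrolling Solution 1 by 2 (four input elements per pass). This is my own sketch, not part of the original answer; it reuses the variable names from the code above and assumes elementIterations is even (swap in __emulu on Visual Studio as shown earlier):

//  Sketch: Solution 1 unrolled by 2 -- processes 4 input elements per pass.
//  Assumes elementIterations is even; same strict-aliasing caveat as above.
while (elementIterations > 1){
    elementIterations -= 2;

    unsigned __int64 r0 = currentMessageGaloisFieldsArray[0] * (unsigned __int64)fragmentCoefficentVector;
    unsigned __int64 r1 = currentMessageGaloisFieldsArray[1] * (unsigned __int64)fragmentCoefficentVector;
    unsigned __int64 r2 = currentMessageGaloisFieldsArray[2] * (unsigned __int64)fragmentCoefficentVector;
    unsigned __int64 r3 = currentMessageGaloisFieldsArray[3] * (unsigned __int64)fragmentCoefficentVector;

    ((__int64*)currentUnModdedGaloisFieldFragments)[0] += r0 & 0x00000000ffffffff;
    ((__int64*)currentUnModdedGaloisFieldFragments)[1] += r0 >> 32;
    ((__int64*)currentUnModdedGaloisFieldFragments)[2] += r1 & 0x00000000ffffffff;
    ((__int64*)currentUnModdedGaloisFieldFragments)[3] += r1 >> 32;
    ((__int64*)currentUnModdedGaloisFieldFragments)[4] += r2 & 0x00000000ffffffff;
    ((__int64*)currentUnModdedGaloisFieldFragments)[5] += r2 >> 32;
    ((__int64*)currentUnModdedGaloisFieldFragments)[6] += r3 & 0x00000000ffffffff;
    ((__int64*)currentUnModdedGaloisFieldFragments)[7] += r3 >> 32;

    currentMessageGaloisFieldsArray += 4;
    currentUnModdedGaloisFieldFragments += 4;
}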
Solution 2: SSE2 Integer Shuffle
This solution unrolls the loop by 2 iterations:
//  Test Data
__m128i allZero = _mm_setzero_si128();
__m128i fragmentCoefficentVector = _mm_set1_epi32(1000000000);
__declspec(align(16)) int currentMessageGaloisFieldsArray_[8] = {10,11,12,13,14,15,16,17};
int *currentMessageGaloisFieldsArray = currentMessageGaloisFieldsArray_;

__m128i currentUnModdedGaloisFieldFragments_[8];
__m128i *currentUnModdedGaloisFieldFragments = currentUnModdedGaloisFieldFragments_;
memset(currentUnModdedGaloisFieldFragments,0,8 * sizeof(__m128i));

int elementIterations = 4;

//  The Loop
while(elementIterations > 1){
    elementIterations -= 2;

    //  Load 4 elements. If needed use unaligned load instead.
    //  messageField = {a, b, c, d}
    __m128i messageField = _mm_load_si128((__m128i*)currentMessageGaloisFieldsArray);

    //  Get into this form:
    //  values0 = {a, x, b, x}
    //  values1 = {c, x, d, x}
    __m128i values0 = _mm_shuffle_epi32(messageField,216);
    __m128i values1 = _mm_shuffle_epi32(messageField,114);

    //  Multiply by "fragmentCoefficentVector"
    values0 = _mm_mul_epu32(values0, fragmentCoefficentVector);
    values1 = _mm_mul_epu32(values1, fragmentCoefficentVector);

    __m128i halves0 = _mm_unpacklo_epi32(values0, allZero);
    __m128i halves1 = _mm_unpackhi_epi32(values0, allZero);
    __m128i halves2 = _mm_unpacklo_epi32(values1, allZero);
    __m128i halves3 = _mm_unpackhi_epi32(values1, allZero);

    halves0 = _mm_add_epi64(halves0, currentUnModdedGaloisFieldFragments[0]);
    halves1 = _mm_add_epi64(halves1, currentUnModdedGaloisFieldFragments[1]);
    halves2 = _mm_add_epi64(halves2, currentUnModdedGaloisFieldFragments[2]);
    halves3 = _mm_add_epi64(halves3, currentUnModdedGaloisFieldFragments[3]);

    currentUnModdedGaloisFieldFragments[0] = halves0;
    currentUnModdedGaloisFieldFragments[1] = halves1;
    currentUnModdedGaloisFieldFragments[2] = halves2;
    currentUnModdedGaloisFieldFragments[3] = halves3;

    currentMessageGaloisFieldsArray += 4;
    currentUnModdedGaloisFieldFragments += 4;
}
which compiles to this on x86 (the x64 version isn't much different):
$LL4@main:
        movdqa    xmm1, XMMWORD PTR [esi]
        pshufd    xmm0, xmm1, 216     ; 000000d8H
        pmuludq   xmm0, xmm3
        movdqa    xmm4, xmm0
        punpckhdq xmm0, xmm2
        paddq     xmm0, XMMWORD PTR [eax-16]
        pshufd    xmm1, xmm1, 114     ; 00000072H
        movdqa    XMMWORD PTR [eax-16], xmm0
        pmuludq   xmm1, xmm3
        movdqa    xmm0, xmm1
        punpckldq xmm4, xmm2
        paddq     xmm4, XMMWORD PTR [eax-32]
        punpckldq xmm0, xmm2
        paddq     xmm0, XMMWORD PTR [eax]
        punpckhdq xmm1, xmm2
        paddq     xmm1, XMMWORD PTR [eax+16]
        movdqa    XMMWORD PTR [eax-32], xmm4
        movdqa    XMMWORD PTR [eax], xmm0
        movdqa    XMMWORD PTR [eax+16], xmm1
        add       esi, 16             ; 00000010H
        add       eax, 64             ; 00000040H
        dec       ecx
        jne       SHORT $LL4@main
This is only slightly longer than the non-vectorized version for two iterations. It uses very few registers, so it can be unrolled further even on x86.
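One caveat worth making explicit (my addition, not part of the original answer): since this loop consumes two iterations per pass, an odd elementIterations leaves one pair of elements over. A scalar tail along the lines of Solution 1's loop body handles it:

//  Sketch: scalar tail for an odd iteration count -- one pass of
//  Solution 1's loop body, reusing the same variables and pointers.
if (elementIterations == 1){
    unsigned __int64 r0 = currentMessageGaloisFieldsArray[0] * (unsigned __int64)fragmentCoefficentVector;
    unsigned __int64 r1 = currentMessageGaloisFieldsArray[1] * (unsigned __int64)fragmentCoefficentVector;

    ((__int64*)currentUnModdedGaloisFieldFragments)[0] += r0 & 0x00000000ffffffff;
    ((__int64*)currentUnModdedGaloisFieldFragments)[1] += r0 >> 32;
    ((__int64*)currentUnModdedGaloisFieldFragments)[2] += r1 & 0x00000000ffffffff;
    ((__int64*)currentUnModdedGaloisFieldFragments)[3] += r1 >> 32;
}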
Explanations:
- As Paul R said, unrolling by two iterations lets you combine the initial load into a single SSE load. It also gets your data into the SSE registers to begin with.
- Since the data starts out in the SSE registers, _mm_set_epi32 (which compiles to ~9 instructions in your original code) can be replaced with a single _mm_shuffle_epi32. (The immediates 216 and 114 are decoded in the sketch below.)
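For readability, the two pshufd immediates used in Solution 2 can also be spelled with the _MM_SHUFFLE macro. This sketch is just a restatement of the constants 216 and 114 above, not a change to the code:

#include <emmintrin.h>

//  Equivalent spelling of the two shuffle immediates (illustration only):
//    216 == _MM_SHUFFLE(3,1,2,0): {a,b,c,d} -> {a,c,b,d}, so a and b sit in
//                                 the even lanes that pmuludq actually reads.
//    114 == _MM_SHUFFLE(1,3,0,2): {a,b,c,d} -> {c,a,d,b}, so c and d sit in
//                                 the even lanes.
static void split_for_pmuludq(__m128i messageField, __m128i *values0, __m128i *values1)
{
    *values0 = _mm_shuffle_epi32(messageField, _MM_SHUFFLE(3,1,2,0));   // = 216
    *values1 = _mm_shuffle_epi32(messageField, _MM_SHUFFLE(1,3,0,2));   // = 114
}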