Packing 10-bit values into a byte stream with SIMD

I am trying to pack 10-bit pixels into a contiguous stream of bytes using SIMD instructions. The code below does this "in principle", but the SIMD version is slower than the scalar version.

The problem is that I cannot find good gather/scatter operations to load the registers efficiently.

Any suggestions for improvement?

// SIMD_test.cpp : Defines the entry point for the console application.
//

#include "stdafx.h"
#include "Windows.h"
#include <tmmintrin.h>
#include <stdint.h>
#include <string.h>

// reference non-SIMD implementation that "works"
// 4 uint16 at a time as input, and 5 uint8 as output per loop iteration
void packSlow(uint16_t* ptr, uint8_t* streamBuffer, uint32_t NCOL)
{
    for(uint32_t j=0;j<NCOL;j+=4)
    {
        streamBuffer[0] = (uint8_t)(ptr[0]);
        streamBuffer[1] = (uint8_t)(((ptr[0]&0x3FF)>>8) | ((ptr[1]&0x3F) <<2));
        streamBuffer[2] = (uint8_t)(((ptr[1]&0x3FF)>>6) | ((ptr[2]&0x0F) <<4));
        streamBuffer[3] = (uint8_t)(((ptr[2]&0x3FF)>>4) | ((ptr[3]&0x03) <<6));
        streamBuffer[4] = (uint8_t)((ptr[3]&0x3FF)>>2);
        streamBuffer += 5;
        ptr += 4;
    }
}

// poorly written SIMD implementation. Attempts to do the same
// as packSlow, but 8 iterations at a time
void packFast(uint16_t* ptr, uint8_t* streamBuffer, uint32_t NCOL)
{
    const __m128i maska = _mm_set_epi16(0x3FF,0x3FF,0x3FF,0x3FF,0x3FF,0x3FF,0x3FF,0x3FF);
    const __m128i maskb = _mm_set_epi16(0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F);
    const __m128i maskc = _mm_set_epi16(0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F);
    const __m128i maskd = _mm_set_epi16(0x03,0x03,0x03,0x03,0x03,0x03,0x03,0x03);

    for(uint32_t j=0;j<NCOL;j+=4*8)
    {
        _mm_prefetch((const char*)(ptr+j),_MM_HINT_T0);
    }

    for(uint32_t j=0;j<NCOL;j+=4*8)
    {
        // this "fetch" stage is costly. Each term takes 2 cycles
        __m128i ptr0 = _mm_set_epi16(ptr[0],ptr[4],ptr[8], ptr[12],ptr[16],ptr[20],ptr[24],ptr[28]);
        __m128i ptr1 = _mm_set_epi16(ptr[1],ptr[5],ptr[9], ptr[13],ptr[17],ptr[21],ptr[25],ptr[29]);
        __m128i ptr2 = _mm_set_epi16(ptr[2],ptr[6],ptr[10],ptr[14],ptr[18],ptr[22],ptr[26],ptr[30]);
        __m128i ptr3 = _mm_set_epi16(ptr[3],ptr[7],ptr[11],ptr[15],ptr[19],ptr[23],ptr[27],ptr[31]);

        // I think this part is fairly well optimized
        __m128i streamBuffer0 = ptr0;
        __m128i streamBuffer1 = _mm_or_si128(_mm_srl_epi16(_mm_and_si128(ptr0, maska), _mm_set_epi32(0,0,0,8)),
                                             _mm_sll_epi16(_mm_and_si128(ptr1, maskb), _mm_set_epi32(0,0,0,2)));
        __m128i streamBuffer2 = _mm_or_si128(_mm_srl_epi16(_mm_and_si128(ptr1, maska), _mm_set_epi32(0,0,0,6)),
                                             _mm_sll_epi16(_mm_and_si128(ptr2, maskc), _mm_set_epi32(0,0,0,4)));
        __m128i streamBuffer3 = _mm_or_si128(_mm_srl_epi16(_mm_and_si128(ptr2, maska), _mm_set_epi32(0,0,0,4)),
                                             _mm_sll_epi16(_mm_and_si128(ptr3, maskd), _mm_set_epi32(0,0,0,6)));
        __m128i streamBuffer4 = _mm_srl_epi16(_mm_and_si128(ptr3, maska), _mm_set_epi32(0,0,0,2));

        // this again is terribly slow. ~2 cycles per byte output
        for(int j=15;j>=0;j-=2)
        {
            streamBuffer[0] = streamBuffer0.m128i_u8[j];
            streamBuffer[1] = streamBuffer1.m128i_u8[j];
            streamBuffer[2] = streamBuffer2.m128i_u8[j];
            streamBuffer[3] = streamBuffer3.m128i_u8[j];
            streamBuffer[4] = streamBuffer4.m128i_u8[j];
            streamBuffer += 5;
        }

        ptr += 32;
    }
}

int _tmain(int argc, _TCHAR* argv[])
{
    uint16_t pixels[512];
    uint8_t  packed1[512*10/8];
    uint8_t  packed2[512*10/8];

    for(int i=0;i<512;i++)
    {
        pixels[i] = i;
    }

    LARGE_INTEGER t0,t1,t2;

    QueryPerformanceCounter(&t0);

    for(int k=0;k<1000;k++)
        packSlow(pixels,packed1,512);

    QueryPerformanceCounter(&t1);

    for(int k=0;k<1000;k++)
        packFast(pixels,packed2,512);

    QueryPerformanceCounter(&t2);

    printf("%d %d\n",t1.QuadPart-t0.QuadPart,t2.QuadPart-t1.QuadPart);

    if (memcmp(packed1,packed2,sizeof(packed1)))
    {
        printf("failed\n");
    }

    return 0;
}
c++ x86 bit-manipulation simd
3 answers

Reading your code over again, it looks like you are almost certainly going to kill your load/store unit, and even the new AVX2 VGATHER[D/Q]P[D/S] instruction family will not completely relieve that. Even Haswell's implementation still issues a uop per loaded element, each of which hits the L1D TLB and cache regardless of locality, with the efficiency improvements not expected to show up until Skylake around 2016 at the earliest.

Your best bet at the moment is probably to do 16B register stores: build your streamBuffer values manually using register copies, _mm_shuffle_epi8() calls and _mm_or_si128() calls, then write the results out with full 16-byte stores.
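As a minimal sketch of that building block (assuming SSSE3 is available; the helper name low_bytes_of_words and the particular shuffle mask are just illustrative, not the full 10-bit pack): _mm_shuffle_epi8() moves arbitrary bytes of a register into new positions in one instruction, and several such shuffled registers can then be OR-ed together before a single 16-byte store.

#include <tmmintrin.h>   // SSSE3: _mm_shuffle_epi8

// Sketch only: compact the low byte of each of the eight 16-bit lanes of v
// into the low 8 bytes of the result; lanes selected with 0x80 become zero.
static inline __m128i low_bytes_of_words(__m128i v)
{
    const __m128i ctl = _mm_set_epi8(
        (char)0x80, (char)0x80, (char)0x80, (char)0x80,
        (char)0x80, (char)0x80, (char)0x80, (char)0x80,
        14, 12, 10, 8, 6, 4, 2, 0);
    return _mm_shuffle_epi8(v, ctl);
}

A real packer would combine a few of these shuffled/shifted terms with _mm_or_si128() and store the merged register once per 16 output bytes.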

In the near future, AVX2 will provide (and on some newer desktop parts already provides) the VPS[LL/RL/RA]V[D/Q] instructions, which shift each element by its own per-element amount; combined with a horizontal add, that could make this packing quite fast. In that case you could load your values with simple MOVDQU instructions, since you would be processing contiguous uint16_t input values in the same xmm register.
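A minimal sketch of such a per-element shift, assuming an AVX2-capable compiler and CPU (shift_each_lane and the 0/10/20/30 counts are illustrative examples of the per-lane offsets a 10-bit packer would want before a horizontal OR):

#include <immintrin.h>   // AVX2: _mm_sllv_epi32

// Sketch only: shift each 32-bit lane of v left by its own count (VPSLLVD).
static inline __m128i shift_each_lane(__m128i v)
{
    const __m128i counts = _mm_set_epi32(30, 20, 10, 0);  // per-lane shift amounts
    return _mm_sllv_epi32(v, counts);
}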

Also, consider reworking your prefetching. Your j-over-NCOL loop processes 64B, i.e. one cache line, per iteration, so you only need to issue a single prefetch for ptr + 32 at the start of your second loop body. You could even drop it entirely, since it is a simple forward scan that the hardware prefetcher will detect and automate for you after a very small number of iterations anyway.
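A minimal sketch of that loop shape (the packing body itself is omitted; pack_with_inline_prefetch is just an illustrative name):

#include <xmmintrin.h>   // _mm_prefetch
#include <stdint.h>

void pack_with_inline_prefetch(const uint16_t* ptr, uint8_t* streamBuffer, uint32_t NCOL)
{
    for (uint32_t j = 0; j < NCOL; j += 4 * 8)
    {
        // one prefetch of the next 64B line, instead of a separate prefetch loop
        _mm_prefetch((const char*)(ptr + 32), _MM_HINT_T0);

        // ... pack the 32 uint16_t samples at ptr into 40 output bytes here ...

        streamBuffer += 40;
        ptr += 32;
    }
}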


I have no experience with SSE. But I would try to optimize the code as follows.

// warning. This routine requires streamBuffer to have at least 3 extra spare bytes
// at the end to be used as scratch space. It will write 0 to those bytes.
// for example, streamBuffer needs to be 640+3 bytes of allocated memory if
// 512 10-bit samples are output.
void packSlow1(uint16_t* ptr, uint8_t* streamBuffer, uint32_t NCOL)
{
    for(uint32_t j=0;j<NCOL;j+=4*4)
    {
        uint64_t *dst;
        uint64_t src[4][4];

        // __m128i s01 = _mm_set_epi64(ptr[0], ptr[1]);
        // __m128i s23 = _mm_set_epi64(ptr[2], ptr[3]);
        // ---- or ----
        // __m128i s0123 = _mm_load_si128(ptr[0])
        // __m128i s01 = _?????_(s0123)  // some instruction to extract s01 from s0123
        // __m128i s23 = _?????_(s0123)  // some instruction to extract s23

        src[0][0] = ptr[0] & 0x3ff;
        src[0][1] = ptr[1] & 0x3ff;
        src[0][2] = ptr[2] & 0x3ff;
        src[0][3] = ptr[3] & 0x3ff;

        src[1][0] = ptr[4] & 0x3ff;
        src[1][1] = ptr[5] & 0x3ff;
        src[1][2] = ptr[6] & 0x3ff;
        src[1][3] = ptr[7] & 0x3ff;

        src[2][0] = ptr[8] & 0x3ff;
        src[2][1] = ptr[9] & 0x3ff;
        src[2][2] = ptr[10] & 0x3ff;
        src[2][3] = ptr[11] & 0x3ff;

        src[3][0] = ptr[12] & 0x3ff;
        src[3][1] = ptr[13] & 0x3ff;
        src[3][2] = ptr[14] & 0x3ff;
        src[3][3] = ptr[15] & 0x3ff;

        // looks like _mm_maskmoveu_si128 can store result efficiently

        dst = (uint64_t*)streamBuffer;
        dst[0] = src[0][0] | (src[0][1] << 10) | (src[0][2] << 20) | (src[0][3] << 30);

        dst = (uint64_t*)(streamBuffer + 5);
        dst[0] = src[1][0] | (src[1][1] << 10) | (src[1][2] << 20) | (src[1][3] << 30);

        dst = (uint64_t*)(streamBuffer + 10);
        dst[0] = src[2][0] | (src[2][1] << 10) | (src[2][2] << 20) | (src[2][3] << 30);

        dst = (uint64_t*)(streamBuffer + 15);
        dst[0] = src[3][0] | (src[3][1] << 10) | (src[3][2] << 20) | (src[3][3] << 30);

        streamBuffer += 5 * 4;
        ptr += 4 * 4;
    }
}
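As a minimal sketch of the _mm_maskmoveu_si128 idea mentioned in the comment above (illustrative helper name store_low5; note that MASKMOVDQU has non-temporal store semantics and is often slow in practice, so it would need measuring), such a store of only the 5 meaningful bytes could look like this:

#include <emmintrin.h>   // SSE2: _mm_maskmoveu_si128
#include <stdint.h>

// Sketch only: write bytes 0..4 of 'packed' to dst, leaving the other bytes untouched.
static inline void store_low5(uint8_t* dst, __m128i packed)
{
    const __m128i keep5 = _mm_set_epi8(
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        (char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80);
    _mm_maskmoveu_si128(packed, keep5, (char*)dst);
}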

UPDATE:

Benchmarks:

Ubuntu 12.04, x86_64 GNU/Linux, gcc v4.6.3 (Virtual Box)
Intel Core i7 (Macbook pro), compiled with -O3

5717633386 (1X)   : packSlow
3868744491 (1.4X) : packSlow1 (version from the post)
4471858853 (1.2X) : packFast2 (from Mark Lakata post)
1820784764 (3.1X) : packFast3 (version from the post)

Windows 8.1, x64, VS2012 Express
Intel Core i5 (Asus), compiled with standard 'Release' options and SSE2 enabled

00413185 (1X)   : packSlow
00782005 (0.5X) : packSlow1
00236639 (1.7X) : packFast2
00148906 (2.8X) : packFast3

I see completely different results on an Asus laptop with Windows 8.1 and VS Express 2012 (code compiled with -O2). packSlow1 is 2x slower than the original packSlow, while packFast2 is only 1.7X (not 2.9X) faster than packSlow. After investigating, I found the reason: the VC compiler was unable to keep all the constants for packFast2 in XMM registers, so it inserted additional memory accesses into the loop (see the generated assembly). These extra memory accesses explain the performance degradation.

To get more stable results, I increased the pixel buffer to 256x512 and increased the loop count from 1000 to 10000000/256.

Here is my version of the SSE-optimized function.

// warning. This routine requires streamBuffer to have at least 3 extra spare bytes
// at the end to be used as scratch space. It will write 0 to those bytes.
// for example, streamBuffer needs to be 640+3 bytes of allocated memory if
// 512 10-bit samples are output.
void packFast3(uint16_t* ptr, uint8_t* streamBuffer, uint32_t NCOL)
{
    const __m128i m0 = _mm_set_epi16(0, 0x3FF, 0, 0x3FF, 0, 0x3FF, 0, 0x3FF);
    const __m128i m1 = _mm_set_epi16(0x3FF, 0, 0x3FF, 0, 0x3FF, 0, 0x3FF, 0);
    const __m128i m2 = _mm_set_epi32(0, 0xFFFFFFFF, 0, 0xFFFFFFFF);
    const __m128i m3 = _mm_set_epi32(0xFFFFFFFF, 0, 0xFFFFFFFF, 0);
    const __m128i m4 = _mm_set_epi32(0, 0, 0xFFFFFFFF, 0xFFFFFFFF);
    const __m128i m5 = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0, 0);

    __m128i s0, t0, r0, x0, x1;

    // unrolled and normal loop gives the same result
    for(uint32_t j=0;j<NCOL;j+=8)
    {
        // load 8 samples into s0
        s0 = _mm_loadu_si128((__m128i*)ptr);            // s0=00070006_00050004_00030002_00010000

        // join 16-bit samples into 32-bit words
        x0 = _mm_and_si128(s0, m0);                     // x0=00000006_00000004_00000002_00000000
        x1 = _mm_and_si128(s0, m1);                     // x1=00070000_00050000_00030000_00010000
        t0 = _mm_or_si128(x0, _mm_srli_epi32(x1, 6));   // t0=00001c06_00001404_00000c02_00000400

        // join 32-bit words into 64-bit dwords
        x0 = _mm_and_si128(t0, m2);                     // x0=00000000_00001404_00000000_00000400
        x1 = _mm_and_si128(t0, m3);                     // x1=00001c06_00000000_00000c02_00000000
        t0 = _mm_or_si128(x0, _mm_srli_epi64(x1, 12));  // t0=00000001_c0601404_00000000_c0200400

        // join 64-bit dwords
        x0 = _mm_and_si128(t0, m4);                     // x0=00000000_00000000_00000000_c0200400
        x1 = _mm_and_si128(t0, m5);                     // x1=00000001_c0601404_00000000_00000000
        r0 = _mm_or_si128(x0, _mm_srli_si128(x1, 3));   // r0=00000000_000001c0_60140400_c0200400

        // and store result
        _mm_storeu_si128((__m128i*)streamBuffer, r0);

        streamBuffer += 10;
        ptr += 8;
    }
}

I came up with a “better” solution using SIMD, but it does not use parallelization, just more efficient loads and stores (I think).

I post it here for reference, not necessarily the best answer.

Tests (in arbitrary ticks)

gcc4.8.1 -O3   VS2012 /O2    Implementation
-------------------------------------------
369 (1X)       3394 (1X)     packSlow  (original code)
212 (1.7X)     2010 (1.7X)   packSlow  (from @alexander)
147 (2.5X)     1178 (2.9X)   packFast2 (below)

Here is the code. It is essentially @alexander's code, except using 128-bit registers instead of 64-bit registers, and unrolled 2x instead of 4x.

void packFast2(uint16_t* ptr, uint8_t* streamBuffer, uint32_t NCOL)
{
    const __m128i maska = _mm_set_epi16(0x3FF,0x3FF,0x3FF,0x3FF,0x3FF,0x3FF,0x3FF,0x3FF);

    const __m128i mask0 = _mm_set_epi16(0,0,0,0,0,0,0,0x3FF);
    const __m128i mask1 = _mm_set_epi16(0,0,0,0,0,0,0x3FF,0);
    const __m128i mask2 = _mm_set_epi16(0,0,0,0,0,0x3FF,0,0);
    const __m128i mask3 = _mm_set_epi16(0,0,0,0,0x3FF,0,0,0);
    const __m128i mask4 = _mm_set_epi16(0,0,0,0x3FF,0,0,0,0);
    const __m128i mask5 = _mm_set_epi16(0,0,0x3FF,0,0,0,0,0);
    const __m128i mask6 = _mm_set_epi16(0,0x3FF,0,0,0,0,0,0);
    const __m128i mask7 = _mm_set_epi16(0x3FF,0,0,0,0,0,0,0);

    for(uint32_t j=0;j<NCOL;j+=16)
    {
        __m128i s  = _mm_load_si128((__m128i*)ptr);      // load 8 16-bit values
        __m128i s2 = _mm_load_si128((__m128i*)(ptr+8));  // load 8 16-bit values

        __m128i a = _mm_and_si128(s, mask0);
        a = _mm_or_si128(a, _mm_srli_epi64(_mm_and_si128(s, mask1), 6));
        a = _mm_or_si128(a, _mm_srli_epi64(_mm_and_si128(s, mask2), 12));
        a = _mm_or_si128(a, _mm_srli_epi64(_mm_and_si128(s, mask3), 18));
        a = _mm_or_si128(a, _mm_srli_si128(_mm_and_si128(s, mask4), 24/8));                     // special: shift 24 bits right, straddling the middle; fortunately just one whole-register byte shift (24/8 = 3 bytes)
        a = _mm_or_si128(a, _mm_srli_si128(_mm_srli_epi64(_mm_and_si128(s, mask5), 6), 24/8));  // special: net shift of 30 bits. first shift by 6 bits, then by 3 bytes.
        a = _mm_or_si128(a, _mm_srli_si128(_mm_srli_epi64(_mm_and_si128(s, mask6), 4), 32/8));  // special: net shift of 36 bits. first shift by 4 bits, then by 4 bytes (32 bits).
        a = _mm_or_si128(a, _mm_srli_epi64(_mm_and_si128(s, mask7), 42));
        _mm_storeu_si128((__m128i*)streamBuffer, a);

        __m128i a2 = _mm_and_si128(s2, mask0);
        a2 = _mm_or_si128(a2, _mm_srli_epi64(_mm_and_si128(s2, mask1), 6));
        a2 = _mm_or_si128(a2, _mm_srli_epi64(_mm_and_si128(s2, mask2), 12));
        a2 = _mm_or_si128(a2, _mm_srli_epi64(_mm_and_si128(s2, mask3), 18));
        a2 = _mm_or_si128(a2, _mm_srli_si128(_mm_and_si128(s2, mask4), 24/8));                     // special: shift 24 bits right, straddling the middle; fortunately just one whole-register byte shift (24/8 = 3 bytes)
        a2 = _mm_or_si128(a2, _mm_srli_si128(_mm_srli_epi64(_mm_and_si128(s2, mask5), 6), 24/8));  // special: net shift of 30 bits. first shift by 6 bits, then by 3 bytes.
        a2 = _mm_or_si128(a2, _mm_srli_si128(_mm_srli_epi64(_mm_and_si128(s2, mask6), 4), 32/8));  // special: net shift of 36 bits. first shift by 4 bits, then by 4 bytes (32 bits).
        a2 = _mm_or_si128(a2, _mm_srli_epi64(_mm_and_si128(s2, mask7), 42));
        _mm_storeu_si128((__m128i*)(streamBuffer+10), a2);

        streamBuffer += 20;
        ptr += 16;
    }
}
