I am trying to pack 10-bit pixels into a continuous stream of bytes using SIMD instructions. The code below does this “in principle”, but the SIMD version is slower than the scalar version.
The problem is that I cannot find efficient gather/scatter operations to load and store the registers.
Any suggestions for improvement?
// SIMD_test.cpp : Defines the entry point for the console application. // #include "stdafx.h" #include "Windows.h" #include <tmmintrin.h> #include <stdint.h> #include <string.h> // reference non-SIMD implementation that "works" // 4 uint16 at a time as input, and 5 uint8 as output per loop iteration void packSlow(uint16_t* ptr, uint8_t* streamBuffer, uint32_t NCOL) { for(uint32_t j=0;j<NCOL;j+=4) { streamBuffer[0] = (uint8_t)(ptr[0]); streamBuffer[1] = (uint8_t)(((ptr[0]&0x3FF)>>8) | ((ptr[1]&0x3F) <<2)); streamBuffer[2] = (uint8_t)(((ptr[1]&0x3FF)>>6) | ((ptr[2]&0x0F) <<4)); streamBuffer[3] = (uint8_t)(((ptr[2]&0x3FF)>>4) | ((ptr[3]&0x03) <<6)); streamBuffer[4] = (uint8_t)((ptr[3]&0x3FF)>>2) ; streamBuffer += 5; ptr += 4; } } // poorly written SIMD implementation. Attempts to do the same // as the packSlow, but 8 iterations at a time void packFast(uint16_t* ptr, uint8_t* streamBuffer, uint32_t NCOL) { const __m128i maska = _mm_set_epi16(0x3FF,0x3FF,0x3FF,0x3FF,0x3FF,0x3FF,0x3FF,0x3FF); const __m128i maskb = _mm_set_epi16(0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F); const __m128i maskc = _mm_set_epi16(0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F); const __m128i maskd = _mm_set_epi16(0x03,0x03,0x03,0x03,0x03,0x03,0x03,0x03); for(uint32_t j=0;j<NCOL;j+=4*8) { _mm_prefetch((const char*)(ptr+j),_MM_HINT_T0); } for(uint32_t j=0;j<NCOL;j+=4*8) { // this "fetch" stage is costly. 
Each term takes 2 cycles __m128i ptr0 = _mm_set_epi16(ptr[0],ptr[4],ptr[8],ptr[12],ptr[16],ptr[20],ptr[24],ptr[28]); __m128i ptr1 = _mm_set_epi16(ptr[1],ptr[5],ptr[9],ptr[13],ptr[17],ptr[21],ptr[25],ptr[29]); __m128i ptr2 = _mm_set_epi16(ptr[2],ptr[6],ptr[10],ptr[14],ptr[18],ptr[22],ptr[26],ptr[30]); __m128i ptr3 = _mm_set_epi16(ptr[3],ptr[7],ptr[11],ptr[15],ptr[19],ptr[23],ptr[27],ptr[31]); // I think this part is fairly well optimized __m128i streamBuffer0 = ptr0; __m128i streamBuffer1 = _mm_or_si128(_mm_srl_epi16 (_mm_and_si128 (ptr0 , maska), _mm_set_epi32(0, 0, 0,8)) , _mm_sll_epi16 (_mm_and_si128 (ptr1 , maskb) , _mm_set_epi32(0, 0, 0,2))); __m128i streamBuffer2 = _mm_or_si128(_mm_srl_epi16 (_mm_and_si128 (ptr1 , maska), _mm_set_epi32(0, 0, 0,6)) , _mm_sll_epi16 (_mm_and_si128 (ptr2 , maskc) , _mm_set_epi32(0, 0, 0,4))); __m128i streamBuffer3 = _mm_or_si128(_mm_srl_epi16 (_mm_and_si128 (ptr2 , maska), _mm_set_epi32(0, 0, 0,4)) , _mm_sll_epi16 (_mm_and_si128 (ptr3 , maskd) , _mm_set_epi32(0, 0, 0,6))); __m128i streamBuffer4 = _mm_srl_epi16 (_mm_and_si128 (ptr3 , maska), _mm_set_epi32(0, 0, 0,2)) ; // this again is terribly slow. ~2 cycles per byte output for(int j=15;j>=0;j-=2) { streamBuffer[0] = streamBuffer0.m128i_u8[j]; streamBuffer[1] = streamBuffer1.m128i_u8[j]; streamBuffer[2] = streamBuffer2.m128i_u8[j]; streamBuffer[3] = streamBuffer3.m128i_u8[j]; streamBuffer[4] = streamBuffer4.m128i_u8[j]; streamBuffer += 5; } ptr += 32; } } int _tmain(int argc, _TCHAR* argv[]) { uint16_t pixels[512]; uint8_t packed1[512*10/8]; uint8_t packed2[512*10/8]; for(int i=0;i<512;i++) { pixels[i] = i; } LARGE_INTEGER t0,t1,t2; QueryPerformanceCounter(&t0); for(int k=0;k<1000;k++) packSlow(pixels,packed1,512); QueryPerformanceCounter(&t1); for(int k=0;k<1000;k++) packFast(pixels,packed2,512); QueryPerformanceCounter(&t2); printf("%d %d\n",t1.QuadPart-t0.QuadPart,t2.QuadPart-t1.QuadPart); if (memcmp(packed1,packed2,sizeof(packed1))) { printf("failed\n"); } return 0; }