The fastest way to decompress bits into single point floats

This is a platform-specific question. Speed is critical. What is the fastest way to unpack a byte into an array of 8 single-precision floats, so that zero bits map to 0.0f and one bits map to 1.0f?

I ended up using 8-bit masks and 7-bit shifts to decompress into 8 int32, and then an AVX instruction to convert int32 to float.

My platform is a 64-bit version of Windows running on an AVX processor (but without AVX2). Compiler: Visual Studio 2013.

Thanks.

+4
source share
4 answers

There is a catch here: without AVX2 there are no integer operations on full ymm registers (AVX only provides floating-point ymm ops), so the integer work has to be done in two xmm halves and merged with vinsertf128 at the end. Also make sure every xmm instruction uses its VEX encoding (the "v"-prefixed form) to avoid SSE/AVX transition penalties.

The idea: broadcast the byte to every dword lane, isolate a different bit in each lane, turn each non-zero lane into an all-ones mask, and finally convert the masks to 0f or 1f.

So, assuming the byte is in eax, broadcast it:

vmovd xmm0, eax
vpshufd xmm0, xmm0, 0

Extract a different bit in each lane:

; compute the high half FIRST: xmm0 must still hold the broadcast byte
; when xmm1 is derived from it (masking xmm0 in place before this read
; would leave xmm1 permanently zero)
vpand xmm1, xmm0, [high_mask]
vpand xmm0, xmm0, [low_mask]

where low_mask holds 1, 2, 4, 8 and high_mask holds 16, 32, 64, 128, one per dword lane (if you build these with _mm_set_epi32, note that its arguments are given in reverse order, highest lane first).

Compare against zero to turn each non-zero lane into an all-ones mask:

vpxor xmm2, xmm2, xmm2
vpcmpgtd xmm0, xmm0, xmm2
vpcmpgtd xmm1, xmm1, xmm2

Merge:

vinsertf128 ymm0, ymm0, xmm1, 1

Convert the masks to 0f or 1f:

vandps ymm0, ymm0, [ones]

where ones is an array of 8 floats equal to 1.0f.

And that is the whole computation — a handful of instructions, no branches, and no loads beyond the constant masks.

The same thing written with intrinsics, which compilers handle well (apart from the no-op casts they force on you). Visual Studio emits the VEX-encoded forms here, so there are no SSE/AVX transition penalties.

// broadcast the input byte to every dword lane of both halves
__m128i low = _mm_set1_epi32(mask);
__m128i high = _mm_set1_epi32(mask);
// extract bits: isolate bits 0-3 in `low` and bits 4-7 in `high`, one bit per lane
// (note both ANDs read the ORIGINAL broadcast value)
low = _mm_and_si128(low, _mm_set_epi32(8, 4, 2, 1));
high = _mm_and_si128(high, _mm_set_epi32(128, 64, 32, 16));
// form masks: any lane with a surviving bit becomes all-ones (0xFFFFFFFF), zero stays zero
low = _mm_cmpgt_epi32(low, _mm_setzero_si128());
high = _mm_cmpgt_epi32(high, _mm_setzero_si128());
// stupid no-op casts: reinterpret the int vectors as float vectors (no instructions emitted)
__m256 low2 = _mm256_castps128_ps256(_mm_castsi128_ps(low));
__m128 high2 = _mm_castsi128_ps(high);
// merge the two xmm halves into one ymm register (high2 goes to the upper 128 bits)
__m256 total = _mm256_insertf128_ps(low2, high2, 1);
// convert to 0f or 1f: all-ones AND 1.0f-bit-pattern = 1.0f, zero stays 0.0f
total = _mm256_and_ps(total, _mm256_set1_ps(1.0f));

GCC compiles this a little differently: it uses vbroadcastss for the set1 (instead of vmovd + vpshufd), which bounces the value through memory because the source is an int rather than a float.

With AVX2 the whole thing would be simpler:

// AVX2 version: everything stays in one ymm register.
__m256i x = _mm256_set1_epi32(mask);                     // broadcast byte to all 8 lanes
x = _mm256_and_si256(x, _mm256_set_epi32(128, 64, 32, 16, 8, 4, 2, 1)); // one bit per lane
x = _mm256_cmpgt_epi32(x, _mm256_setzero_si256());       // non-zero lane -> all-ones mask
x = _mm256_and_si256(x, _mm256_set1_epi32(0x3F800000));  // 0x3F800000 is the bit pattern of 1.0f
return _mm256_castsi256_ps(x);
+1

Have you considered a lookup table? A full table for all 2^8 byte values is possible, but you can get away with just 2^4 = 16 precomputed entries by processing the byte one nibble at a time.

So you keep 16 precomputed 4-float rows and assemble each result with 2 copies of 4 floats each (one per nibble).

The code would look something like this:

unsigned char myByte; // input byte (pattern to create floats)
// 16-entry nibble table: row i holds the 4 floats for nibble value i
// (only the first and last rows are shown; the middle 14 are elided)
float preprocessingArrays[16][4] = {
    { 0.0f, 0.0f, 0.0f, 0.0f }, // 0000
    // ...
    { 1.0f, 1.0f, 1.0f, 1.0f }  // 1111
};

float result[8];
// NOTE(review): the high nibble fills result[0..3] and the low nibble fills
// result[4..7] — confirm this matches the intended bit-to-float ordering
// (the other answers map bit 0 to float 0).
std::memcpy(&result[0], &preprocessingArrays[myByte >> 4][0], 16);
std::memcpy(&result[4], &preprocessingArrays[myByte & 15][0], 16);
// 16 = platform-specific -> floats should be 32bits -> 4bytes * 4 floats = 16

As a variation, you could drop the shift and AND entirely and use a single memcpy from a full 2^8-entry table indexed directly by the byte, trading memory for one fewer copy.

In plain C(++) this should beat setting the 8 floats one by one, since it is just two 16-byte copies the compiler can optimize well. With AVX you might go further and fetch all 8 floats with a single 256-bit load from a full 2^8-entry table (32 bytes per entry) — but that is speculation; measure it.

I cannot say whether this beats the SIMD answers without benchmarking, but it is trivial to write and test :)

+3
// Unpack the 8 bits of byteIn into 8 floats: bit i -> floatOut[i],
// with a 0 bit producing 0.0f and a 1 bit producing 1.0f
// (bit 0 is the least significant bit).
//
// Note: `restrict` is a C99 keyword and does not exist in C++; the
// asker's compiler (MSVC) and gcc/clang all accept the `__restrict`
// extension instead, which promises floatOut does not alias anything
// else so the stores can be reordered/vectorized freely.
void byteToFloat(const uint8_t                   byteIn,
                       float *const __restrict floatOut)
{
     floatOut[0]=(byteIn&0x01)?1.0f:0.0f;
     floatOut[1]=(byteIn&0x02)?1.0f:0.0f;
     floatOut[2]=(byteIn&0x04)?1.0f:0.0f;
     floatOut[3]=(byteIn&0x08)?1.0f:0.0f;
     floatOut[4]=(byteIn&0x10)?1.0f:0.0f;
     floatOut[5]=(byteIn&0x20)?1.0f:0.0f;
     floatOut[6]=(byteIn&0x40)?1.0f:0.0f;
     floatOut[7]=(byteIn&0x80)?1.0f:0.0f;
}

On x86-64 Intel and AMD processors, the compiler can lower each of these ternaries to a conditional move (cmov), so this code can run branch-free:

http://en.wikipedia.org/wiki/Branch_predication

0

Expanding on @RippeR's answer, here is another table-driven variant.

Sketch:

switch(theChar){
 break; case   0: result[0] = 0; ... result[7] = 0;
 break; case   1: result[0] = 0; ... result[7] = 1;
 ...
 break; case 255: result[0] = 1; ... result[7] = 1;
}

This is verbose code, but you can get a preprocessor to help you write it.

The reason this could be faster is because the switch should turn into a jump table, and the moves should be pretty well optimized.

ADDED: if you are wondering how a preprocessor can help, here's what:

#define FOO(x,i) result[i] = !!((x) & (1<<(i)))
#define BAR(x) break; case x: FOO(x,0);FOO(x,1); ... FOO(x,7)
switch(theChar){
 BAR(0);
 BAR(1);
 ...
 BAR(255);
}
-2
source

All Articles