I know this is an old question, but since no one has given a solution that uses the IEEE float representation, here is one.
/**
 * Convert 8-bit RGBA pixels to floats in [0, 1] by assembling the floats
 * directly from their bit patterns instead of using an int-to-float
 * conversion.  Assumes `float` is an IEEE-754 single-precision value.
 *
 * 32768.0f has the bit pattern 0x47000000: its low mantissa bits are all
 * zero and one unit in the last place is worth 1/256.  OR-ing a byte x into
 * those low bits therefore produces exactly 32768 + x/256.  Subtracting
 * 32768 leaves x/256, and multiplying by 256/255 rescales that to x/255.
 *
 * @param bytepixel   input, 4 bytes per pixel; the fourth byte is not read
 * @param floatpixel  output, 4 floats per pixel; the fourth float (A) is
 *                    always written as 1.0f
 * @param width       image width in pixels
 * @param height      image height in pixels
 *
 * Nothing is read or written unless width * height is positive.
 */
static inline void bytes_to_floats(const unsigned char *bytepixel,
                                   float *floatpixel, int width, int height)
{
    /* Use three unions instead of one to avoid pipeline stalls */
    union { float f; uint32_t i; } t, u, v, w;
    const float b = 256.f / 255.f;

    t.f = 32768.0f;

    /* Widen before multiplying so large images cannot overflow int. */
    for (long long size = (long long)width * height; size > 0; --size) {
        u.i = t.i | bytepixel[0];
        floatpixel[0] = (u.f - t.f) * b;

        v.i = t.i | bytepixel[1];
        floatpixel[1] = (v.f - t.f) * b;

        w.i = t.i | bytepixel[2];
        floatpixel[2] = (w.f - t.f) * b;

        floatpixel[3] = 1.0f; /* A */

        floatpixel += 4;
        bytepixel += 4;
    }
}
This is more than twice as fast as converting int to float on my computer (Core 2 Duo CPU).
Here is the SSE3 version of the above code, which processes 16 floats at a time. It requires bytepixel and floatpixel to be 128-bit aligned and the total size (width * height) to be a multiple of 4. Note that the built-in SSE3 int-to-float conversions will not help here, as they would still require an additional multiplication. I think this is the shortest way to do it, but if your compiler is not smart enough, you can unroll and schedule things manually.
/**
 * SIMD version of the byte-to-float conversion: each iteration turns 16
 * input bytes (4 RGBA pixels) into 16 floats in [0, 1].  Only SSE/SSE2
 * intrinsics are used.
 *
 * Each byte x is placed in the low bits of the 32-bit pattern 0x470000xx,
 * which read as a float is 32768 + x/256.  Subtracting 32768 and multiplying
 * by 256/255 then yields x/255.
 *
 * @param bytepixel   input, 4 bytes per pixel; must be 16-byte aligned
 * @param floatpixel  output, 4 floats per pixel; must be 16-byte aligned.
 *                    The fourth float of every pixel is written as 1.0f
 *                    regardless of the input byte.
 * @param width       image width in pixels
 * @param height      image height in pixels
 *
 * width * height must be a multiple of 4; any remainder pixels are not
 * converted.
 */
static inline void bytes_to_floats_sse(const unsigned char *bytepixel,
                                       float *floatpixel, int width, int height)
{
    /* Magic values */
    const __m128i zero = _mm_setzero_si128();
    /* Top byte of every 32-bit lane (bit pattern 0xff000000), written as a
     * complement so the constant stays within the range of int. */
    const __m128i alpha_mask = _mm_set1_epi32(~0x00ffffff);
    /* 0x4700 is the upper half of the bit pattern of 32768.0f. */
    const __m128i float_hi = _mm_set1_epi32(0x47004700);
    const __m128 bias = _mm_set1_ps(32768.0f);
    const __m128 scale = _mm_set1_ps(256.0f / 255.0f);

    /* Widen before multiplying so large images cannot overflow int. */
    for (long long size = (long long)width * height / 4; size > 0; --size) {
        /* Load 16 bytes and force every alpha byte to 255 so that the
         * corresponding output is 1.0f. */
        __m128i in = _mm_load_si128((const __m128i *)bytepixel);
        in = _mm_or_si128(in, alpha_mask);

        /* Zero-extend the bytes to 16-bit words ... */
        const __m128i words_lo = _mm_unpacklo_epi8(in, zero);
        const __m128i words_hi = _mm_unpackhi_epi8(in, zero);

        /* ... then pair each word with 0x4700 to form 0x470000xx and
         * reinterpret the lanes as floats. */
        const __m128 f0 = _mm_castsi128_ps(_mm_unpacklo_epi16(words_lo, float_hi));
        const __m128 f1 = _mm_castsi128_ps(_mm_unpackhi_epi16(words_lo, float_hi));
        const __m128 f2 = _mm_castsi128_ps(_mm_unpacklo_epi16(words_hi, float_hi));
        const __m128 f3 = _mm_castsi128_ps(_mm_unpackhi_epi16(words_hi, float_hi));

        /* Subtract 32768.0f, multiply by 256.0f / 255.0f, store 16 floats. */
        _mm_store_ps(floatpixel,      _mm_mul_ps(_mm_sub_ps(f0, bias), scale));
        _mm_store_ps(floatpixel + 4,  _mm_mul_ps(_mm_sub_ps(f1, bias), scale));
        _mm_store_ps(floatpixel + 8,  _mm_mul_ps(_mm_sub_ps(f2, bias), scale));
        _mm_store_ps(floatpixel + 12, _mm_mul_ps(_mm_sub_ps(f3, bias), scale));

        floatpixel += 16;
        bytepixel += 16;
    }
}
Edit: Improved accuracy by using (f + c/b) * b instead of f * b + c.
Edit: Added the SSE3 version.