I have implemented bilinear interpolation in a tight loop and am trying to optimize it with SSE, but I get no speedup from it.
Here is the code. The non-SIMD version uses a simple vector structure, which can be defined as `struct Vec3f { float x, y, z; }` with multiplication and addition operators implemented:
#ifdef USE_SIMD
// SSE variant of the bilinear blend, written with MSVC inline assembly.
// NOTE(review): the matching #else / #endif for this conditional is not
// visible in this excerpt.

// Fetch the four texels at the corners (x1|x2, y1|y2) from the row-major
// pixel cache (row stride is size.x).
const Color c11 = pixelCache[y1 * size.x + x1];
const Color c12 = pixelCache[y2 * size.x + x1];
const Color c22 = pixelCache[y2 * size.x + x2];
const Color c21 = pixelCache[y1 * size.x + x2];

// Unpack every texel into a 16-byte-aligned float[4] laid out as
// { 1.0, B, G, R }. The alignment is required because the arrays are
// accessed with movaps below.
__declspec(align(16)) float mc11[4] = { 1.0, c11.GetB(), c11.GetG(), c11.GetR() };
__declspec(align(16)) float mc12[4] = { 1.0, c12.GetB(), c12.GetG(), c12.GetR() };
__declspec(align(16)) float mc22[4] = { 1.0, c22.GetB(), c22.GetG(), c22.GetR() };
__declspec(align(16)) float mc21[4] = { 1.0, c21.GetB(), c21.GetG(), c21.GetR() };

// scalars in vector form for SSE
// Each corner's weight is the product of the distances from (x, y) to the
// opposite corner. No division by (x2-x1)*(y2-y1) is performed here.
// NOTE(review): presumably x2 == x1 + 1 and y2 == y1 + 1 so the weights
// already sum to 1 -- confirm against the caller.
const float s11 = (x2-x)*(y2-y);
const float s12 = (x2-x)*(y-y1);
const float s22 = (x-x1)*(y-y1);
const float s21 = (x-x1)*(y2-y);

// Replicate each weight across the three colour slots; slot 0 is 1.0 to
// line up with the 1.0 stored in slot 0 of the colour arrays.
__declspec(align(16)) float ms11[4] = {1.0, s11, s11, s11};
__declspec(align(16)) float ms12[4] = {1.0, s12, s12, s12};
__declspec(align(16)) float ms22[4] = {1.0, s22, s22, s22};
__declspec(align(16)) float ms21[4] = {1.0, s21, s21, s21};

__asm
{
    // Load the four colour vectors (xmm0-3) and the four weight vectors (xmm4-7).
    movaps xmm0, mc11
    movaps xmm1, mc12
    movaps xmm2, mc22
    movaps xmm3, mc21
    movaps xmm4, ms11
    movaps xmm5, ms12
    movaps xmm6, ms22
    movaps xmm7, ms21
    // Scale every colour by its weight, element-wise.
    mulps xmm0, xmm4
    mulps xmm1, xmm5
    mulps xmm2, xmm6
    mulps xmm3, xmm7
    // Accumulate the four weighted colours into xmm0.
    addps xmm0, xmm1
    addps xmm0, xmm2
    addps xmm0, xmm3
    // Write the blended result back over mc11. Element 0 ends up as
    // 1*1 summed four times (4.0); elements 1..3 hold the blended B, G, R.
    movaps mc11, xmm0
}
Rearranging the asm code to reuse registers (ending up with only three xmm registers) had no effect. I also tried using intrinsics:
// perform bilinear interpolation const Vec3f c11 = toFloat(pixelCache[y1 * size.x + x1]); const Vec3f c12 = toFloat(pixelCache[y2 * size.x + x1]); const Vec3f c22 = toFloat(pixelCache[y2 * size.x + x2]); const Vec3f c21 = toFloat(pixelCache[y1 * size.x + x2]); // scalars in vector form for SSE const float s11 = (x2-x)*(y2-y); const float s12 = (x2-x)*(y-y1); const float s22 = (x-x1)*(y-y1); const float s21 = (x-x1)*(y2-y); __m128 mc11 = _mm_set_ps(1.f, c11.b, c11.g, c11.r); __m128 mc12 = _mm_set_ps(1.f, c12.b, c12.g, c12.r); __m128 mc22 = _mm_set_ps(1.f, c22.b, c22.g, c22.r); __m128 mc21 = _mm_set_ps(1.f, c21.b, c21.g, c21.r); __m128 ms11 = _mm_set_ps(1.f, s11, s11, s11); __m128 ms12 = _mm_set_ps(1.f, s12, s12, s12); __m128 ms22 = _mm_set_ps(1.f, s22, s22, s22); __m128 ms21 = _mm_set_ps(1.f, s21, s21, s21); mc11 = _mm_mul_ps(mc11, ms11); mc12 = _mm_mul_ps(mc12, ms12); mc22 = _mm_mul_ps(mc22, ms22); mc21 = _mm_mul_ps(mc21, ms21); mc11 = _mm_add_ps(mc11, mc12); mc11 = _mm_add_ps(mc11, mc22); mc11 = _mm_add_ps(mc11, mc21); Vec3f colour; _mm_storeu_ps(colour.array, mc11);
That was to no avail either. Am I missing something, or is it impossible to gain any extra speed here?