(Vec4 x Mat4x4) using SIMD and enhancements

Question

(Vec4 x Mat4x4) using SIMD and enhancements

I am writing a complex simulation program, and she believes that the longest routine is multiplying a four-vector (float4) with a 4x4 matrix. I need to run this program on several computers that are more or less old. This is why I tried to test the SIMD capabilities of such operations in the following code:

//#include <xmmintrin.h> // SSE //#include <pmmintrin.h> // SSE3 //#include <nmmintrin.h> // SSE4.2 #include <immintrin.h> // AVX #include <iostream> #include <ctime> #include <string> using namespace std; // 4-vector. typedef struct { float x; float y; float z; float w; }float4; // typedef to simplify the pointer of function notation. typedef void(*Function)(float4&,const float4*,const float4&); float dot( const float4& in_A, const float4& in_x ) { return in_A.x*in_x.x + in_A.y*in_x.y + in_A.z*in_x.z + in_A.w*in_x.w; // 7 FLOPS } void A_times_x( float4& out_y, const float4* in_A, const float4& in_x ) { out_y.x = dot(in_A[0], in_x); // 7 FLOPS out_y.y = dot(in_A[1], in_x); // 7 FLOPS out_y.z = dot(in_A[2], in_x); // 7 FLOPS out_y.w = dot(in_A[3], in_x); // 7 FLOPS } void A_times_x_SSE( float4& out_y, const float4* in_A, const float4& in_x ) { // Load matrix A and vector x into SSE registers __m128 x = _mm_load_ps((const float*)&in_x); // load/store are almost = 0 FLOPS __m128 A0 = _mm_load_ps((const float*)(in_A + 0)); __m128 A1 = _mm_load_ps((const float*)(in_A + 1)); __m128 A2 = _mm_load_ps((const float*)(in_A + 2)); __m128 A3 = _mm_load_ps((const float*)(in_A + 3)); // Transpose the matrix and re-order the vector. _MM_TRANSPOSE4_PS( A0,A1,A2,A3 ); __m128 u1 = _mm_shuffle_ps(x,x, _MM_SHUFFLE(0,0,0,0)); __m128 u2 = _mm_shuffle_ps(x,x, _MM_SHUFFLE(1,1,1,1)); __m128 u3 = _mm_shuffle_ps(x,x, _MM_SHUFFLE(2,2,2,2)); __m128 u4 = _mm_shuffle_ps(x,x, _MM_SHUFFLE(3,3,3,3)); // Multiply each matrix row with the vector x __m128 m0 = _mm_mul_ps(A0, u1); // 4 FLOPS __m128 m1 = _mm_mul_ps(A1, u2); // 4 FLOPS __m128 m2 = _mm_mul_ps(A2, u3); // 4 FLOPS __m128 m3 = _mm_mul_ps(A3, u4); // 4 FLOPS // Using HADD, we add four floats at a time __m128 sum_01 = _mm_add_ps(m0, m1); // 4 FLOPS __m128 sum_23 = _mm_add_ps(m2, m3); // 4 FLOPS __m128 result = _mm_add_ps(sum_01, sum_23); // 4 FLOPS // Finally, store the result _mm_store_ps((float*)&out_y, result); } void A_times_x_SSE3( float4& out_y, const float4* in_A, const float4& in_x ) { // Should be 4 (SSE) x 4 (ALU) = 16 times faster than scalar. // Load matrix A and vector x into SSE registers __m128 x = _mm_load_ps((const float*)&in_x); // load/store are almost = 0 FLOPS __m128 A0 = _mm_load_ps((const float*)(in_A + 0)); __m128 A1 = _mm_load_ps((const float*)(in_A + 1)); __m128 A2 = _mm_load_ps((const float*)(in_A + 2)); __m128 A3 = _mm_load_ps((const float*)(in_A + 3)); // Multiply each matrix row with the vector x __m128 m0 = _mm_mul_ps(A0, x); // 4 FLOPS __m128 m1 = _mm_mul_ps(A1, x); // 4 FLOPS __m128 m2 = _mm_mul_ps(A2, x); // 4 FLOPS __m128 m3 = _mm_mul_ps(A3, x); // 4 FLOPS // Using HADD, we add four floats at a time __m128 sum_01 = _mm_hadd_ps(m0, m1); // 4 FLOPS __m128 sum_23 = _mm_hadd_ps(m2, m3); // 4 FLOPS __m128 result = _mm_hadd_ps(sum_01, sum_23); // 4 FLOPS // Finally, store the result _mm_store_ps((float*)&out_y, result); } void A_times_x_SSE4( float4& out_y, const float4* in_A, const float4& in_x ) // 28 FLOPS { // Should be 4 (SSE) x 4 (ALU) = 16 times faster than scalar. // Load matrix A and vector x into SSE registers __m128 x = _mm_load_ps((const float*)&in_x); // load/store are almost = 0 FLOPS __m128 A0 = _mm_load_ps((const float*)(in_A + 0)); __m128 A1 = _mm_load_ps((const float*)(in_A + 1)); __m128 A2 = _mm_load_ps((const float*)(in_A + 2)); __m128 A3 = _mm_load_ps((const float*)(in_A + 3)); // Multiply each matrix row with the vector x __m128 m0 = _mm_dp_ps(A0, x, 0xFF); // 4 FLOPS __m128 m1 = _mm_dp_ps(A1, x, 0xFF); // 4 FLOPS __m128 m2 = _mm_dp_ps(A2, x, 0xFF); // 4 FLOPS __m128 m3 = _mm_dp_ps(A3, x, 0xFF); // 4 FLOPS // Using HADD, we add four floats at a time __m128 mov_01 = _mm_movelh_ps(m0, m1); // 4 FLOPS __m128 mov_23 = _mm_movelh_ps(m2, m3); // 4 FLOPS __m128 result = _mm_shuffle_ps(mov_01, mov_23, _MM_SHUFFLE(2, 0, 2, 0)); // 4 FLOPS // Finally, store the result _mm_store_ps((float*)&out_y, result); } void A_times_x_AVX( float4& out_y, const float4* in_A, const float4& in_x ) { // Load matrix A and vector x into SSE registers __m128 x = _mm_load_ps((const float*)&in_x); // load/store are almost = 0 FLOPS __m256 xx = _mm256_castps128_ps256(x); xx = _mm256_insertf128_ps(xx,x,1); __m256 A0 = _mm256_load_ps((const float*)(in_A + 0)); __m256 A2 = _mm256_load_ps((const float*)(in_A + 2)); // Multiply each matrix row with the vector x __m256 m0 = _mm256_mul_ps(A0, xx); // 4 FLOPS __m256 m2 = _mm256_mul_ps(A2, xx); // 4 FLOPS // Using HADD, we add four floats at a time __m256 sum_00 = _mm256_hadd_ps(m0, m2); // 4 FLOPS /*__m128 sum_10 = _mm256_extractf128_ps(sum_00,0); __m128 sum_01 = _mm256_extractf128_ps(sum_00,1); __m128 result = _mm_hadd_ps(sum_10, sum_01); // 4 FLOPS // Finally, store the result _mm_store_ps((float*)&out_y, result);*/ // Finally, store the result (no temp variable: direct HADD, this avoid to copy from ALU128 to ALU256) _mm_store_ps((float*)&out_y, _mm_hadd_ps(_mm256_extractf128_ps(sum_00,0), _mm256_extractf128_ps(sum_00,1))); } void test_function ( Function f, string simd, unsigned int imax ) { float4 Y; float4 X1 = {0.5,1,0.2,0.7}; float4 X2 = {0.7,1,0.2,0.5}; float4 X3 = {0.5,0.2,1,0.7}; float4 X4 = {1,0.7,0.2,0.5}; float4 A[4] = {{0.5,1,0.2,0.7}, {0.6,0.4,0.1,0.8}, {0.3,0.8,0.2,0.5}, {1,0.4,0.6,0.9}}; clock_t tstart = clock(); for( unsigned int i=0 ; i<imax ; i++ ) for( unsigned long int j=0 ; j<250000000 ; j++ ) // Avoid for loop over long long, it is 2 times slower ! { // Function pointer give a real call, whether the direct // call is inlined and thus results are overestimated. f( Y,A,X1 ); f( Y,A,X2 ); f( Y,A,X3 ); f( Y,A,X4 ); } clock_t tend = clock(); double diff = static_cast<double>(tend - tstart) * 1e-3; cout << "Time (" << simd << ") = " << diff << " s" << endl; cout << "Nops (" << simd << ") = " << (double) imax << ".10^9" << endl; cout << "Power (" << simd << ") = " << (double) imax * 28. / diff << " GFLOPS" << endl; // 28 FLOPS for std. cout << endl; } int main ( int argc, char *argv[] ) { test_function ( &A_times_x ,"std" , 1 ); test_function ( &A_times_x_SSE ,"SSE" , 2 ); test_function ( &A_times_x_SSE3,"SSE3", 3 ); test_function ( &A_times_x_SSE4,"SSE4", 1 ); test_function ( &A_times_x_AVX ,"AVX" , 3 ); return 0; }

I have some problems with improvements for such a problem. When I run the code, I get the following results (Intel Core i5 4670K, 3.4GHz, Haswell, Codeblock + MinGW compiler using -O2 -march = corei7-avx):

 Time (std) = 6.287 s Nops (std) = 1.10^9 Power (std) = 4.45363 GFLOPS Time (SSE) = 6.661 s Nops (SSE) = 2.10^9 Power (SSE) = 8.40715 GFLOPS Time (SSE3) = 8.361 s Nops (SSE3) = 3.10^9 Power (SSE3) = 10.0466 GFLOPS Time (SSE4) = 6.131 s Nops (SSE4) = 1.10^9 Power (SSE4) = 4.56695 GFLOPS Time (AVX) = 8.767 s Nops (AVX) = 3.10^9 Power (AVX) = 9.58138 GFLOPS

My questions are as follows:

Is it possible to improve performance / speed up work? Must be x4 (maximum) for SSE and x8 for AVX.
Why is AVX not faster than SSE3?

For those who say: “stop using your materials, use the Intel Math core library”, I reply: “I wouldn’t do this because I need a small executable and I need to use SIMD for this particular case, not another place "; -)

+8

c ++ matrix avx simd sse3

Asohan Jun 26 '15 at 15:01

source share

1 answer

Z boson · Accepted Answer · 2015-06-27T22:12:24+0000

Is it possible to improve performance / speed up work? It should be x4 (maximum) for SSE and x8 for AVX.

Yes, I have explained this in detail in the efficient-4x4-matrix-vector-multipication-with-sse-horizontal-add-and-dot-product .

An effective method for multiplying a 4x4 M matrix by a column vector u giving v = M u :

 v = u1*col1 + u2*col2 + u3*col3 + u4*col4.

this requires you to keep the column vectors. For example, suppose you have the following 4x4 A matrix:

  0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15

then you save it as

 0 4 8 c 1 5 9 d 2 6 ae 3 7 bf

conversely, if you want the vector uT times matrix M row to give vT = uT*M , then you want

 vT = uT1*row1 + uT2*row2 + uT3*row3 + uT4*row4.

in which case you should pack the rows, not the columns.

So, to optimize your code in your A_times_x_SSE function A_times_x_SSE comment out the line

  _MM_TRANSPOSE4_PS( A0,A1,A2,A3 );

and this function will be faster than your other functions using horizontal operations.

Horizontal SIMD operations are ineffective. They are in some, because they are not SIMD, because they are divided into scalar microoperations, therefore they are not parallel. They are useful only when it is inconvenient to collect your data in a friendly form for SIMD. For example, if you cannot store M columns and have only rows.

Why is AVX not faster than SSE3?

To do this with AVX, you need to work with two 4x4 matrices at the same time, and also pack the matrices so that they are AVX friendly. Suppose now that in addition to the above matrix A , you have another matrix B :

 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31

The best way for package A and B for AVX is

 col1A col1B col2A col2B col3A col3B col4A col4B 0 4 8 12 16 20 24 28 1 5 9 13 17 21 25 29 2 6 10 14 18 22 26 30 3 7 11 15 19 23 27 31

Suppose you have two vectors u = {0,1,2,3} and v = {4,5,6,7) , and you want y = Au and z = Bv , and then using AVX:

 c1 = col1A col1B = {0 4 8 12 16 20 24 28} = _mm256_load_ps c2 = col2A col2B = {1 5 9 13 17 21 25 29} c3 = col3A col3B = {2 6 10 14 18 22 26 30} c4 = col4A col4B = {3 7 11 15 19 23 27 31} broad1 = {0,0,0,0,4,4,4,4} broad2 = {1,1,1,1,5,5,5,5} broad3 = {2,2,2,2,6,6,6,6} broad4 = {3,3,3,3,7,7,7,7} w = broad1*c1 + broad2*c2 + broad3*c + broad4*c4; //w = {y1, y2, y3, y4, z1, z2, z3, z4};

Thus, the resulting 8-wide vector w contains 4-vectors y and z . This is the most efficient method with AVX. If you have fixed matrices and variable vectors, you can work in a loop, then packing before the loop at run time will be negligible for a large loop. If you know that matrices are fixed at compile time, you can pack them at compile time.

(Vec4 x Mat4x4) using SIMD and enhancements

More articles: