There is no need to mystify how a high-performance implementation of the matrix-matrix product can be achieved. In fact, we need more people who are aware of this to face future challenges in high-performance computing. Reading the paper BLIS: A Framework for Rapidly Instantiating BLAS Functionality is a good starting point for entering this topic.
So, to demystify things and answer the question (how to write a matrix-matrix product that can compete with Eigen), I extended the code posted by ggael to a total of about 400 lines. I tested it on a machine with AVX (Intel(R) Core(TM) i5-3470 CPU @ 3.20GHz). Here are some results:
g++-5.3 -O3 -DNDEBUG -std=c++11 -mavx -m64 -I ../eigen.3.2.8/ gemm.cc -lrt

lehn@heim:~/work/test_eigen$ ./a.out 500
Time taken by Eigen is: 0.0190425
Time taken by for-loop is: 0.0121688

lehn@heim:~/work/test_eigen$ ./a.out 1000
Time taken by Eigen is: 0.147991
Time taken by for-loop is: 0.0959097

lehn@heim:~/work/test_eigen$ ./a.out 1500
Time taken by Eigen is: 0.492858
Time taken by for-loop is: 0.322442

lehn@heim:~/work/test_eigen$ ./a.out 5000
Time taken by Eigen is: 18.3666
Time taken by for-loop is: 12.1023
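For reference, a matrix-matrix product performs about 2N^3 floating-point operations. Assuming t.best() reports the time of a single product (rep is 1 for all the sizes shown), the N=5000 run works out to roughly 2*5000^3/12.1 ≈ 20.7 GFLOPS for the for-loop version versus about 13.6 GFLOPS for Eigen on this machine.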
If you have FMA, you can compile with:
g++-5.3 -O3 -DNDEBUG -std=c++11 -mfma -m64 -I ../eigen.3.2.8/ -DHAVE_FMA gemm.cc -lrt
If you also want multithreading via OpenMP, additionally compile with -fopenmp.
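For example, combining the flags above, an FMA build with OpenMP enabled might look like this (same compiler and Eigen path as before; just a sketch, adjust to your setup):

g++-5.3 -O3 -DNDEBUG -std=c++11 -mfma -m64 -fopenmp -I ../eigen.3.2.8/ -DHAVE_FMA gemm.cc -lrt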
Here is the complete code, based on the ideas of the BLIS paper. It is self-contained, except that it requires the full Eigen sources (bench/BenchTimer.h is not part of the installed headers), as ggael already noted:
#include <algorithm>     // std::max
#include <cstdint>       // uintptr_t
#include <cstdlib>       // std::malloc, std::free, std::atoi
#include <iostream>
#include <type_traits>   // std::enable_if, std::common_type
#include <Eigen/Dense>
#include <bench/BenchTimer.h>

#if defined(_OPENMP)
#include <omp.h>
#endif

//-- malloc with alignment --------------------------------------------------------
// Returns an aligned address; the pointer actually returned by std::malloc is
// stored just in front of it so that free_() can recover and release it.
void *
malloc_(std::size_t alignment, std::size_t size)
{
    alignment = std::max(alignment, alignof(void *));
    size     += alignment;

    void *ptr  = std::malloc(size);
    void *ptr2 = (void *)(((uintptr_t)ptr + alignment) & ~(alignment-1));
    void **vp  = (void**) ptr2 - 1;
    *vp        = ptr;
    return ptr2;
}

void
free_(void *ptr)
{
    std::free(*((void**)ptr-1));
}

//-- Config --------------------------------------------------------------------

// SIMD-Register width in bits
// SSE:         128
// AVX/FMA:     256
// AVX-512:     512
#ifndef SIMD_REGISTER_WIDTH
#define SIMD_REGISTER_WIDTH 256
#endif

#ifdef HAVE_FMA

#   ifndef BS_D_MR
#   define BS_D_MR 4
#   endif

#   ifndef BS_D_NR
#   define BS_D_NR 12
#   endif

#   ifndef BS_D_MC
#   define BS_D_MC 256
#   endif

#   ifndef BS_D_KC
#   define BS_D_KC 512
#   endif

#   ifndef BS_D_NC
#   define BS_D_NC 4092    // must stay a multiple of NR=12
#   endif

#endif

#ifndef BS_D_MR
#define BS_D_MR 4
#endif

#ifndef BS_D_NR
#define BS_D_NR 8
#endif

#ifndef BS_D_MC
#define BS_D_MC 256
#endif

#ifndef BS_D_KC
#define BS_D_KC 256
#endif

#ifndef BS_D_NC
#define BS_D_NC 4096
#endif

// Generic fallback block sizes; vlen==0 disables the SIMD micro kernel.
template <typename T>
struct BlockSize
{
    static constexpr int MC     = 64;
    static constexpr int KC     = 64;
    static constexpr int NC     = 256;
    static constexpr int MR     = 8;
    static constexpr int NR     = 8;

    static constexpr int rwidth = 0;
    static constexpr int align  = alignof(T);
    static constexpr int vlen   = 0;

    static_assert(MC>0 && KC>0 && NC>0 && MR>0 && NR>0, "Invalid block size.");
    static_assert(MC % MR == 0, "MC must be a multiple of MR.");
    static_assert(NC % NR == 0, "NC must be a multiple of NR.");
};

template <>
struct BlockSize<double>
{
    static constexpr int MC     = BS_D_MC;
    static constexpr int KC     = BS_D_KC;
    static constexpr int NC     = BS_D_NC;
    static constexpr int MR     = BS_D_MR;
    static constexpr int NR     = BS_D_NR;

    static constexpr int rwidth = SIMD_REGISTER_WIDTH;
    static constexpr int align  = rwidth / 8;
    static constexpr int vlen   = rwidth / (8*sizeof(double));

    static_assert(MC>0 && KC>0 && NC>0 && MR>0 && NR>0, "Invalid block size.");
    static_assert(MC % MR == 0, "MC must be a multiple of MR.");
    static_assert(NC % NR == 0, "NC must be a multiple of NR.");
    static_assert(rwidth % sizeof(double) == 0, "SIMD register width not sane.");
};

//-- aux routines --------------------------------------------------------------

// Y += alpha*X for general (m x n) matrices given by pointer plus strides.
template <typename Index, typename Alpha, typename TX, typename TY>
void
geaxpy(Index m, Index n,
       const Alpha &alpha,
       const TX *X, Index incRowX, Index incColX,
       TY       *Y, Index incRowY, Index incColY)
{
    for (Index j=0; j<n; ++j) {
        for (Index i=0; i<m; ++i) {
            Y[i*incRowY+j*incColY] += alpha*X[i*incRowX+j*incColX];
        }
    }
}

// X *= alpha; alpha==0 is handled separately so stale values get overwritten.
template <typename Index, typename Alpha, typename TX>
void
gescal(Index m, Index n,
       const Alpha &alpha,
       TX *X, Index incRowX, Index incColX)
{
    if (alpha!=Alpha(0)) {
        for (Index j=0; j<n; ++j) {
            for (Index i=0; i<m; ++i) {
                X[i*incRowX+j*incColX] *= alpha;
            }
        }
    } else {
        for (Index j=0; j<n; ++j) {
            for (Index i=0; i<m; ++i) {
                X[i*incRowX+j*incColX] = Alpha(0);
            }
        }
    }
}

//-- Micro Kernel --------------------------------------------------------------

// Computes C <- beta*C + alpha*A*B for one (MR x NR) tile of C, where A and B
// point into the packed, aligned buffers.  The GCC vector extension maps the
// accumulators P onto SIMD registers.
template <typename Index, typename T>
typename std::enable_if<BlockSize<T>::vlen != 0,
         void>::type
ugemm(Index kc, T alpha,
      const T *A, const T *B,
      T beta,
      T *C, Index incRowC, Index incColC)
{
    typedef T vx __attribute__((vector_size (BlockSize<T>::rwidth/8)));

    static constexpr Index vlen = BlockSize<T>::vlen;
    static constexpr Index MR   = BlockSize<T>::MR;
    static constexpr Index NR   = BlockSize<T>::NR/vlen;   // NR in vector units

    A = (const T*) __builtin_assume_aligned (A, BlockSize<T>::align);
    B = (const T*) __builtin_assume_aligned (B, BlockSize<T>::align);

    vx P[MR*NR] = {};

    for (Index l=0; l<kc; ++l) {
        const vx *b = (const vx *)B;

        for (Index i=0; i<MR; ++i) {
            for (Index j=0; j<NR; ++j) {
                P[i*NR+j] += A[i]*b[j];
            }
        }
        A += MR;
        B += vlen*NR;
    }

    if (alpha!=T(1)) {
        for (Index i=0; i<MR; ++i) {
            for (Index j=0; j<NR; ++j) {
                P[i*NR+j] *= alpha;
            }
        }
    }

    if (beta!=T(0)) {
        for (Index i=0; i<MR; ++i) {
            for (Index j=0; j<NR; ++j) {
                const T *p = (const T *) &P[i*NR+j];
                for (Index j1=0; j1<vlen; ++j1) {
                    C[i*incRowC+(j*vlen+j1)*incColC] *= beta;
                    C[i*incRowC+(j*vlen+j1)*incColC] += p[j1];
                }
            }
        }
    } else {
        for (Index i=0; i<MR; ++i) {
            for (Index j=0; j<NR; ++j) {
                const T *p = (const T *) &P[i*NR+j];
                for (Index j1=0; j1<vlen; ++j1) {
                    C[i*incRowC+(j*vlen+j1)*incColC] = p[j1];
                }
            }
        }
    }
}

//-- Macro Kernel --------------------------------------------------------------

// Multiplies the packed block A_ (mc x kc) with the packed block B_ (kc x nc).
// Full tiles go straight to C; boundary tiles are computed in a local buffer
// first and then merged.
template <typename Index, typename T, typename Beta, typename TC>
void
mgemm(Index mc, Index nc, Index kc,
      T alpha,
      const T *A, const T *B,
      Beta beta,
      TC *C, Index incRowC, Index incColC)
{
    const Index MR  = BlockSize<T>::MR;
    const Index NR  = BlockSize<T>::NR;
    const Index mp  = (mc+MR-1) / MR;
    const Index np  = (nc+NR-1) / NR;
    const Index mr_ = mc % MR;
    const Index nr_ = nc % NR;

    #pragma omp parallel for
    for (Index j=0; j<np; ++j) {
        // buffer for boundary tiles; declared inside the loop so that each
        // OpenMP thread works on its own copy
        T C_[MR*NR];

        const Index nr = (j!=np-1 || nr_==0) ? NR : nr_;

        for (Index i=0; i<mp; ++i) {
            const Index mr = (i!=mp-1 || mr_==0) ? MR : mr_;

            if (mr==MR && nr==NR) {
                ugemm(kc, alpha,
                      &A[i*kc*MR], &B[j*kc*NR],
                      beta,
                      &C[i*MR*incRowC+j*NR*incColC],
                      incRowC, incColC);
            } else {
                ugemm(kc, alpha,
                      &A[i*kc*MR], &B[j*kc*NR],
                      T(0),
                      C_, Index(1), MR);
                gescal(mr, nr, beta,
                       &C[i*MR*incRowC+j*NR*incColC],
                       incRowC, incColC);
                geaxpy(mr, nr, T(1), C_, Index(1), MR,
                       &C[i*MR*incRowC+j*NR*incColC],
                       incRowC, incColC);
            }
        }
    }
}

//-- Packing blocks ------------------------------------------------------------

// Copies an (mc x kc) block of A into horizontal panels of height MR,
// zero-padding the last panel if mc is not a multiple of MR.
template <typename Index, typename TA, typename T>
void
pack_A(Index mc, Index kc,
       const TA *A, Index incRowA, Index incColA,
       T *p)
{
    Index MR = BlockSize<T>::MR;
    Index mp = (mc+MR-1) / MR;

    for (Index j=0; j<kc; ++j) {
        for (Index l=0; l<mp; ++l) {
            for (Index i0=0; i0<MR; ++i0) {
                Index i  = l*MR + i0;
                Index nu = l*MR*kc + j*MR + i0;
                p[nu]    = (i<mc) ? A[i*incRowA+j*incColA]
                                  : T(0);
            }
        }
    }
}

// Copies a (kc x nc) block of B into vertical panels of width NR,
// zero-padding the last panel if nc is not a multiple of NR.
template <typename Index, typename TB, typename T>
void
pack_B(Index kc, Index nc,
       const TB *B, Index incRowB, Index incColB,
       T *p)
{
    Index NR = BlockSize<T>::NR;
    Index np = (nc+NR-1) / NR;

    for (Index l=0; l<np; ++l) {
        for (Index j0=0; j0<NR; ++j0) {
            for (Index i=0; i<kc; ++i) {
                Index j  = l*NR+j0;
                Index nu = l*NR*kc + i*NR + j0;
                p[nu]    = (j<nc) ? B[i*incRowB+j*incColB]
                                  : T(0);
            }
        }
    }
}

//-- Frame routine ---------------------------------------------------------------

// C <- beta*C + alpha*A*B with explicit row/column strides for each operand,
// i.e. independent of row- or column-major storage.
template <typename Index, typename Alpha,
          typename TA, typename TB,
          typename Beta,
          typename TC>
void
gemm(Index m, Index n, Index k,
     Alpha alpha,
     const TA *A, Index incRowA, Index incColA,
     const TB *B, Index incRowB, Index incColB,
     Beta beta,
     TC *C, Index incRowC, Index incColC)
{
    typedef typename std::common_type<Alpha, TA, TB>::type  T;

    const Index MC = BlockSize<T>::MC;
    const Index NC = BlockSize<T>::NC;
    const Index MR = BlockSize<T>::MR;
    const Index NR = BlockSize<T>::NR;

    const Index KC  = BlockSize<T>::KC;
    const Index mb  = (m+MC-1) / MC;
    const Index nb  = (n+NC-1) / NC;
    const Index kb  = (k+KC-1) / KC;
    const Index mc_ = m % MC;
    const Index nc_ = n % NC;
    const Index kc_ = k % KC;

    // quick return: only C has to be scaled (checked before the allocations
    // so that the pack buffers are not leaked)
    if (alpha==Alpha(0) || k==0) {
        gescal(m, n, beta, C, incRowC, incColC);
        return;
    }

    T *A_ = (T*) malloc_(BlockSize<T>::align, sizeof(T)*(MC*KC+MR));
    T *B_ = (T*) malloc_(BlockSize<T>::align, sizeof(T)*(KC*NC+NR));

    for (Index j=0; j<nb; ++j) {
        Index nc = (j!=nb-1 || nc_==0) ? NC : nc_;

        for (Index l=0; l<kb; ++l) {
            Index kc    = (l!=kb-1 || kc_==0) ? KC : kc_;
            Beta  beta_ = (l==0) ? beta : Beta(1);

            pack_B(kc, nc,
                   &B[l*KC*incRowB+j*NC*incColB], incRowB, incColB,
                   B_);

            for (Index i=0; i<mb; ++i) {
                Index mc = (i!=mb-1 || mc_==0) ? MC : mc_;

                pack_A(mc, kc,
                       &A[i*MC*incRowA+l*KC*incColA], incRowA, incColA,
                       A_);

                mgemm(mc, nc, kc,
                      T(alpha), A_, B_, beta_,
                      &C[i*MC*incRowC+j*NC*incColC], incRowC, incColC);
            }
        }
    }
    free_(A_);
    free_(B_);
}

//------------------------------------------------------------------------------

void
myprod(double *c, const double* a, const double* b, int N)
{
    // Eigen's MatrixXd is column-major: row stride 1, column stride N.
    gemm(N, N, N, 1.0, a, 1, N, b, 1, N, 0.0, c, 1, N);
}

int
main(int argc, char* argv[])
{
    if (argc<2) {
        std::cerr << "usage: " << argv[0] << " N" << std::endl;
        return 1;
    }
    int N = std::atoi(argv[1]);

    int tries = 4;
    int rep   = std::max<int>(1, 10000000/N/N/N);

    Eigen::MatrixXd a_E = Eigen::MatrixXd::Random(N,N);
    Eigen::MatrixXd b_E = Eigen::MatrixXd::Random(N,N);
    Eigen::MatrixXd c_E(N,N);

    Eigen::BenchTimer t1, t2;

    BENCH(t1, tries, rep, c_E.noalias() = a_E*b_E );
    BENCH(t2, tries, rep, myprod(c_E.data(), a_E.data(), b_E.data(), N));

    std::cout << "Time taken by Eigen is: " << t1.best() << "\n";
    std::cout << "Time taken by for-loop is: " << t2.best() << "\n\n";
}
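Note that the gemm frame routine is layout-agnostic: it takes explicit row and column strides for each matrix, which is why myprod passes incRow=1, incCol=N for Eigen's column-major buffers. As a minimal sketch (the name myprod_rowmajor is just illustrative), calling it for row-major N x N matrices merely swaps the strides:

// Hypothetical row-major variant of myprod: for row-major storage element
// (i,j) lives at i*N + j, i.e. row stride N and column stride 1.
void
myprod_rowmajor(double *c, const double* a, const double* b, int N)
{
    gemm(N, N, N, 1.0, a, N, 1, b, N, 1, 0.0, c, N, 1);
}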