I do not know of one, and I have always thought that it would be useful to have.
, ( , ), ++ . Templating the functions , , . , -
/**
 * Small dense matrix product c = a * b with compile-time dimensions.
 *
 * All element addressing goes through idx2c(row, col, ld), which is defined
 * outside this block. The shapes below assume it is the usual column-major
 * index helper (row + col * ld) -- confirm against its definition.
 *
 * @tparam Real  element type; must support Real(0), operator* and operator+=
 * @tparam l     number of rows of a and c
 * @tparam m     number of columns of a / rows of b (length of each dot product)
 * @tparam n     number of columns of b and c
 *
 * @param a  input matrix, l x m, leading dimension l
 * @param b  input matrix, m x n, leading dimension m
 * @param c  output matrix, l x n, leading dimension l; each (i, j) entry is
 *           overwritten with the corresponding dot product
 *
 * c must not overlap a or b: each result element is stored as soon as it is
 * computed, while a and b are still being read for the remaining elements.
 * When m == 0 the inner loop never runs and every entry of c is set to Real(0).
 */
template < typename Real, unsigned int l, unsigned int m, unsigned int n >
__device__ __host__
void matmul(const Real *a,
            const Real *b,
            Real *c)
{
    // Loop counters are unsigned to match the template parameters, so the
    // bounds checks compare like with like (no signed/unsigned mixing).
    for (unsigned int i = 0; i < l; ++i) {
        for (unsigned int j = 0; j < n; ++j) {
            // Accumulate row i of a against column j of b in a local, and
            // store to c only once per output element.
            Real dotprod = Real(0);
            for (unsigned int k = 0; k < m; ++k) {
                dotprod += a[idx2c(i, k, l)] * b[idx2c(k, j, m)];
            }
            c[idx2c(i, j, l)] = dotprod;
        }
    }
}
, (2x2, 3x3, 4x4, 8x8, 9x9), , , , . CUDA , , .