I have heard that transposing a matrix before multiplying by it speeds up the multiplication significantly because of cache locality. So I wrote a simple C++ program to test this, using row-major ordering (compiling it requires C++11 and Boost).
The results are astounding: 7.43 seconds versus 0.94 seconds. But I do not understand why it gets so much faster. Indeed, in the second version (transposing first), the multiplication code accesses data with a stride-1 pattern and has much better locality than the first version. However, in order to transpose matrix B, the data must also be accessed non-sequentially, which should cause a large number of cache misses as well. The overhead of memory allocation and data copying should also be non-negligible. So why does the second version speed up the code so significantly?
#include <iostream>
#include <vector>
#include <boost/timer/timer.hpp>
#include <random>
/// Builds a vector of pseudo-random integers.
///
/// @param size  Number of elements to generate.
/// @return      A vector of `size` values, each drawn uniformly from [0, 100].
///
/// The Mersenne Twister engine is seeded once from std::random_device, so the
/// contents differ from run to run.
std::vector<int> random_ints(size_t size)
{
    std::random_device seed_source;
    std::mt19937 generator(seed_source());
    std::uniform_int_distribution<int> pick(0, 100);

    // Allocate every slot up front, then overwrite each one in index order so
    // the values are drawn in the same sequence as an append loop would.
    std::vector<int> values(size);
    for (int& slot : values)
    {
        slot = pick(generator);
    }
    return values;
}
/// Multiplies two row-major matrices with the textbook i-j-k loop nest.
///
/// @param A  Left operand: m rows by n columns, row-major (m * n elements).
/// @param B  Right operand: n rows by p columns, row-major (n * p elements).
/// @param m  Number of rows of A and of the result.
/// @param n  Number of columns of A and rows of B (the shared dimension).
/// @param p  Number of columns of B and of the result.
/// @return   The m-by-p product A * B, row-major.
///
/// No bounds checks are performed: the caller must pass vectors that hold at
/// least the element counts listed above. The whole call is timed by the
/// boost auto_cpu_timer declared at the top of the body.
std::vector<int> matrix_multiply1(const std::vector<int>& A, const std::vector<int>& B, size_t m, size_t n, size_t p)
{
    boost::timer::auto_cpu_timer t;
    std::vector<int> C(m * p);
    for (size_t i = 0; i < m; ++i)
    {
        for (size_t j = 0; j < p; ++j)
        {
            for (size_t k = 0; k < n; ++k)
            {
                // The row stride of a row-major matrix is its column count:
                // n for A, p for B and C. As k advances, A is read
                // contiguously while B is read with a stride of p elements.
                C[i * p + j] += A[i * n + k] * B[k * p + j];
            }
        }
    }
    return C;
}
/// Multiplies two row-major matrices after first transposing the right
/// operand, so that the innermost loop reads both inputs contiguously.
///
/// @param A  Left operand: m rows by n columns, row-major (m * n elements).
/// @param B  Right operand: n rows by p columns, row-major (n * p elements).
/// @param m  Number of rows of A and of the result.
/// @param n  Number of columns of A and rows of B (the shared dimension).
/// @param p  Number of columns of B and of the result.
/// @return   The m-by-p product A * B, row-major.
///
/// No bounds checks are performed: the caller must pass vectors that hold at
/// least the element counts listed above. The boost auto_cpu_timer is declared
/// before the transpose, so the measured time includes allocating and filling
/// the transposed copy as well as the multiplication itself.
std::vector<int> matrix_multiply2(const std::vector<int>& A, const std::vector<int>& B, size_t m, size_t n, size_t p)
{
    boost::timer::auto_cpu_timer t;
    std::vector<int> C(m * p), B_transpose(n * p);
    // B_transpose is p rows by n columns (row stride n):
    // B_transpose[j][k] == B[k][j].
    for (size_t k = 0; k < n; ++k)
    {
        for (size_t j = 0; j < p; ++j)
        {
            B_transpose[j * n + k] = B[k * p + j];
        }
    }
    for (size_t i = 0; i < m; ++i)
    {
        for (size_t j = 0; j < p; ++j)
        {
            for (size_t k = 0; k < n; ++k)
            {
                // Row i of A and row j of B_transpose are both walked with
                // stride 1 as k advances.
                C[i * p + j] += A[i * n + k] * B_transpose[j * n + k];
            }
        }
    }
    return C;
}
/// Benchmark driver: multiplies one random square matrix by itself with both
/// implementations and prints the first and last element of each product so
/// the two results can be compared by eye.
int main()
{
    // Side length of the square test matrix (1 << 10 == 1024).
    const size_t dim = 1 << 10;
    std::vector<int> input = random_ints(dim * dim);

    // Straightforward multiplication.
    std::vector<int> product = matrix_multiply1(input, input, dim, dim, dim);
    std::cout << product.front() << ' ' << product.back() << std::endl;

    // Transpose-first multiplication on the same operands.
    product = matrix_multiply2(input, input, dim, dim, dim);
    std::cout << product.front() << ' ' << product.back() << std::endl;

    return 0;
}
source
share