The most efficient way to loop through your own matrix

I create some functions to do things like the "divided sum" of a negative and a positive number, cahan, pairwise and so on, where it doesn’t matter which order I take from the matrix, for example:

template <typename T, int R, int C> inline T sum(const Eigen::Matrix<T,R,C>& xs) { T sumP(0); T sumN(0); for (size_t i = 0, nRows = xs.rows(), nCols = xs.cols(); i < nRows; ++i) for (size_t j = 0; j < nCols; ++j) { if (xs(i,j)>0) sumP += xs(i,j); else if (xs(i,j)<0) //ignore 0 elements: improvement for sparse matrices I think sumN += xs(i,j); } return sumP+sumN; } 

Now, I would like to make this as efficient as possible, so my question is: would it be better to scroll through each column of each row, as indicated above, or do the opposite, as shown below:

 for (size_t i = 0, nRows = xs.rows(), nCols = xs.cols(); i < nCols; ++i) for (size_t j = 0; j < nRows; ++j) 

(I suppose it depends on the fact that the matrix elements are allocated in memory, but I could not find it in the Eigen manual).

Also, are there other alternative ways to use iterators (do they exist in Eigen?) That might be a little faster?

+9
source share
4 answers

Eigen highlights matrices in column order (Fortran) by default ( documentation ).

The fastest way to iterate over the matrix is ​​in storage order, doing it the wrong way will increase the number of misses in the cache (which, if your matrix does not match L1, will dominate the calculation time, so read the calculation time) with a cacheline / elemsize coefficient (maybe 64/8 = 8).

If your matrix fits in the L1 cache, it will not matter, but a good compiler should be able to vectorize a loop that with AVX enabled (on the new brilliant i7 core) can give you 4x speedup. (256 bit / 64 bit).

Finally, don't expect any of the Eigen built-in functions to give you speedup (I don't think there are iterators in all cases, but I could be wrong), they are just going to give you the same (very simple) code.

TL; DR: reverse iteration order, you need to quickly change the row index.

+7
source

I did some tests to check which path is faster, I got the following results (in seconds):

 12 30 3 6 23 3 

The first line iterates as suggested by @jleahy. The second line iterates as I did in my code in the question (reverse order @jleahy). The third line PlainObjectBase::data() through PlainObjectBase::data() like this for (int i = 0; i < matrixObject.size(); i++) . The remaining 3 lines repeat the same as above, but with a temporary sentence @ lucas92

I also did the same tests, but using the / if else substitution. * / for / else / (no special treatment for a sparse matrix), and I got the following (in seconds):

 10 27 3 6 24 2 

Running the tests again gave me very similar results. I used g++ 4.7.3 with -O3 . The code:

 #include <ctime> #include <iostream> #include <Eigen/Dense> using namespace std; template <typename T, int R, int C> inline T sum_kahan1(const Eigen::Matrix<T,R,C>& xs) { if (xs.size() == 0) return 0; T sumP(0); T sumN(0); T tP(0); T tN(0); T cP(0); T cN(0); T yP(0); T yN(0); for (size_t i = 0, nRows = xs.rows(), nCols = xs.cols(); i < nCols; ++i) for (size_t j = 0; j < nRows; ++j) { if (xs(j,i)>0) { yP = xs(j,i) - cP; tP = sumP + yP; cP = (tP - sumP) - yP; sumP = tP; } else if (xs(j,i)<0) { yN = xs(j,i) - cN; tN = sumN + yN; cN = (tN - sumN) - yN; sumN = tN; } } return sumP+sumN; } template <typename T, int R, int C> inline T sum_kahan2(const Eigen::Matrix<T,R,C>& xs) { if (xs.size() == 0) return 0; T sumP(0); T sumN(0); T tP(0); T tN(0); T cP(0); T cN(0); T yP(0); T yN(0); for (size_t i = 0, nRows = xs.rows(), nCols = xs.cols(); i < nRows; ++i) for (size_t j = 0; j < nCols; ++j) { if (xs(i,j)>0) { yP = xs(i,j) - cP; tP = sumP + yP; cP = (tP - sumP) - yP; sumP = tP; } else if (xs(i,j)<0) { yN = xs(i,j) - cN; tN = sumN + yN; cN = (tN - sumN) - yN; sumN = tN; } } return sumP+sumN; } template <typename T, int R, int C> inline T sum_kahan3(const Eigen::Matrix<T,R,C>& xs) { if (xs.size() == 0) return 0; T sumP(0); T sumN(0); T tP(0); T tN(0); T cP(0); T cN(0); T yP(0); T yN(0); for (size_t i = 0, size = xs.size(); i < size; i++) { if ((*(xs.data() + i))>0) { yP = (*(xs.data() + i)) - cP; tP = sumP + yP; cP = (tP - sumP) - yP; sumP = tP; } else if ((*(xs.data() + i))<0) { yN = (*(xs.data() + i)) - cN; tN = sumN + yN; cN = (tN - sumN) - yN; sumN = tN; } } return sumP+sumN; } template <typename T, int R, int C> inline T sum_kahan1t(const Eigen::Matrix<T,R,C>& xs) { if (xs.size() == 0) return 0; T sumP(0); T sumN(0); T tP(0); T tN(0); T cP(0); T cN(0); T yP(0); T yN(0); for (size_t i = 0, nRows = xs.rows(), nCols = xs.cols(); i < nCols; ++i) for (size_t j = 0; j < nRows; ++j) { T temporary = xs(j,i); if (temporary>0) { yP = temporary - cP; tP = sumP + yP; cP = (tP - sumP) - yP; sumP = tP; } else if (temporary<0) { yN = temporary - cN; tN = sumN + yN; cN = (tN - sumN) - yN; sumN = tN; } } return sumP+sumN; } template <typename T, int R, int C> inline T sum_kahan2t(const Eigen::Matrix<T,R,C>& xs) { if (xs.size() == 0) return 0; T sumP(0); T sumN(0); T tP(0); T tN(0); T cP(0); T cN(0); T yP(0); T yN(0); for (size_t i = 0, nRows = xs.rows(), nCols = xs.cols(); i < nRows; ++i) for (size_t j = 0; j < nCols; ++j) { T temporary = xs(i,j); if (temporary>0) { yP = temporary - cP; tP = sumP + yP; cP = (tP - sumP) - yP; sumP = tP; } else if (temporary<0) { yN = temporary - cN; tN = sumN + yN; cN = (tN - sumN) - yN; sumN = tN; } } return sumP+sumN; } template <typename T, int R, int C> inline T sum_kahan3t(const Eigen::Matrix<T,R,C>& xs) { if (xs.size() == 0) return 0; T sumP(0); T sumN(0); T tP(0); T tN(0); T cP(0); T cN(0); T yP(0); T yN(0); for (size_t i = 0, size = xs.size(); i < size; i++) { T temporary = (*(xs.data() + i)); if (temporary>0) { yP = temporary - cP; tP = sumP + yP; cP = (tP - sumP) - yP; sumP = tP; } else if (temporary<0) { yN = temporary - cN; tN = sumN + yN; cN = (tN - sumN) - yN; sumN = tN; } } return sumP+sumN; } template <typename T, int R, int C> inline T sum_kahan1e(const Eigen::Matrix<T,R,C>& xs) { if (xs.size() == 0) return 0; T sumP(0); T sumN(0); T tP(0); T tN(0); T cP(0); T cN(0); T yP(0); T yN(0); for (size_t i = 0, nRows = xs.rows(), nCols = xs.cols(); i < nCols; ++i) for (size_t j = 0; j < nRows; ++j) { if (xs(j,i)>0) { yP = xs(j,i) - cP; tP = sumP + yP; cP = (tP - sumP) - yP; sumP = tP; } else { yN = xs(j,i) - cN; tN = sumN + yN; cN = (tN - sumN) - yN; sumN = tN; } } return sumP+sumN; } template <typename T, int R, int C> inline T sum_kahan2e(const Eigen::Matrix<T,R,C>& xs) { if (xs.size() == 0) return 0; T sumP(0); T sumN(0); T tP(0); T tN(0); T cP(0); T cN(0); T yP(0); T yN(0); for (size_t i = 0, nRows = xs.rows(), nCols = xs.cols(); i < nRows; ++i) for (size_t j = 0; j < nCols; ++j) { if (xs(i,j)>0) { yP = xs(i,j) - cP; tP = sumP + yP; cP = (tP - sumP) - yP; sumP = tP; } else { yN = xs(i,j) - cN; tN = sumN + yN; cN = (tN - sumN) - yN; sumN = tN; } } return sumP+sumN; } template <typename T, int R, int C> inline T sum_kahan3e(const Eigen::Matrix<T,R,C>& xs) { if (xs.size() == 0) return 0; T sumP(0); T sumN(0); T tP(0); T tN(0); T cP(0); T cN(0); T yP(0); T yN(0); for (size_t i = 0, size = xs.size(); i < size; i++) { if ((*(xs.data() + i))>0) { yP = (*(xs.data() + i)) - cP; tP = sumP + yP; cP = (tP - sumP) - yP; sumP = tP; } else { yN = (*(xs.data() + i)) - cN; tN = sumN + yN; cN = (tN - sumN) - yN; sumN = tN; } } return sumP+sumN; } template <typename T, int R, int C> inline T sum_kahan1te(const Eigen::Matrix<T,R,C>& xs) { if (xs.size() == 0) return 0; T sumP(0); T sumN(0); T tP(0); T tN(0); T cP(0); T cN(0); T yP(0); T yN(0); for (size_t i = 0, nRows = xs.rows(), nCols = xs.cols(); i < nCols; ++i) for (size_t j = 0; j < nRows; ++j) { T temporary = xs(j,i); if (temporary>0) { yP = temporary - cP; tP = sumP + yP; cP = (tP - sumP) - yP; sumP = tP; } else { yN = temporary - cN; tN = sumN + yN; cN = (tN - sumN) - yN; sumN = tN; } } return sumP+sumN; } template <typename T, int R, int C> inline T sum_kahan2te(const Eigen::Matrix<T,R,C>& xs) { if (xs.size() == 0) return 0; T sumP(0); T sumN(0); T tP(0); T tN(0); T cP(0); T cN(0); T yP(0); T yN(0); for (size_t i = 0, nRows = xs.rows(), nCols = xs.cols(); i < nRows; ++i) for (size_t j = 0; j < nCols; ++j) { T temporary = xs(i,j); if (temporary>0) { yP = temporary - cP; tP = sumP + yP; cP = (tP - sumP) - yP; sumP = tP; } else { yN = temporary - cN; tN = sumN + yN; cN = (tN - sumN) - yN; sumN = tN; } } return sumP+sumN; } template <typename T, int R, int C> inline T sum_kahan3te(const Eigen::Matrix<T,R,C>& xs) { if (xs.size() == 0) return 0; T sumP(0); T sumN(0); T tP(0); T tN(0); T cP(0); T cN(0); T yP(0); T yN(0); for (size_t i = 0, size = xs.size(); i < size; i++) { T temporary = (*(xs.data() + i)); if (temporary>0) { yP = temporary - cP; tP = sumP + yP; cP = (tP - sumP) - yP; sumP = tP; } else { yN = temporary - cN; tN = sumN + yN; cN = (tN - sumN) - yN; sumN = tN; } } return sumP+sumN; } int main() { Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic> test = Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic>::Random(10000,10000); cout << "start" << endl; int now; now = time(0); sum_kahan1(test); cout << time(0) - now << endl; now = time(0); sum_kahan2(test); cout << time(0) - now << endl; now = time(0); sum_kahan3(test); cout << time(0) - now << endl; now = time(0); sum_kahan1t(test); cout << time(0) - now << endl; now = time(0); sum_kahan2t(test); cout << time(0) - now << endl; now = time(0); sum_kahan3t(test); cout << time(0) - now << endl; now = time(0); sum_kahan1e(test); cout << time(0) - now << endl; now = time(0); sum_kahan2e(test); cout << time(0) - now << endl; now = time(0); sum_kahan3e(test); cout << time(0) - now << endl; now = time(0); sum_kahan1te(test); cout << time(0) - now << endl; now = time(0); sum_kahan2te(test); cout << time(0) - now << endl; now = time(0); sum_kahan3te(test); cout << time(0) - now << endl; return 0; } 
+17
source

I noticed that the code is equivalent to the sum of all the entries in the matrix, i.e. you could just do this:

 return xs.sum(); 

I would suggest that it would be better, because it is just one pass, and, in addition, Eigen must “know” how to arrange passages for optimal performance.

If, however, you want to save two passes, you can express this using coefficient reduction mechanisms, for example:

 return (xs.array() > 0).select(xs, 0).sum() + (xs.array() < 0).select(xs, 0).sum(); 

which uses a selection of boolean coefficients to select positive and negative entries. I don’t know if it will surpass manual convolutions, but theoretically coding it allows Eigen (and the compiler) to learn more about your intentions and, possibly, to improve the results.

+3
source

Try saving xs (i, j) inside a temporary variable inside a loop so that you only call the function once.

+1
source

All Articles