Why is the calculation time of matrix multiplication not constant?

I made a program that does matrix multiplication (without optimization).

 for (i = 0; i < a_r; i++)
     for (j = 0; j < b_c; j++)
         for (k = 0; k < a_c; k++)
             c[i][j] = c[i][j] + a[i][k] * b[k][j];

Depending on how I allocate the memory, the calculation time is different.

It is important to note three points:

  • I record only the calculation time, not the allocation time (and the allocation time is negligible compared to the calculation time).
  • I initialize the matrices with random numbers before calculating. I use large matrices (2500 × 2500 int); I chose this size so that the matrices use RAM but do not cause swapping.
  • The phenomenon disappears when the matrices are small enough not to use much RAM.

I am testing this program with three different allocation strategies:

Test 1: global (static) allocation

 int a[2500][2500], b[2500][2500], c[2500][2500];
 main { matrix multiplication a*b=c }

The calculation time is fairly constant (about 41 s).

Test 2: dynamic allocation (array of row pointers)

 main {
     int **a, **b, **c;
     a = (int **) malloc(sizeof(int *) * 2500);
     for (i = 0; i < a_c; i++)
         a[i] = (int *) malloc(sizeof(int) * 2500);
     ... (same for b and c)
     matrix multiplication a*b=c
 }

When I execute the program several times in a row, I get these runtimes: 260 seconds, 180, 110, 110, ... If I wait about 5 seconds and run the program again, I get the same pattern.

Test 3: dynamic allocation (one contiguous block)

 main {
     int *a, *b, *c;
     a = (int *) malloc(sizeof(int) * 2500 * 2500);
     ... (same for b and c)
     matrix multiplication a*b=c
 }

The calculation time is quite constant (about 44 s).
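The multiplication itself is elided in the test 3 snippet. For completeness, here is a minimal sketch (my reconstruction, not the original code) of how a single contiguous block can be multiplied using row-major i*N + j indexing:

 #include <stdlib.h>

 #define N 2500

 /* c = a*b, where each matrix is one contiguous N*N block of ints,
  * stored row-major and indexed as m[i*N + j]. */
 static void matmul(const int *a, const int *b, int *c)
 {
     int i, j, k;
     for (i = 0; i < N; i++)
         for (j = 0; j < N; j++)
             for (k = 0; k < N; k++)
                 c[i*N + j] += a[i*N + k] * b[k*N + j];
 }

 int main(void)
 {
     /* calloc keeps the sketch well-defined; the real program fills
      * a and b with random values before multiplying. */
     int *a = calloc((size_t)N * N, sizeof(int));
     int *b = calloc((size_t)N * N, sizeof(int));
     int *c = calloc((size_t)N * N, sizeof(int));

     if (a == NULL || b == NULL || c == NULL)
         return 1;
     matmul(a, b, c);
     free(a);
     free(b);
     free(c);
     return 0;
 }

The address of each element is computed arithmetically, so there is no pointer chasing between rows.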

I think test 2 is less efficient because of how the data is laid out in memory, as explained in this article (the site is down today) or in this question: some ways of traversing memory are more efficient than others, so some ways of allocating memory give a more efficient program.

But in test 2, I don't know why the program gets faster from one run to the next. Does anyone have an idea that explains this phenomenon? Thanks in advance.

PS: I ran these tests on an Intel Xeon E5-2625 with CentOS 6.3 and Linux kernel 3.11.1. EDIT: frequency scaling is disabled on my machine; the frequency is constant.

Here is the code:

 #include <stdlib.h>
 #include <stdio.h>
 #include <time.h>
 #include <float.h>
 #include <sys/time.h>
 #include <sys/mman.h>
 #include <sys/resource.h>

 /****************************************************
  * Return a random int between _iMin and _iMax.
  */
 int Random(int _iMin, int _iMax)
 {
     return (_iMin + (rand() % (_iMax - _iMin + 1)));
 }

 /****************************************************
  * Return the struct filled in by getrusage().
  */
 struct rusage give_ru()
 {
     struct rusage ru;
     getrusage(RUSAGE_SELF, &ru);
     return ru;
 }

 /****************************************************
  * Do a matrix multiplication a*b=c.
  * This program isn't optimized.
  */
 int main()
 {
     /*
      * Initialize variables and array sizes.
      */
     int **a, **b, **c;
     printf("\nMatrix size: 2500\n");
     int a_r = 2500, a_c = 2500, b_r = 2500, b_c = 2500;
     clock_t start, end;
     struct rusage start1, end1;
     /* time difference between start and end of the multiplication */
     double dif;
     int i, j, k;

     if (a_c != b_r) {
         printf("\ncan not multiply");
         return 1;
     }

     /*
      * Memory allocation.
      */
     start = clock();

     a = (int **) malloc(sizeof(int *) * a_r);
     if (a == NULL) { printf("Allocation of matrix a failed \n"); exit(0); }
     for (i = 0; i < a_r; i++) {
         a[i] = (int *) malloc(sizeof(int) * a_c);
         if (a[i] == NULL) { printf("Allocation of matrix a[%d] failed \n", i); exit(0); }
     }

     b = (int **) malloc(sizeof(int *) * b_r);
     if (b == NULL) { printf("Allocation of matrix b failed \n"); exit(0); }
     for (i = 0; i < b_r; i++) {
         b[i] = (int *) malloc(sizeof(int) * b_c);
         if (b[i] == NULL) { printf("Allocation of matrix b[%d] failed \n", i); exit(0); }
     }

     c = (int **) malloc(sizeof(int *) * a_r);
     if (c == NULL) { printf("Allocation of matrix c failed \n"); exit(0); }
     for (i = 0; i < a_r; i++) {
         c[i] = (int *) malloc(sizeof(int) * b_c);
         if (c[i] == NULL) { printf("Allocation of matrix c[%d] failed \n", i); exit(0); }
     }

     /* Lock the memory. */
     printf("Starting mlockall.\n");
     if (mlockall(MCL_CURRENT | MCL_FUTURE) < 0) {
         perror("mlockall");
         return 1;
     }

     /*
      * Matrix initialization (with random integers).
      */
     printf("Initializing matrices...\n");
     srand(time(NULL));
     for (i = 0; i < a_r; i++)
         for (j = 0; j < a_c; j++)
             a[i][j] = Random(0, 100);
     for (i = 0; i < b_r; i++)
         for (j = 0; j < b_c; j++)
             b[i][j] = Random(0, 100);
     for (i = 0; i < a_r; i++)
         for (j = 0; j < b_c; j++)
             c[i][j] = Random(0, 100);

     end = clock();
     dif = ((double) (end - start)) / CLOCKS_PER_SEC;
     printf("Malloc and initialization took %f sec.\n", dif);

     /*
      * Matrix multiplication.
      */
     start = clock();        /* start the timer */
     start1 = give_ru();     /* start the rusage counters */

     for (i = 0; i < a_r; i++)
         for (j = 0; j < b_c; j++)
             for (k = 0; k < a_c; k++)
                 c[i][j] = c[i][j] + a[i][k] * b[k][j];

     end1 = give_ru();       /* stop the rusage counters */
     end = clock();          /* stop the timer */
     dif = ((double) (end - start)) / CLOCKS_PER_SEC;

     struct timeval timeUserStart = start1.ru_utime;
     double utimeStart = (double) timeUserStart.tv_sec + (double) timeUserStart.tv_usec / 1000000.0;
     struct timeval timeSystemStart = start1.ru_stime;
     double stimeStart = (double) timeSystemStart.tv_sec + (double) timeSystemStart.tv_usec / 1000000.0;
     struct timeval timeUserEnd = end1.ru_utime;
     double utimeEnd = (double) timeUserEnd.tv_sec + (double) timeUserEnd.tv_usec / 1000000.0;
     struct timeval timeSystemEnd = end1.ru_stime;
     double stimeEnd = (double) timeSystemEnd.tv_sec + (double) timeSystemEnd.tv_usec / 1000000.0;

     double difuTime = utimeEnd - utimeStart;
     double difsTime = stimeEnd - stimeStart;
     long pageReclaims = end1.ru_minflt;
     long pageFaults = end1.ru_majflt;

     printf("Multiplication took %f sec.\n", dif);
     printf("Time user     : %f .\n", difuTime);
     printf("Time system   : %f .\n", difsTime);
     printf("Page reclaims : %ld .\n", pageReclaims);
     printf("Page faults   : %ld .\n", pageFaults);

     /*
      * Free memory.
      */
     printf("Finished, cleaning up \n");
     munlockall();

     for (i = 0; i < a_r; i++) { free(a[i]); a[i] = NULL; }
     free(a); a = NULL;
     for (i = 0; i < b_r; i++) { free(b[i]); b[i] = NULL; }
     free(b); b = NULL;
     for (i = 0; i < a_r; i++) { free(c[i]); c[i] = NULL; }
     free(c); c = NULL;

     return 0;
 }
2 answers

I have found part of the answer. Recent Linux kernels support transparent huge pages (THP); programs use them without having to do anything special. If I disable transparent huge pages, all three tests take about 260 seconds.
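On mainline kernels, the system-wide setting lives in /sys/kernel/mm/transparent_hugepage/enabled. A program can also ask for huge pages explicitly on a mapping; here is a minimal sketch (my own illustration, not code from the question), assuming a Linux kernel built with THP and the Linux-specific MADV_HUGEPAGE flag:

 #define _GNU_SOURCE
 #include <stdio.h>
 #include <stdlib.h>
 #include <sys/mman.h>

 #define N 2500

 int main(void)
 {
     size_t bytes = (size_t)N * N * sizeof(int);

     /* Anonymous private mapping; THP may back it with 2 MB pages. */
     int *a = mmap(NULL, bytes, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
     if (a == MAP_FAILED) { perror("mmap"); return 1; }

 #ifdef MADV_HUGEPAGE
     /* Explicitly ask the kernel to use transparent huge pages here. */
     if (madvise(a, bytes, MADV_HUGEPAGE) != 0)
         perror("madvise(MADV_HUGEPAGE)");
 #endif

     /* ... use a as a 2500x2500 matrix (a[i*N + j]) ... */

     munmap(a, bytes);
     return 0;
 }

With madvise the experiment no longer depends on whether malloc happens to return a THP-eligible region.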

Tests 1 and 3 always use huge pages. Here you can see the number of AnonHugePages over time: [plot: AnonHugePages in tests 1 and 3]

Test 2 also uses huge pages, but not on every run and not from the very beginning of the program: [plot: AnonHugePages in test 2]
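The question does not say how the AnonHugePages numbers were sampled; one simple way to do it from within the program (an illustrative sketch, not the original measurement code) is to read the AnonHugePages line from /proc/meminfo:

 #include <stdio.h>
 #include <string.h>

 /* Return the current system-wide AnonHugePages value (in kB) from
  * /proc/meminfo, or -1 on error. */
 long anon_huge_pages_kb(void)
 {
     FILE *f = fopen("/proc/meminfo", "r");
     char line[256];
     long kb = -1;

     if (f == NULL)
         return -1;
     while (fgets(line, sizeof(line), f) != NULL) {
         if (strncmp(line, "AnonHugePages:", 14) == 0) {
             sscanf(line + 14, "%ld", &kb);
             break;
         }
     }
     fclose(f);
     return kb;
 }

Per-process numbers can be read the same way from /proc/<pid>/smaps instead of /proc/meminfo.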

I still don't understand why test 2 is irregular, though. However, test 3 allocates its memory with the mmap() system call, while test 2 allocates through brk(). Maybe the answer lies there; I will keep investigating.
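One experiment to isolate the brk()/mmap() difference (my suggestion, not something from the original post) is to force glibc malloc to satisfy every request with mmap() by lowering the mmap threshold, so that test 2's row allocations are mmap-backed like test 3's single block:

 #include <malloc.h>
 #include <stdlib.h>

 int main(void)
 {
     /* Force glibc malloc to use mmap() for every allocation instead
      * of extending the heap with brk(). */
     mallopt(M_MMAP_THRESHOLD, 0);

     /* ... same allocations and multiplication as in test 2 ... */
     return 0;
 }

Note the caveat: each 2500-int row is only about 10 kB, and such small individual mappings are not normally backed by transparent huge pages, so this mainly tests the allocator path, not the huge-page behaviour of test 3.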

(I forgot to say this earlier, but the program is launched three times.)


A pointer array requires an additional level of indirection and therefore will be slower. Moreover, it can cause more cache misses.
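If you want to keep the a[i][j] syntax while avoiding scattered rows, one common technique (my illustration, not something proposed in this answer) is to allocate the pointer array plus one contiguous slab and point every row into the slab:

 #include <stdlib.h>

 /* Allocate an n x n int matrix as an array of row pointers that all
  * point into one contiguous block, so element storage stays dense. */
 int **alloc_matrix(int n)
 {
     int **rows = malloc(sizeof(int *) * n);
     int *slab  = malloc(sizeof(int) * (size_t)n * n);
     int i;

     if (rows == NULL || slab == NULL) {
         free(rows);
         free(slab);
         return NULL;
     }
     for (i = 0; i < n; i++)
         rows[i] = slab + (size_t)i * n;
     return rows;
 }

 void free_matrix(int **m)
 {
     if (m != NULL) {
         free(m[0]);  /* the slab */
         free(m);     /* the pointer array */
     }
 }

The extra load of the row pointer remains, but the element layout becomes identical to test 3, which restores spatial locality between rows.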

So I tried with 1000 × 1000 matrices:

Test 2 (int **)

 $ perf stat ./test2

  Performance counter stats for './test1':

        8561,688348  task-clock (msec)         #    1,000 CPUs utilized
                 13  context-switches          #    0,002 K/sec
                  9  cpu-migrations            #    0,001 K/sec
              3 058  page-faults               #    0,357 K/sec
     24 844 304 630  cycles                    #    2,902 GHz                      [83,32%]
     21 302 837 742  stalled-cycles-frontend   #   85,75% frontend cycles idle     [83,32%]
      2 110 745 845  stalled-cycles-backend    #    8,50% backend cycles idle      [66,68%]
      7 030 427 722  instructions              #    0,28  insns per cycle
                                               #    3,03  stalled cycles per insn  [83,36%]
      1 004 889 984  branches                  #  117,371 M/sec                    [83,37%]
          1 077 360  branch-misses             #    0,11% of all branches          [83,32%]

        8,561758966 seconds time elapsed

Test 3 (int *)

 $ perf stat ./test3

  Performance counter stats for './test2':

        1367,856713  task-clock (msec)         #    0,995 CPUs utilized
                195  context-switches          #    0,143 K/sec
                  3  cpu-migrations            #    0,002 K/sec
              3 001  page-faults               #    0,002 M/sec
      3 977 759 335  cycles                    #    2,908 GHz                      [83,41%]
        975 477 913  stalled-cycles-frontend   #   24,52% frontend cycles idle     [83,41%]
        179 003 530  stalled-cycles-backend    #    4,50% backend cycles idle      [66,81%]
      7 017 803 848  instructions              #    1,76  insns per cycle
                                               #    0,14  stalled cycles per insn  [83,41%]
      1 002 321 021  branches                  #  732,768 M/sec                    [83,42%]
          1 046 066  branch-misses             #    0,10% of all branches          [82,97%]

        1,374613079 seconds time elapsed

As we can see, test 2 uses far more CPU cycles than test 3, and spends most of them stalled.

I also measured the cache misses:

Test 2 (int **)

 $ perf stat -e 'L1-dcache-loads,L1-dcache-load-misses,L1-dcache-stores,L1-dcache-store-misses' ./test2

  Performance counter stats for './test1':

      3 009 028 415  L1-dcache-loads
      1 259 622 058  L1-dcache-load-misses     #   41,86% of all L1-dcache hits
          6 427 561  L1-dcache-stores
          1 141 461  L1-dcache-store-misses

        8,650905202 seconds time elapsed

Test 3 (int *)

 $ perf stat -e 'L1-dcache-loads,L1-dcache-load-misses,L1-dcache-stores,L1-dcache-store-misses' ./test3

  Performance counter stats for './test2':

      2 004 807 223  L1-dcache-loads
        596 388 192  L1-dcache-load-misses     #   29,75% of all L1-dcache hits
          2 111 264  L1-dcache-stores
            384 198  L1-dcache-store-misses

        1,384352257 seconds time elapsed
