The following lines of code
int nrows = 4096;
int ncols = 4096;
size_t numel = nrows * ncols;
unsigned char *buff = (unsigned char *) malloc( numel );
unsigned char *pbuff = buff;

#pragma omp parallel for schedule(static), firstprivate(pbuff, nrows, ncols), num_threads(1)
for (int i=0; i<nrows; i++) {
    for (int j=0; j<ncols; j++) {
        *pbuff += 1;
        pbuff++;
    }
}
take 11130 usecs to run on my i5-3230M when compiled with
g++ -o main main.cpp -std=c++0x -O3
That is, when the OpenMP pragma is ignored.
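In other words, my understanding is that without -fopenmp the compiler simply drops the unknown pragma, so that build is effectively timing nothing more than the plain nested loop below (same variables as above; this is just my reading of what gets compiled, not something I verified in the assembly):

// What the non-OpenMP build effectively runs once the pragma is ignored:
// a single pass over the 4096x4096 buffer, incrementing every byte once.
unsigned char *pbuff = buff;
for (int i = 0; i < nrows; i++) {
    for (int j = 0; j < ncols; j++) {
        *pbuff += 1;
        pbuff++;
    }
}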
On the other hand, when compiling with
g++ -o main main.cpp -std=c++0x -O3 -fopenmp
it takes only 1496 usecs.
This is more than 6 times faster, which is pretty surprising considering that it runs on a dual-core machine. In fact, I also tested it with num_threads(1), and the performance improvement is still very significant (more than 3 times faster).
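To be explicit about what I compared, the two pragma variants are sketched below; I assume the first one uses the OpenMP default thread count (two on this machine) and is the ~1496 usecs run, while the second forces a single thread. The loop body is identical in both cases.

// Variant I assume ran with the default two threads (the ~1496 usecs figure):
#pragma omp parallel for schedule(static), firstprivate(pbuff, nrows, ncols)

// Variant pinned to a single thread, still more than 3 times faster than
// the build without -fopenmp:
#pragma omp parallel for schedule(static), firstprivate(pbuff, nrows, ncols), num_threads(1)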
Can anyone help me understand this behavior?
EDIT: following the guidelines, here is the complete code snippet:
#include <stdlib.h>
#include <iostream>
#include <chrono>
#include <cassert>

int nrows = 4096;
int ncols = 4096;
size_t numel = nrows * ncols;
unsigned char * buff;

void func()
{
    unsigned char *pbuff = buff;
    #pragma omp parallel for schedule(static), firstprivate(pbuff, nrows, ncols), num_threads(1)
    for (int i=0; i<nrows; i++) {
        for (int j=0; j<ncols; j++) {
            *pbuff += 1;
            pbuff++;
        }
    }
}

int main()
{
    // allocation & initialization
    buff = (unsigned char *) malloc( numel );
    assert(buff != NULL);
    for (int k=0; k<numel; k++)
        buff[k] = 0;

    // timing setup
    std::chrono::high_resolution_clock::time_point begin;
    std::chrono::high_resolution_clock::time_point end;
    begin = std::chrono::high_resolution_clock::now();

    // run func() 100 times and time the total
    for (int k=0; k<100; k++)
        func();

    end = std::chrono::high_resolution_clock::now();
    auto usec = std::chrono::duration_cast<std::chrono::microseconds>(end-begin).count();
    std::cout << "func average running time: " << usec/100 << " usecs" << std::endl;

    return 0;
}
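For reference, these are the exact commands I use to build and run the two versions (the binary names below are arbitrary); the timings quoted above are the per-call averages printed by the program on my i5-3230M:

g++ -o main_serial main.cpp -std=c++0x -O3            # pragma ignored, ~11130 usecs per call for me
g++ -o main_omp    main.cpp -std=c++0x -O3 -fopenmp   # OpenMP enabled, ~1496 usecs per call for me
./main_serial
./main_omp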