Why is my code much slower than opencv for a simple StereoBM algorithm?

Question

Why is my code much slower than opencv for a simple StereoBM algorithm?

This is my test code for implementing a simple testBM algorithm, without prior filtering. But it takes about 400 ms or even more when the window size is larger, and StereoBM opencv (CPU not GPU) takes 20 ms. I checked the source of StereoBM, but it's hard for me to understand. Does anyone know why?

And below is my code.

void testBM(const Mat &left0, const Mat &right0, Mat &disparity, int SAD, int searchRange) { int cols = left0.cols; int rows = left0.rows; int total = cols*rows; const uchar* data_left = left0.ptr<uchar>(0); const uchar* data_right = right0.ptr<uchar>(0); uchar* data_dm = new uchar[total]; int dbNum = 2 * SAD + 1; int dNum = dbNum * dbNum; //x is col index in the dbNum * dbNum window //y is row index in this window //z is (x + y * cols). //I compute them in advance for avoid computing repeatedly. Point3i *dLocDif = new Point3i[dNum]; for (int i = 0; i < dNum; i++) { dLocDif[i] = Point3i(i%dbNum - SAD, i / dbNum - SAD, 0); dLocDif[i].z = dLocDif[i].x + dLocDif[i].y * cols; } //I compute disparity difference for eache search range to avoid //computing repeatedly. uchar* dif_ = new uchar[total*searchRange]; for (int _search = 0; _search < searchRange; _search++) { int th = _search * total; for (int i = 0; i < total; i++) { int c = i % cols - _search; if (c < 0) continue; dif_[i+th] = (uchar)std::abs(data_left[i] - data_right[i-_search]); } } for (int p = 0; p < total; p++) { int min = 50 * dNum; int dm = -256; int _col = p % cols; int _row = p / cols; int th = 0; //I search for the smallest difference between left and right image // using def_. for (int _search = 0; _search < searchRange; _search++, th += total) { if (_col + _search > cols) break; int temp = 0; for (int i = 0; i < dNum; i++) { int _c = _col + dLocDif[i].x; if (_c >= cols || _c < 0) continue; int _r = _row + dLocDif[i].y; if (_r >= rows || _r < 0) continue; temp += dif_[th + p + dLocDif[i].z]; if (temp > min) { break; } } if (temp < min) { dm = _search; min = temp; } } data_dm[p] = dm; } disparity = Mat(rows, cols, CV_8UC1, data_dm); }

Here is the main StereoBM source code in opencv. I got a little confused after initialization. Can someone explain briefly?

 static void findStereoCorrespondenceBM( const Mat& left, const Mat& right, Mat& disp, Mat& cost, const CvStereoBMState& state, uchar* buf, int _dy0, int _dy1 ) { const int ALIGN = 16; int x, y, d; int wsz = state.SADWindowSize, wsz2 = wsz/2; int dy0 = MIN(_dy0, wsz2+1), dy1 = MIN(_dy1, wsz2+1); int ndisp = state.numberOfDisparities; int mindisp = state.minDisparity; int lofs = MAX(ndisp - 1 + mindisp, 0); int rofs = -MIN(ndisp - 1 + mindisp, 0); int width = left.cols, height = left.rows; int width1 = width - rofs - ndisp + 1; int ftzero = state.preFilterCap; int textureThreshold = state.textureThreshold; int uniquenessRatio = state.uniquenessRatio; short FILTERED = (short)((mindisp - 1) << DISPARITY_SHIFT); int *sad, *hsad0, *hsad, *hsad_sub, *htext; uchar *cbuf0, *cbuf; const uchar* lptr0 = left.data + lofs; const uchar* rptr0 = right.data + rofs; const uchar *lptr, *lptr_sub, *rptr; short* dptr = (short*)disp.data; int sstep = (int)left.step; int dstep = (int)(disp.step/sizeof(dptr[0])); int cstep = (height+dy0+dy1)*ndisp; int costbuf = 0; int coststep = cost.data ? (int)(cost.step/sizeof(costbuf)) : 0; const int TABSZ = 256; uchar tab[TABSZ]; sad = (int*)alignPtr(buf + sizeof(sad[0]), ALIGN); hsad0 = (int*)alignPtr(sad + ndisp + 1 + dy0*ndisp, ALIGN); htext = (int*)alignPtr((int*)(hsad0 + (height+dy1)*ndisp) + wsz2 + 2, ALIGN); cbuf0 = (uchar*)alignPtr((uchar*)(htext + height + wsz2 + 2) + dy0*ndisp, ALIGN); for( x = 0; x < TABSZ; x++ ) tab[x] = (uchar)std::abs(x - ftzero); // initialize buffers memset( hsad0 - dy0*ndisp, 0, (height + dy0 + dy1)*ndisp*sizeof(hsad0[0]) ); memset( htext - wsz2 - 1, 0, (height + wsz + 1)*sizeof(htext[0]) ); for( x = -wsz2-1; x < wsz2; x++ ) { hsad = hsad0 - dy0*ndisp; cbuf = cbuf0 + (x + wsz2 + 1)*cstep - dy0*ndisp; lptr = lptr0 + std::min(std::max(x, -lofs), width-lofs-1) - dy0*sstep; rptr = rptr0 + std::min(std::max(x, -rofs), width-rofs-1) - dy0*sstep; for( y = -dy0; y < height + dy1; y++, hsad += ndisp, cbuf += ndisp, lptr += sstep, rptr += sstep ) { int lval = lptr[0]; for( d = 0; d < ndisp; d++ ) { int diff = std::abs(lval - rptr[d]); cbuf[d] = (uchar)diff; hsad[d] = (int)(hsad[d] + diff); } htext[y] += tab[lval]; } } // initialize the left and right borders of the disparity map for( y = 0; y < height; y++ ) { for( x = 0; x < lofs; x++ ) dptr[y*dstep + x] = FILTERED; for( x = lofs + width1; x < width; x++ ) dptr[y*dstep + x] = FILTERED; } dptr += lofs; for( x = 0; x < width1; x++, dptr++ ) { int* costptr = cost.data ? (int*)cost.data + lofs + x : &costbuf; int x0 = x - wsz2 - 1, x1 = x + wsz2; const uchar* cbuf_sub = cbuf0 + ((x0 + wsz2 + 1) % (wsz + 1))*cstep - dy0*ndisp; cbuf = cbuf0 + ((x1 + wsz2 + 1) % (wsz + 1))*cstep - dy0*ndisp; hsad = hsad0 - dy0*ndisp; lptr_sub = lptr0 + MIN(MAX(x0, -lofs), width-1-lofs) - dy0*sstep; lptr = lptr0 + MIN(MAX(x1, -lofs), width-1-lofs) - dy0*sstep; rptr = rptr0 + MIN(MAX(x1, -rofs), width-1-rofs) - dy0*sstep; for( y = -dy0; y < height + dy1; y++, cbuf += ndisp, cbuf_sub += ndisp, hsad += ndisp, lptr += sstep, lptr_sub += sstep, rptr += sstep ) { int lval = lptr[0]; for( d = 0; d < ndisp; d++ ) { int diff = std::abs(lval - rptr[d]); cbuf[d] = (uchar)diff; hsad[d] = hsad[d] + diff - cbuf_sub[d]; } htext[y] += tab[lval] - tab[lptr_sub[0]]; } // fill borders for( y = dy1; y <= wsz2; y++ ) htext[height+y] = htext[height+dy1-1]; for( y = -wsz2-1; y < -dy0; y++ ) htext[y] = htext[-dy0]; // initialize sums for( d = 0; d < ndisp; d++ ) sad[d] = (int)(hsad0[d-ndisp*dy0]*(wsz2 + 2 - dy0)); hsad = hsad0 + (1 - dy0)*ndisp; for( y = 1 - dy0; y < wsz2; y++, hsad += ndisp ) for( d = 0; d < ndisp; d++ ) sad[d] = (int)(sad[d] + hsad[d]); int tsum = 0; for( y = -wsz2-1; y < wsz2; y++ ) tsum += htext[y]; // finally, start the real processing for( y = 0; y < height; y++ ) { int minsad = INT_MAX, mind = -1; hsad = hsad0 + MIN(y + wsz2, height+dy1-1)*ndisp; hsad_sub = hsad0 + MAX(y - wsz2 - 1, -dy0)*ndisp; for( d = 0; d < ndisp; d++ ) { int currsad = sad[d] + hsad[d] - hsad_sub[d]; sad[d] = currsad; if( currsad < minsad ) { minsad = currsad; mind = d; } } tsum += htext[y + wsz2] - htext[y - wsz2 - 1]; if( tsum < textureThreshold ) { dptr[y*dstep] = FILTERED; continue; } if( uniquenessRatio > 0 ) { int thresh = minsad + (minsad * uniquenessRatio/100); for( d = 0; d < ndisp; d++ ) { if( sad[d] <= thresh && (d < mind-1 || d > mind+1)) break; } if( d < ndisp ) { dptr[y*dstep] = FILTERED; continue; } } { sad[-1] = sad[1]; sad[ndisp] = sad[ndisp-2]; int p = sad[mind+1], n = sad[mind-1]; d = p + n - 2*sad[mind] + std::abs(p - n); dptr[y*dstep] = (short)(((ndisp - mind - 1 + mindisp)*256 + (d != 0 ? (pn)*256/d : 0) + 15) >> 4); costptr[y*coststep] = sad[mind]; } } } }

+8

c ++ algorithm opencv stereo-3d

Seven Apr 26 '15 at 14:16

source share

1 answer

Tim b · Accepted Answer · 2015-10-17T09:25:15+0000

OpenCV runs many algorithms in parallel; parallel_for / do annotation of TBB, PPL and OpenMP.

The original image is subdivided into subregions, and findStereoCorrespondenceBM() is executed for each of these subregions. This is possible thanks to the interface that we see, since cv::Mat can be used as a representation for a sub-image without having to copy the actual pixel data. You can verify this by looking at the processors used (for example, using the process explorer on windows or on top on unix) during program execution.

(Originally posted by Hauke Heibel as a comment)

Why is my code much slower than opencv for a simple StereoBM algorithm?

More articles: