1 //M*////////////////////////////////////////////////////////////////////////////////////// 2 // 3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. 4 // 5 // By downloading, copying, installing or using the software you agree to this license. 6 // If you do not agree to this license, do not download, install, 7 // copy or use the software. 8 // 9 // 10 // License Agreement 11 // For Open Source Computer Vision Library 12 // 13 // Copyright (C) 2000, Intel Corporation, all rights reserved. 14 // Copyright (C) 2013, OpenCV Foundation, all rights reserved. 15 // Third party copyrights are property of their respective owners. 16 // 17 // Redistribution and use in source and binary forms, with or without modification, 18 // are permitted provided that the following conditions are met: 19 // 20 // * Redistribution's of source code must retain the above copyright notice, 21 // this list of conditions and the following disclaimer. 22 // 23 // * Redistribution's in binary form must reproduce the above copyright notice, 24 // this list of conditions and the following disclaimer in the documentation 25 // and/or other materials provided with the distribution. 26 // 27 // * The name of the copyright holders may not be used to endorse or promote products 28 // derived from this software without specific prior written permission. 29 // 30 // This software is provided by the copyright holders and contributors "as is" and 31 // any express or implied warranties, including, but not limited to, the implied 32 // warranties of merchantability and fitness for a particular purpose are disclaimed. 33 // In no event shall the Intel Corporation or contributors be liable for any direct, 34 // indirect, incidental, special, exemplary, or consequential damages 35 // (including, but not limited to, procurement of substitute goods or services; 36 // loss of use, data, or profits; or business interruption) however caused 37 // and on any theory of liability, whether in contract, strict liability, 38 // or tort (including negligence or otherwise) arising in any way out of 39 // the use of this software, even if advised of the possibility of such damage. 40 // 41 //M*/ 42 43 /****************************************************************************************\ 44 * Very fast SAD-based (Sum-of-Absolute-Diffrences) stereo correspondence algorithm. * 45 * Contributed by Kurt Konolige * 46 \****************************************************************************************/ 47 48 #include "precomp.hpp" 49 #include <stdio.h> 50 #include <limits> 51 #include "opencl_kernels_calib3d.hpp" 52 53 namespace cv 54 { 55 56 struct StereoBMParams 57 { 58 StereoBMParams(int _numDisparities=64, int _SADWindowSize=21) 59 { 60 preFilterType = StereoBM::PREFILTER_XSOBEL; 61 preFilterSize = 9; 62 preFilterCap = 31; 63 SADWindowSize = _SADWindowSize; 64 minDisparity = 0; 65 numDisparities = _numDisparities > 0 ? _numDisparities : 64; 66 textureThreshold = 10; 67 uniquenessRatio = 15; 68 speckleRange = speckleWindowSize = 0; 69 roi1 = roi2 = Rect(0,0,0,0); 70 disp12MaxDiff = -1; 71 dispType = CV_16S; 72 } 73 74 int preFilterType; 75 int preFilterSize; 76 int preFilterCap; 77 int SADWindowSize; 78 int minDisparity; 79 int numDisparities; 80 int textureThreshold; 81 int uniquenessRatio; 82 int speckleRange; 83 int speckleWindowSize; 84 Rect roi1, roi2; 85 int disp12MaxDiff; 86 int dispType; 87 }; 88 89 static bool ocl_prefilter_norm(InputArray _input, OutputArray _output, int winsize, int prefilterCap) 90 { 91 ocl::Kernel k("prefilter_norm", ocl::calib3d::stereobm_oclsrc, cv::format("-D WSZ=%d", winsize)); 92 if(k.empty()) 93 return false; 94 95 int scale_g = winsize*winsize/8, scale_s = (1024 + scale_g)/(scale_g*2); 96 scale_g *= scale_s; 97 98 UMat input = _input.getUMat(), output; 99 _output.create(input.size(), input.type()); 100 output = _output.getUMat(); 101 102 size_t globalThreads[3] = { input.cols, input.rows, 1 }; 103 104 k.args(ocl::KernelArg::PtrReadOnly(input), ocl::KernelArg::PtrWriteOnly(output), input.rows, input.cols, 105 prefilterCap, scale_g, scale_s); 106 107 return k.run(2, globalThreads, NULL, false); 108 } 109 110 static void prefilterNorm( const Mat& src, Mat& dst, int winsize, int ftzero, uchar* buf ) 111 { 112 int x, y, wsz2 = winsize/2; 113 int* vsum = (int*)alignPtr(buf + (wsz2 + 1)*sizeof(vsum[0]), 32); 114 int scale_g = winsize*winsize/8, scale_s = (1024 + scale_g)/(scale_g*2); 115 const int OFS = 256*5, TABSZ = OFS*2 + 256; 116 uchar tab[TABSZ]; 117 const uchar* sptr = src.ptr(); 118 int srcstep = (int)src.step; 119 Size size = src.size(); 120 121 scale_g *= scale_s; 122 123 for( x = 0; x < TABSZ; x++ ) 124 tab[x] = (uchar)(x - OFS < -ftzero ? 0 : x - OFS > ftzero ? ftzero*2 : x - OFS + ftzero); 125 126 for( x = 0; x < size.width; x++ ) 127 vsum[x] = (ushort)(sptr[x]*(wsz2 + 2)); 128 129 for( y = 1; y < wsz2; y++ ) 130 { 131 for( x = 0; x < size.width; x++ ) 132 vsum[x] = (ushort)(vsum[x] + sptr[srcstep*y + x]); 133 } 134 135 for( y = 0; y < size.height; y++ ) 136 { 137 const uchar* top = sptr + srcstep*MAX(y-wsz2-1,0); 138 const uchar* bottom = sptr + srcstep*MIN(y+wsz2,size.height-1); 139 const uchar* prev = sptr + srcstep*MAX(y-1,0); 140 const uchar* curr = sptr + srcstep*y; 141 const uchar* next = sptr + srcstep*MIN(y+1,size.height-1); 142 uchar* dptr = dst.ptr<uchar>(y); 143 144 for( x = 0; x < size.width; x++ ) 145 vsum[x] = (ushort)(vsum[x] + bottom[x] - top[x]); 146 147 for( x = 0; x <= wsz2; x++ ) 148 { 149 vsum[-x-1] = vsum[0]; 150 vsum[size.width+x] = vsum[size.width-1]; 151 } 152 153 int sum = vsum[0]*(wsz2 + 1); 154 for( x = 1; x <= wsz2; x++ ) 155 sum += vsum[x]; 156 157 int val = ((curr[0]*5 + curr[1] + prev[0] + next[0])*scale_g - sum*scale_s) >> 10; 158 dptr[0] = tab[val + OFS]; 159 160 for( x = 1; x < size.width-1; x++ ) 161 { 162 sum += vsum[x+wsz2] - vsum[x-wsz2-1]; 163 val = ((curr[x]*4 + curr[x-1] + curr[x+1] + prev[x] + next[x])*scale_g - sum*scale_s) >> 10; 164 dptr[x] = tab[val + OFS]; 165 } 166 167 sum += vsum[x+wsz2] - vsum[x-wsz2-1]; 168 val = ((curr[x]*5 + curr[x-1] + prev[x] + next[x])*scale_g - sum*scale_s) >> 10; 169 dptr[x] = tab[val + OFS]; 170 } 171 } 172 173 static bool ocl_prefilter_xsobel(InputArray _input, OutputArray _output, int prefilterCap) 174 { 175 ocl::Kernel k("prefilter_xsobel", ocl::calib3d::stereobm_oclsrc); 176 if(k.empty()) 177 return false; 178 179 UMat input = _input.getUMat(), output; 180 _output.create(input.size(), input.type()); 181 output = _output.getUMat(); 182 183 size_t globalThreads[3] = { input.cols, input.rows, 1 }; 184 185 k.args(ocl::KernelArg::PtrReadOnly(input), ocl::KernelArg::PtrWriteOnly(output), input.rows, input.cols, prefilterCap); 186 187 return k.run(2, globalThreads, NULL, false); 188 } 189 190 static void 191 prefilterXSobel( const Mat& src, Mat& dst, int ftzero ) 192 { 193 int x, y; 194 const int OFS = 256*4, TABSZ = OFS*2 + 256; 195 uchar tab[TABSZ]; 196 Size size = src.size(); 197 198 for( x = 0; x < TABSZ; x++ ) 199 tab[x] = (uchar)(x - OFS < -ftzero ? 0 : x - OFS > ftzero ? ftzero*2 : x - OFS + ftzero); 200 uchar val0 = tab[0 + OFS]; 201 202 #if CV_SSE2 203 volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE2); 204 #endif 205 206 for( y = 0; y < size.height-1; y += 2 ) 207 { 208 const uchar* srow1 = src.ptr<uchar>(y); 209 const uchar* srow0 = y > 0 ? srow1 - src.step : size.height > 1 ? srow1 + src.step : srow1; 210 const uchar* srow2 = y < size.height-1 ? srow1 + src.step : size.height > 1 ? srow1 - src.step : srow1; 211 const uchar* srow3 = y < size.height-2 ? srow1 + src.step*2 : srow1; 212 uchar* dptr0 = dst.ptr<uchar>(y); 213 uchar* dptr1 = dptr0 + dst.step; 214 215 dptr0[0] = dptr0[size.width-1] = dptr1[0] = dptr1[size.width-1] = val0; 216 x = 1; 217 218 #if CV_NEON 219 int16x8_t ftz = vdupq_n_s16 ((short) ftzero); 220 uint8x8_t ftz2 = vdup_n_u8 (cv::saturate_cast<uchar>(ftzero*2)); 221 222 for(; x <=size.width-9; x += 8 ) 223 { 224 uint8x8_t c0 = vld1_u8 (srow0 + x - 1); 225 uint8x8_t c1 = vld1_u8 (srow1 + x - 1); 226 uint8x8_t d0 = vld1_u8 (srow0 + x + 1); 227 uint8x8_t d1 = vld1_u8 (srow1 + x + 1); 228 229 int16x8_t t0 = vreinterpretq_s16_u16 (vsubl_u8 (d0, c0)); 230 int16x8_t t1 = vreinterpretq_s16_u16 (vsubl_u8 (d1, c1)); 231 232 uint8x8_t c2 = vld1_u8 (srow2 + x - 1); 233 uint8x8_t c3 = vld1_u8 (srow3 + x - 1); 234 uint8x8_t d2 = vld1_u8 (srow2 + x + 1); 235 uint8x8_t d3 = vld1_u8 (srow3 + x + 1); 236 237 int16x8_t t2 = vreinterpretq_s16_u16 (vsubl_u8 (d2, c2)); 238 int16x8_t t3 = vreinterpretq_s16_u16 (vsubl_u8 (d3, c3)); 239 240 int16x8_t v0 = vaddq_s16 (vaddq_s16 (t2, t0), vaddq_s16 (t1, t1)); 241 int16x8_t v1 = vaddq_s16 (vaddq_s16 (t3, t1), vaddq_s16 (t2, t2)); 242 243 244 uint8x8_t v0_u8 = vqmovun_s16 (vaddq_s16 (v0, ftz)); 245 uint8x8_t v1_u8 = vqmovun_s16 (vaddq_s16 (v1, ftz)); 246 v0_u8 = vmin_u8 (v0_u8, ftz2); 247 v1_u8 = vmin_u8 (v1_u8, ftz2); 248 vqmovun_s16 (vaddq_s16 (v1, ftz)); 249 250 vst1_u8 (dptr0 + x, v0_u8); 251 vst1_u8 (dptr1 + x, v1_u8); 252 } 253 #elif CV_SSE2 254 if( useSIMD ) 255 { 256 __m128i z = _mm_setzero_si128(), ftz = _mm_set1_epi16((short)ftzero), 257 ftz2 = _mm_set1_epi8(cv::saturate_cast<uchar>(ftzero*2)); 258 for( ; x <= size.width-9; x += 8 ) 259 { 260 __m128i c0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow0 + x - 1)), z); 261 __m128i c1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow1 + x - 1)), z); 262 __m128i d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow0 + x + 1)), z); 263 __m128i d1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow1 + x + 1)), z); 264 265 d0 = _mm_sub_epi16(d0, c0); 266 d1 = _mm_sub_epi16(d1, c1); 267 268 __m128i c2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow2 + x - 1)), z); 269 __m128i c3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow3 + x - 1)), z); 270 __m128i d2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow2 + x + 1)), z); 271 __m128i d3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow3 + x + 1)), z); 272 273 d2 = _mm_sub_epi16(d2, c2); 274 d3 = _mm_sub_epi16(d3, c3); 275 276 __m128i v0 = _mm_add_epi16(d0, _mm_add_epi16(d2, _mm_add_epi16(d1, d1))); 277 __m128i v1 = _mm_add_epi16(d1, _mm_add_epi16(d3, _mm_add_epi16(d2, d2))); 278 v0 = _mm_packus_epi16(_mm_add_epi16(v0, ftz), _mm_add_epi16(v1, ftz)); 279 v0 = _mm_min_epu8(v0, ftz2); 280 281 _mm_storel_epi64((__m128i*)(dptr0 + x), v0); 282 _mm_storel_epi64((__m128i*)(dptr1 + x), _mm_unpackhi_epi64(v0, v0)); 283 } 284 } 285 #endif 286 287 for( ; x < size.width-1; x++ ) 288 { 289 int d0 = srow0[x+1] - srow0[x-1], d1 = srow1[x+1] - srow1[x-1], 290 d2 = srow2[x+1] - srow2[x-1], d3 = srow3[x+1] - srow3[x-1]; 291 int v0 = tab[d0 + d1*2 + d2 + OFS]; 292 int v1 = tab[d1 + d2*2 + d3 + OFS]; 293 dptr0[x] = (uchar)v0; 294 dptr1[x] = (uchar)v1; 295 } 296 } 297 298 #if CV_NEON 299 uint8x16_t val0_16 = vdupq_n_u8 (val0); 300 #endif 301 302 for( ; y < size.height; y++ ) 303 { 304 uchar* dptr = dst.ptr<uchar>(y); 305 x = 0; 306 #if CV_NEON 307 for(; x <= size.width-16; x+=16 ) 308 vst1q_u8 (dptr + x, val0_16); 309 #endif 310 for(; x < size.width; x++ ) 311 dptr[x] = val0; 312 } 313 } 314 315 316 static const int DISPARITY_SHIFT = 4; 317 318 #if CV_SSE2 319 static void findStereoCorrespondenceBM_SSE2( const Mat& left, const Mat& right, 320 Mat& disp, Mat& cost, StereoBMParams& state, 321 uchar* buf, int _dy0, int _dy1 ) 322 { 323 const int ALIGN = 16; 324 int x, y, d; 325 int wsz = state.SADWindowSize, wsz2 = wsz/2; 326 int dy0 = MIN(_dy0, wsz2+1), dy1 = MIN(_dy1, wsz2+1); 327 int ndisp = state.numDisparities; 328 int mindisp = state.minDisparity; 329 int lofs = MAX(ndisp - 1 + mindisp, 0); 330 int rofs = -MIN(ndisp - 1 + mindisp, 0); 331 int width = left.cols, height = left.rows; 332 int width1 = width - rofs - ndisp + 1; 333 int ftzero = state.preFilterCap; 334 int textureThreshold = state.textureThreshold; 335 int uniquenessRatio = state.uniquenessRatio; 336 short FILTERED = (short)((mindisp - 1) << DISPARITY_SHIFT); 337 338 ushort *sad, *hsad0, *hsad, *hsad_sub; 339 int *htext; 340 uchar *cbuf0, *cbuf; 341 const uchar* lptr0 = left.ptr() + lofs; 342 const uchar* rptr0 = right.ptr() + rofs; 343 const uchar *lptr, *lptr_sub, *rptr; 344 short* dptr = disp.ptr<short>(); 345 int sstep = (int)left.step; 346 int dstep = (int)(disp.step/sizeof(dptr[0])); 347 int cstep = (height + dy0 + dy1)*ndisp; 348 short costbuf = 0; 349 int coststep = cost.data ? (int)(cost.step/sizeof(costbuf)) : 0; 350 const int TABSZ = 256; 351 uchar tab[TABSZ]; 352 const __m128i d0_8 = _mm_setr_epi16(0,1,2,3,4,5,6,7), dd_8 = _mm_set1_epi16(8); 353 354 sad = (ushort*)alignPtr(buf + sizeof(sad[0]), ALIGN); 355 hsad0 = (ushort*)alignPtr(sad + ndisp + 1 + dy0*ndisp, ALIGN); 356 htext = (int*)alignPtr((int*)(hsad0 + (height+dy1)*ndisp) + wsz2 + 2, ALIGN); 357 cbuf0 = (uchar*)alignPtr((uchar*)(htext + height + wsz2 + 2) + dy0*ndisp, ALIGN); 358 359 for( x = 0; x < TABSZ; x++ ) 360 tab[x] = (uchar)std::abs(x - ftzero); 361 362 // initialize buffers 363 memset( hsad0 - dy0*ndisp, 0, (height + dy0 + dy1)*ndisp*sizeof(hsad0[0]) ); 364 memset( htext - wsz2 - 1, 0, (height + wsz + 1)*sizeof(htext[0]) ); 365 366 for( x = -wsz2-1; x < wsz2; x++ ) 367 { 368 hsad = hsad0 - dy0*ndisp; cbuf = cbuf0 + (x + wsz2 + 1)*cstep - dy0*ndisp; 369 lptr = lptr0 + MIN(MAX(x, -lofs), width-lofs-1) - dy0*sstep; 370 rptr = rptr0 + MIN(MAX(x, -rofs), width-rofs-1) - dy0*sstep; 371 372 for( y = -dy0; y < height + dy1; y++, hsad += ndisp, cbuf += ndisp, lptr += sstep, rptr += sstep ) 373 { 374 int lval = lptr[0]; 375 __m128i lv = _mm_set1_epi8((char)lval), z = _mm_setzero_si128(); 376 for( d = 0; d < ndisp; d += 16 ) 377 { 378 __m128i rv = _mm_loadu_si128((const __m128i*)(rptr + d)); 379 __m128i hsad_l = _mm_load_si128((__m128i*)(hsad + d)); 380 __m128i hsad_h = _mm_load_si128((__m128i*)(hsad + d + 8)); 381 __m128i diff = _mm_adds_epu8(_mm_subs_epu8(lv, rv), _mm_subs_epu8(rv, lv)); 382 _mm_store_si128((__m128i*)(cbuf + d), diff); 383 hsad_l = _mm_add_epi16(hsad_l, _mm_unpacklo_epi8(diff,z)); 384 hsad_h = _mm_add_epi16(hsad_h, _mm_unpackhi_epi8(diff,z)); 385 _mm_store_si128((__m128i*)(hsad + d), hsad_l); 386 _mm_store_si128((__m128i*)(hsad + d + 8), hsad_h); 387 } 388 htext[y] += tab[lval]; 389 } 390 } 391 392 // initialize the left and right borders of the disparity map 393 for( y = 0; y < height; y++ ) 394 { 395 for( x = 0; x < lofs; x++ ) 396 dptr[y*dstep + x] = FILTERED; 397 for( x = lofs + width1; x < width; x++ ) 398 dptr[y*dstep + x] = FILTERED; 399 } 400 dptr += lofs; 401 402 for( x = 0; x < width1; x++, dptr++ ) 403 { 404 short* costptr = cost.data ? cost.ptr<short>() + lofs + x : &costbuf; 405 int x0 = x - wsz2 - 1, x1 = x + wsz2; 406 const uchar* cbuf_sub = cbuf0 + ((x0 + wsz2 + 1) % (wsz + 1))*cstep - dy0*ndisp; 407 cbuf = cbuf0 + ((x1 + wsz2 + 1) % (wsz + 1))*cstep - dy0*ndisp; 408 hsad = hsad0 - dy0*ndisp; 409 lptr_sub = lptr0 + MIN(MAX(x0, -lofs), width-1-lofs) - dy0*sstep; 410 lptr = lptr0 + MIN(MAX(x1, -lofs), width-1-lofs) - dy0*sstep; 411 rptr = rptr0 + MIN(MAX(x1, -rofs), width-1-rofs) - dy0*sstep; 412 413 for( y = -dy0; y < height + dy1; y++, cbuf += ndisp, cbuf_sub += ndisp, 414 hsad += ndisp, lptr += sstep, lptr_sub += sstep, rptr += sstep ) 415 { 416 int lval = lptr[0]; 417 __m128i lv = _mm_set1_epi8((char)lval), z = _mm_setzero_si128(); 418 for( d = 0; d < ndisp; d += 16 ) 419 { 420 __m128i rv = _mm_loadu_si128((const __m128i*)(rptr + d)); 421 __m128i hsad_l = _mm_load_si128((__m128i*)(hsad + d)); 422 __m128i hsad_h = _mm_load_si128((__m128i*)(hsad + d + 8)); 423 __m128i cbs = _mm_load_si128((const __m128i*)(cbuf_sub + d)); 424 __m128i diff = _mm_adds_epu8(_mm_subs_epu8(lv, rv), _mm_subs_epu8(rv, lv)); 425 __m128i diff_h = _mm_sub_epi16(_mm_unpackhi_epi8(diff, z), _mm_unpackhi_epi8(cbs, z)); 426 _mm_store_si128((__m128i*)(cbuf + d), diff); 427 diff = _mm_sub_epi16(_mm_unpacklo_epi8(diff, z), _mm_unpacklo_epi8(cbs, z)); 428 hsad_h = _mm_add_epi16(hsad_h, diff_h); 429 hsad_l = _mm_add_epi16(hsad_l, diff); 430 _mm_store_si128((__m128i*)(hsad + d), hsad_l); 431 _mm_store_si128((__m128i*)(hsad + d + 8), hsad_h); 432 } 433 htext[y] += tab[lval] - tab[lptr_sub[0]]; 434 } 435 436 // fill borders 437 for( y = dy1; y <= wsz2; y++ ) 438 htext[height+y] = htext[height+dy1-1]; 439 for( y = -wsz2-1; y < -dy0; y++ ) 440 htext[y] = htext[-dy0]; 441 442 // initialize sums 443 for( d = 0; d < ndisp; d++ ) 444 sad[d] = (ushort)(hsad0[d-ndisp*dy0]*(wsz2 + 2 - dy0)); 445 446 hsad = hsad0 + (1 - dy0)*ndisp; 447 for( y = 1 - dy0; y < wsz2; y++, hsad += ndisp ) 448 for( d = 0; d < ndisp; d += 16 ) 449 { 450 __m128i s0 = _mm_load_si128((__m128i*)(sad + d)); 451 __m128i s1 = _mm_load_si128((__m128i*)(sad + d + 8)); 452 __m128i t0 = _mm_load_si128((__m128i*)(hsad + d)); 453 __m128i t1 = _mm_load_si128((__m128i*)(hsad + d + 8)); 454 s0 = _mm_add_epi16(s0, t0); 455 s1 = _mm_add_epi16(s1, t1); 456 _mm_store_si128((__m128i*)(sad + d), s0); 457 _mm_store_si128((__m128i*)(sad + d + 8), s1); 458 } 459 int tsum = 0; 460 for( y = -wsz2-1; y < wsz2; y++ ) 461 tsum += htext[y]; 462 463 // finally, start the real processing 464 for( y = 0; y < height; y++ ) 465 { 466 int minsad = INT_MAX, mind = -1; 467 hsad = hsad0 + MIN(y + wsz2, height+dy1-1)*ndisp; 468 hsad_sub = hsad0 + MAX(y - wsz2 - 1, -dy0)*ndisp; 469 __m128i minsad8 = _mm_set1_epi16(SHRT_MAX); 470 __m128i mind8 = _mm_set1_epi16(0), d8 = d0_8, mask; 471 472 for( d = 0; d < ndisp; d += 16 ) 473 { 474 __m128i u0 = _mm_load_si128((__m128i*)(hsad_sub + d)); 475 __m128i u1 = _mm_load_si128((__m128i*)(hsad + d)); 476 477 __m128i v0 = _mm_load_si128((__m128i*)(hsad_sub + d + 8)); 478 __m128i v1 = _mm_load_si128((__m128i*)(hsad + d + 8)); 479 480 __m128i usad8 = _mm_load_si128((__m128i*)(sad + d)); 481 __m128i vsad8 = _mm_load_si128((__m128i*)(sad + d + 8)); 482 483 u1 = _mm_sub_epi16(u1, u0); 484 v1 = _mm_sub_epi16(v1, v0); 485 usad8 = _mm_add_epi16(usad8, u1); 486 vsad8 = _mm_add_epi16(vsad8, v1); 487 488 mask = _mm_cmpgt_epi16(minsad8, usad8); 489 minsad8 = _mm_min_epi16(minsad8, usad8); 490 mind8 = _mm_max_epi16(mind8, _mm_and_si128(mask, d8)); 491 492 _mm_store_si128((__m128i*)(sad + d), usad8); 493 _mm_store_si128((__m128i*)(sad + d + 8), vsad8); 494 495 mask = _mm_cmpgt_epi16(minsad8, vsad8); 496 minsad8 = _mm_min_epi16(minsad8, vsad8); 497 498 d8 = _mm_add_epi16(d8, dd_8); 499 mind8 = _mm_max_epi16(mind8, _mm_and_si128(mask, d8)); 500 d8 = _mm_add_epi16(d8, dd_8); 501 } 502 503 tsum += htext[y + wsz2] - htext[y - wsz2 - 1]; 504 if( tsum < textureThreshold ) 505 { 506 dptr[y*dstep] = FILTERED; 507 continue; 508 } 509 510 ushort CV_DECL_ALIGNED(16) minsad_buf[8], mind_buf[8]; 511 _mm_store_si128((__m128i*)minsad_buf, minsad8); 512 _mm_store_si128((__m128i*)mind_buf, mind8); 513 for( d = 0; d < 8; d++ ) 514 if(minsad > (int)minsad_buf[d] || (minsad == (int)minsad_buf[d] && mind > mind_buf[d])) 515 { 516 minsad = minsad_buf[d]; 517 mind = mind_buf[d]; 518 } 519 520 if( uniquenessRatio > 0 ) 521 { 522 int thresh = minsad + (minsad * uniquenessRatio/100); 523 __m128i thresh8 = _mm_set1_epi16((short)(thresh + 1)); 524 __m128i d1 = _mm_set1_epi16((short)(mind-1)), d2 = _mm_set1_epi16((short)(mind+1)); 525 __m128i dd_16 = _mm_add_epi16(dd_8, dd_8); 526 d8 = _mm_sub_epi16(d0_8, dd_16); 527 528 for( d = 0; d < ndisp; d += 16 ) 529 { 530 __m128i usad8 = _mm_load_si128((__m128i*)(sad + d)); 531 __m128i vsad8 = _mm_load_si128((__m128i*)(sad + d + 8)); 532 mask = _mm_cmpgt_epi16( thresh8, _mm_min_epi16(usad8,vsad8)); 533 d8 = _mm_add_epi16(d8, dd_16); 534 if( !_mm_movemask_epi8(mask) ) 535 continue; 536 mask = _mm_cmpgt_epi16( thresh8, usad8); 537 mask = _mm_and_si128(mask, _mm_or_si128(_mm_cmpgt_epi16(d1,d8), _mm_cmpgt_epi16(d8,d2))); 538 if( _mm_movemask_epi8(mask) ) 539 break; 540 __m128i t8 = _mm_add_epi16(d8, dd_8); 541 mask = _mm_cmpgt_epi16( thresh8, vsad8); 542 mask = _mm_and_si128(mask, _mm_or_si128(_mm_cmpgt_epi16(d1,t8), _mm_cmpgt_epi16(t8,d2))); 543 if( _mm_movemask_epi8(mask) ) 544 break; 545 } 546 if( d < ndisp ) 547 { 548 dptr[y*dstep] = FILTERED; 549 continue; 550 } 551 } 552 553 if( 0 < mind && mind < ndisp - 1 ) 554 { 555 int p = sad[mind+1], n = sad[mind-1]; 556 d = p + n - 2*sad[mind] + std::abs(p - n); 557 dptr[y*dstep] = (short)(((ndisp - mind - 1 + mindisp)*256 + (d != 0 ? (p-n)*256/d : 0) + 15) >> 4); 558 } 559 else 560 dptr[y*dstep] = (short)((ndisp - mind - 1 + mindisp)*16); 561 costptr[y*coststep] = sad[mind]; 562 } 563 } 564 } 565 #endif 566 567 static void 568 findStereoCorrespondenceBM( const Mat& left, const Mat& right, 569 Mat& disp, Mat& cost, const StereoBMParams& state, 570 uchar* buf, int _dy0, int _dy1 ) 571 { 572 573 const int ALIGN = 16; 574 int x, y, d; 575 int wsz = state.SADWindowSize, wsz2 = wsz/2; 576 int dy0 = MIN(_dy0, wsz2+1), dy1 = MIN(_dy1, wsz2+1); 577 int ndisp = state.numDisparities; 578 int mindisp = state.minDisparity; 579 int lofs = MAX(ndisp - 1 + mindisp, 0); 580 int rofs = -MIN(ndisp - 1 + mindisp, 0); 581 int width = left.cols, height = left.rows; 582 int width1 = width - rofs - ndisp + 1; 583 int ftzero = state.preFilterCap; 584 int textureThreshold = state.textureThreshold; 585 int uniquenessRatio = state.uniquenessRatio; 586 short FILTERED = (short)((mindisp - 1) << DISPARITY_SHIFT); 587 588 #if CV_NEON 589 CV_Assert (ndisp % 8 == 0); 590 int32_t d0_4_temp [4]; 591 for (int i = 0; i < 4; i ++) 592 d0_4_temp[i] = i; 593 int32x4_t d0_4 = vld1q_s32 (d0_4_temp); 594 int32x4_t dd_4 = vdupq_n_s32 (4); 595 #endif 596 597 int *sad, *hsad0, *hsad, *hsad_sub, *htext; 598 uchar *cbuf0, *cbuf; 599 const uchar* lptr0 = left.ptr() + lofs; 600 const uchar* rptr0 = right.ptr() + rofs; 601 const uchar *lptr, *lptr_sub, *rptr; 602 short* dptr = disp.ptr<short>(); 603 int sstep = (int)left.step; 604 int dstep = (int)(disp.step/sizeof(dptr[0])); 605 int cstep = (height+dy0+dy1)*ndisp; 606 int costbuf = 0; 607 int coststep = cost.data ? (int)(cost.step/sizeof(costbuf)) : 0; 608 const int TABSZ = 256; 609 uchar tab[TABSZ]; 610 611 sad = (int*)alignPtr(buf + sizeof(sad[0]), ALIGN); 612 hsad0 = (int*)alignPtr(sad + ndisp + 1 + dy0*ndisp, ALIGN); 613 htext = (int*)alignPtr((int*)(hsad0 + (height+dy1)*ndisp) + wsz2 + 2, ALIGN); 614 cbuf0 = (uchar*)alignPtr((uchar*)(htext + height + wsz2 + 2) + dy0*ndisp, ALIGN); 615 616 for( x = 0; x < TABSZ; x++ ) 617 tab[x] = (uchar)std::abs(x - ftzero); 618 619 // initialize buffers 620 memset( hsad0 - dy0*ndisp, 0, (height + dy0 + dy1)*ndisp*sizeof(hsad0[0]) ); 621 memset( htext - wsz2 - 1, 0, (height + wsz + 1)*sizeof(htext[0]) ); 622 623 for( x = -wsz2-1; x < wsz2; x++ ) 624 { 625 hsad = hsad0 - dy0*ndisp; cbuf = cbuf0 + (x + wsz2 + 1)*cstep - dy0*ndisp; 626 lptr = lptr0 + std::min(std::max(x, -lofs), width-lofs-1) - dy0*sstep; 627 rptr = rptr0 + std::min(std::max(x, -rofs), width-rofs-1) - dy0*sstep; 628 for( y = -dy0; y < height + dy1; y++, hsad += ndisp, cbuf += ndisp, lptr += sstep, rptr += sstep ) 629 { 630 int lval = lptr[0]; 631 #if CV_NEON 632 int16x8_t lv = vdupq_n_s16 ((int16_t)lval); 633 634 for( d = 0; d < ndisp; d += 8 ) 635 { 636 int16x8_t rv = vreinterpretq_s16_u16 (vmovl_u8 (vld1_u8 (rptr + d))); 637 int32x4_t hsad_l = vld1q_s32 (hsad + d); 638 int32x4_t hsad_h = vld1q_s32 (hsad + d + 4); 639 int16x8_t diff = vabdq_s16 (lv, rv); 640 vst1_u8 (cbuf + d, vmovn_u16(vreinterpretq_u16_s16(diff))); 641 hsad_l = vaddq_s32 (hsad_l, vmovl_s16(vget_low_s16 (diff))); 642 hsad_h = vaddq_s32 (hsad_h, vmovl_s16(vget_high_s16 (diff))); 643 vst1q_s32 ((hsad + d), hsad_l); 644 vst1q_s32 ((hsad + d + 4), hsad_h); 645 } 646 #else 647 for( d = 0; d < ndisp; d++ ) 648 { 649 int diff = std::abs(lval - rptr[d]); 650 cbuf[d] = (uchar)diff; 651 hsad[d] = (int)(hsad[d] + diff); 652 } 653 #endif 654 htext[y] += tab[lval]; 655 } 656 } 657 658 // initialize the left and right borders of the disparity map 659 for( y = 0; y < height; y++ ) 660 { 661 for( x = 0; x < lofs; x++ ) 662 dptr[y*dstep + x] = FILTERED; 663 for( x = lofs + width1; x < width; x++ ) 664 dptr[y*dstep + x] = FILTERED; 665 } 666 dptr += lofs; 667 668 for( x = 0; x < width1; x++, dptr++ ) 669 { 670 int* costptr = cost.data ? cost.ptr<int>() + lofs + x : &costbuf; 671 int x0 = x - wsz2 - 1, x1 = x + wsz2; 672 const uchar* cbuf_sub = cbuf0 + ((x0 + wsz2 + 1) % (wsz + 1))*cstep - dy0*ndisp; 673 cbuf = cbuf0 + ((x1 + wsz2 + 1) % (wsz + 1))*cstep - dy0*ndisp; 674 hsad = hsad0 - dy0*ndisp; 675 lptr_sub = lptr0 + MIN(MAX(x0, -lofs), width-1-lofs) - dy0*sstep; 676 lptr = lptr0 + MIN(MAX(x1, -lofs), width-1-lofs) - dy0*sstep; 677 rptr = rptr0 + MIN(MAX(x1, -rofs), width-1-rofs) - dy0*sstep; 678 679 for( y = -dy0; y < height + dy1; y++, cbuf += ndisp, cbuf_sub += ndisp, 680 hsad += ndisp, lptr += sstep, lptr_sub += sstep, rptr += sstep ) 681 { 682 int lval = lptr[0]; 683 #if CV_NEON 684 int16x8_t lv = vdupq_n_s16 ((int16_t)lval); 685 for( d = 0; d < ndisp; d += 8 ) 686 { 687 int16x8_t rv = vreinterpretq_s16_u16 (vmovl_u8 (vld1_u8 (rptr + d))); 688 int32x4_t hsad_l = vld1q_s32 (hsad + d); 689 int32x4_t hsad_h = vld1q_s32 (hsad + d + 4); 690 int16x8_t cbs = vreinterpretq_s16_u16 (vmovl_u8 (vld1_u8 (cbuf_sub + d))); 691 int16x8_t diff = vabdq_s16 (lv, rv); 692 int32x4_t diff_h = vsubl_s16 (vget_high_s16 (diff), vget_high_s16 (cbs)); 693 int32x4_t diff_l = vsubl_s16 (vget_low_s16 (diff), vget_low_s16 (cbs)); 694 vst1_u8 (cbuf + d, vmovn_u16(vreinterpretq_u16_s16(diff))); 695 hsad_h = vaddq_s32 (hsad_h, diff_h); 696 hsad_l = vaddq_s32 (hsad_l, diff_l); 697 vst1q_s32 ((hsad + d), hsad_l); 698 vst1q_s32 ((hsad + d + 4), hsad_h); 699 } 700 #else 701 for( d = 0; d < ndisp; d++ ) 702 { 703 int diff = std::abs(lval - rptr[d]); 704 cbuf[d] = (uchar)diff; 705 hsad[d] = hsad[d] + diff - cbuf_sub[d]; 706 } 707 #endif 708 htext[y] += tab[lval] - tab[lptr_sub[0]]; 709 } 710 711 // fill borders 712 for( y = dy1; y <= wsz2; y++ ) 713 htext[height+y] = htext[height+dy1-1]; 714 for( y = -wsz2-1; y < -dy0; y++ ) 715 htext[y] = htext[-dy0]; 716 717 // initialize sums 718 for( d = 0; d < ndisp; d++ ) 719 sad[d] = (int)(hsad0[d-ndisp*dy0]*(wsz2 + 2 - dy0)); 720 721 hsad = hsad0 + (1 - dy0)*ndisp; 722 for( y = 1 - dy0; y < wsz2; y++, hsad += ndisp ) 723 { 724 #if CV_NEON 725 for( d = 0; d <= ndisp-8; d += 8 ) 726 { 727 int32x4_t s0 = vld1q_s32 (sad + d); 728 int32x4_t s1 = vld1q_s32 (sad + d + 4); 729 int32x4_t t0 = vld1q_s32 (hsad + d); 730 int32x4_t t1 = vld1q_s32 (hsad + d + 4); 731 s0 = vaddq_s32 (s0, t0); 732 s1 = vaddq_s32 (s1, t1); 733 vst1q_s32 (sad + d, s0); 734 vst1q_s32 (sad + d + 4, s1); 735 } 736 #else 737 for( d = 0; d < ndisp; d++ ) 738 sad[d] = (int)(sad[d] + hsad[d]); 739 #endif 740 } 741 int tsum = 0; 742 for( y = -wsz2-1; y < wsz2; y++ ) 743 tsum += htext[y]; 744 745 // finally, start the real processing 746 for( y = 0; y < height; y++ ) 747 { 748 int minsad = INT_MAX, mind = -1; 749 hsad = hsad0 + MIN(y + wsz2, height+dy1-1)*ndisp; 750 hsad_sub = hsad0 + MAX(y - wsz2 - 1, -dy0)*ndisp; 751 #if CV_NEON 752 int32x4_t minsad4 = vdupq_n_s32 (INT_MAX); 753 int32x4_t mind4 = vdupq_n_s32(0), d4 = d0_4; 754 755 for( d = 0; d <= ndisp-8; d += 8 ) 756 { 757 int32x4_t u0 = vld1q_s32 (hsad_sub + d); 758 int32x4_t u1 = vld1q_s32 (hsad + d); 759 760 int32x4_t v0 = vld1q_s32 (hsad_sub + d + 4); 761 int32x4_t v1 = vld1q_s32 (hsad + d + 4); 762 763 int32x4_t usad4 = vld1q_s32(sad + d); 764 int32x4_t vsad4 = vld1q_s32(sad + d + 4); 765 766 u1 = vsubq_s32 (u1, u0); 767 v1 = vsubq_s32 (v1, v0); 768 usad4 = vaddq_s32 (usad4, u1); 769 vsad4 = vaddq_s32 (vsad4, v1); 770 771 uint32x4_t mask = vcgtq_s32 (minsad4, usad4); 772 minsad4 = vminq_s32 (minsad4, usad4); 773 mind4 = vbslq_s32(mask, d4, mind4); 774 775 vst1q_s32 (sad + d, usad4); 776 vst1q_s32 (sad + d + 4, vsad4); 777 d4 = vaddq_s32 (d4, dd_4); 778 779 mask = vcgtq_s32 (minsad4, vsad4); 780 minsad4 = vminq_s32 (minsad4, vsad4); 781 mind4 = vbslq_s32(mask, d4, mind4); 782 783 d4 = vaddq_s32 (d4, dd_4); 784 785 } 786 int32x2_t mind4_h = vget_high_s32 (mind4); 787 int32x2_t mind4_l = vget_low_s32 (mind4); 788 int32x2_t minsad4_h = vget_high_s32 (minsad4); 789 int32x2_t minsad4_l = vget_low_s32 (minsad4); 790 791 uint32x2_t mask = vorr_u32 (vclt_s32 (minsad4_h, minsad4_l), vand_u32 (vceq_s32 (minsad4_h, minsad4_l), vclt_s32 (mind4_h, mind4_l))); 792 mind4_h = vbsl_s32 (mask, mind4_h, mind4_l); 793 minsad4_h = vbsl_s32 (mask, minsad4_h, minsad4_l); 794 795 mind4_l = vext_s32 (mind4_h,mind4_h,1); 796 minsad4_l = vext_s32 (minsad4_h,minsad4_h,1); 797 798 mask = vorr_u32 (vclt_s32 (minsad4_h, minsad4_l), vand_u32 (vceq_s32 (minsad4_h, minsad4_l), vclt_s32 (mind4_h, mind4_l))); 799 mind4_h = vbsl_s32 (mask, mind4_h, mind4_l); 800 minsad4_h = vbsl_s32 (mask, minsad4_h, minsad4_l); 801 802 mind = (int) vget_lane_s32 (mind4_h, 0); 803 minsad = sad[mind]; 804 805 #else 806 for( d = 0; d < ndisp; d++ ) 807 { 808 int currsad = sad[d] + hsad[d] - hsad_sub[d]; 809 sad[d] = currsad; 810 if( currsad < minsad ) 811 { 812 minsad = currsad; 813 mind = d; 814 } 815 } 816 #endif 817 818 tsum += htext[y + wsz2] - htext[y - wsz2 - 1]; 819 if( tsum < textureThreshold ) 820 { 821 dptr[y*dstep] = FILTERED; 822 continue; 823 } 824 825 if( uniquenessRatio > 0 ) 826 { 827 int thresh = minsad + (minsad * uniquenessRatio/100); 828 for( d = 0; d < ndisp; d++ ) 829 { 830 if( (d < mind-1 || d > mind+1) && sad[d] <= thresh) 831 break; 832 } 833 if( d < ndisp ) 834 { 835 dptr[y*dstep] = FILTERED; 836 continue; 837 } 838 } 839 840 { 841 sad[-1] = sad[1]; 842 sad[ndisp] = sad[ndisp-2]; 843 int p = sad[mind+1], n = sad[mind-1]; 844 d = p + n - 2*sad[mind] + std::abs(p - n); 845 dptr[y*dstep] = (short)(((ndisp - mind - 1 + mindisp)*256 + (d != 0 ? (p-n)*256/d : 0) + 15) >> 4); 846 costptr[y*coststep] = sad[mind]; 847 } 848 } 849 } 850 } 851 852 static bool ocl_prefiltering(InputArray left0, InputArray right0, OutputArray left, OutputArray right, StereoBMParams* state) 853 { 854 if( state->preFilterType == StereoBM::PREFILTER_NORMALIZED_RESPONSE ) 855 { 856 if(!ocl_prefilter_norm( left0, left, state->preFilterSize, state->preFilterCap)) 857 return false; 858 if(!ocl_prefilter_norm( right0, right, state->preFilterSize, state->preFilterCap)) 859 return false; 860 } 861 else 862 { 863 if(!ocl_prefilter_xsobel( left0, left, state->preFilterCap )) 864 return false; 865 if(!ocl_prefilter_xsobel( right0, right, state->preFilterCap)) 866 return false; 867 } 868 return true; 869 } 870 871 struct PrefilterInvoker : public ParallelLoopBody 872 { 873 PrefilterInvoker(const Mat& left0, const Mat& right0, Mat& left, Mat& right, 874 uchar* buf0, uchar* buf1, StereoBMParams* _state) 875 { 876 imgs0[0] = &left0; imgs0[1] = &right0; 877 imgs[0] = &left; imgs[1] = &right; 878 buf[0] = buf0; buf[1] = buf1; 879 state = _state; 880 } 881 882 void operator()( const Range& range ) const 883 { 884 for( int i = range.start; i < range.end; i++ ) 885 { 886 if( state->preFilterType == StereoBM::PREFILTER_NORMALIZED_RESPONSE ) 887 prefilterNorm( *imgs0[i], *imgs[i], state->preFilterSize, state->preFilterCap, buf[i] ); 888 else 889 prefilterXSobel( *imgs0[i], *imgs[i], state->preFilterCap ); 890 } 891 } 892 893 const Mat* imgs0[2]; 894 Mat* imgs[2]; 895 uchar* buf[2]; 896 StereoBMParams* state; 897 }; 898 899 static bool ocl_stereobm( InputArray _left, InputArray _right, 900 OutputArray _disp, StereoBMParams* state) 901 { 902 int ndisp = state->numDisparities; 903 int mindisp = state->minDisparity; 904 int wsz = state->SADWindowSize; 905 int wsz2 = wsz/2; 906 907 ocl::Device devDef = ocl::Device::getDefault(); 908 int sizeX = devDef.isIntel() ? 32 : std::max(11, 27 - devDef.maxComputeUnits()), 909 sizeY = sizeX - 1, 910 N = ndisp * 2; 911 912 cv::String opt = cv::format("-D DEFINE_KERNEL_STEREOBM -D MIN_DISP=%d -D NUM_DISP=%d" 913 " -D BLOCK_SIZE_X=%d -D BLOCK_SIZE_Y=%d -D WSZ=%d", 914 mindisp, ndisp, 915 sizeX, sizeY, wsz); 916 ocl::Kernel k("stereoBM", ocl::calib3d::stereobm_oclsrc, opt); 917 if(k.empty()) 918 return false; 919 920 UMat left = _left.getUMat(), right = _right.getUMat(); 921 int cols = left.cols, rows = left.rows; 922 923 _disp.create(_left.size(), CV_16S); 924 _disp.setTo((mindisp - 1) << 4); 925 Rect roi = Rect(Point(wsz2 + mindisp + ndisp - 1, wsz2), Point(cols-wsz2-mindisp, rows-wsz2) ); 926 UMat disp = (_disp.getUMat())(roi); 927 928 int globalX = (disp.cols + sizeX - 1) / sizeX, 929 globalY = (disp.rows + sizeY - 1) / sizeY; 930 size_t globalThreads[3] = {N, globalX, globalY}; 931 size_t localThreads[3] = {N, 1, 1}; 932 933 int idx = 0; 934 idx = k.set(idx, ocl::KernelArg::PtrReadOnly(left)); 935 idx = k.set(idx, ocl::KernelArg::PtrReadOnly(right)); 936 idx = k.set(idx, ocl::KernelArg::WriteOnlyNoSize(disp)); 937 idx = k.set(idx, rows); 938 idx = k.set(idx, cols); 939 idx = k.set(idx, state->textureThreshold); 940 idx = k.set(idx, state->uniquenessRatio); 941 return k.run(3, globalThreads, localThreads, false); 942 } 943 944 struct FindStereoCorrespInvoker : public ParallelLoopBody 945 { 946 FindStereoCorrespInvoker( const Mat& _left, const Mat& _right, 947 Mat& _disp, StereoBMParams* _state, 948 int _nstripes, size_t _stripeBufSize, 949 bool _useShorts, Rect _validDisparityRect, 950 Mat& _slidingSumBuf, Mat& _cost ) 951 { 952 left = &_left; right = &_right; 953 disp = &_disp; state = _state; 954 nstripes = _nstripes; stripeBufSize = _stripeBufSize; 955 useShorts = _useShorts; 956 validDisparityRect = _validDisparityRect; 957 slidingSumBuf = &_slidingSumBuf; 958 cost = &_cost; 959 } 960 961 void operator()( const Range& range ) const 962 { 963 int cols = left->cols, rows = left->rows; 964 int _row0 = std::min(cvRound(range.start * rows / nstripes), rows); 965 int _row1 = std::min(cvRound(range.end * rows / nstripes), rows); 966 uchar *ptr = slidingSumBuf->ptr() + range.start * stripeBufSize; 967 int FILTERED = (state->minDisparity - 1)*16; 968 969 Rect roi = validDisparityRect & Rect(0, _row0, cols, _row1 - _row0); 970 if( roi.height == 0 ) 971 return; 972 int row0 = roi.y; 973 int row1 = roi.y + roi.height; 974 975 Mat part; 976 if( row0 > _row0 ) 977 { 978 part = disp->rowRange(_row0, row0); 979 part = Scalar::all(FILTERED); 980 } 981 if( _row1 > row1 ) 982 { 983 part = disp->rowRange(row1, _row1); 984 part = Scalar::all(FILTERED); 985 } 986 987 Mat left_i = left->rowRange(row0, row1); 988 Mat right_i = right->rowRange(row0, row1); 989 Mat disp_i = disp->rowRange(row0, row1); 990 Mat cost_i = state->disp12MaxDiff >= 0 ? cost->rowRange(row0, row1) : Mat(); 991 992 #if CV_SSE2 993 if( useShorts ) 994 findStereoCorrespondenceBM_SSE2( left_i, right_i, disp_i, cost_i, *state, ptr, row0, rows - row1 ); 995 else 996 #endif 997 findStereoCorrespondenceBM( left_i, right_i, disp_i, cost_i, *state, ptr, row0, rows - row1 ); 998 999 if( state->disp12MaxDiff >= 0 ) 1000 validateDisparity( disp_i, cost_i, state->minDisparity, state->numDisparities, state->disp12MaxDiff ); 1001 1002 if( roi.x > 0 ) 1003 { 1004 part = disp_i.colRange(0, roi.x); 1005 part = Scalar::all(FILTERED); 1006 } 1007 if( roi.x + roi.width < cols ) 1008 { 1009 part = disp_i.colRange(roi.x + roi.width, cols); 1010 part = Scalar::all(FILTERED); 1011 } 1012 } 1013 1014 protected: 1015 const Mat *left, *right; 1016 Mat* disp, *slidingSumBuf, *cost; 1017 StereoBMParams *state; 1018 1019 int nstripes; 1020 size_t stripeBufSize; 1021 bool useShorts; 1022 Rect validDisparityRect; 1023 }; 1024 1025 class StereoBMImpl : public StereoBM 1026 { 1027 public: 1028 StereoBMImpl() 1029 { 1030 params = StereoBMParams(); 1031 } 1032 1033 StereoBMImpl( int _numDisparities, int _SADWindowSize ) 1034 { 1035 params = StereoBMParams(_numDisparities, _SADWindowSize); 1036 } 1037 1038 void compute( InputArray leftarr, InputArray rightarr, OutputArray disparr ) 1039 { 1040 int dtype = disparr.fixedType() ? disparr.type() : params.dispType; 1041 Size leftsize = leftarr.size(); 1042 1043 if (leftarr.size() != rightarr.size()) 1044 CV_Error( Error::StsUnmatchedSizes, "All the images must have the same size" ); 1045 1046 if (leftarr.type() != CV_8UC1 || rightarr.type() != CV_8UC1) 1047 CV_Error( Error::StsUnsupportedFormat, "Both input images must have CV_8UC1" ); 1048 1049 if (dtype != CV_16SC1 && dtype != CV_32FC1) 1050 CV_Error( Error::StsUnsupportedFormat, "Disparity image must have CV_16SC1 or CV_32FC1 format" ); 1051 1052 if( params.preFilterType != PREFILTER_NORMALIZED_RESPONSE && 1053 params.preFilterType != PREFILTER_XSOBEL ) 1054 CV_Error( Error::StsOutOfRange, "preFilterType must be = CV_STEREO_BM_NORMALIZED_RESPONSE" ); 1055 1056 if( params.preFilterSize < 5 || params.preFilterSize > 255 || params.preFilterSize % 2 == 0 ) 1057 CV_Error( Error::StsOutOfRange, "preFilterSize must be odd and be within 5..255" ); 1058 1059 if( params.preFilterCap < 1 || params.preFilterCap > 63 ) 1060 CV_Error( Error::StsOutOfRange, "preFilterCap must be within 1..63" ); 1061 1062 if( params.SADWindowSize < 5 || params.SADWindowSize > 255 || params.SADWindowSize % 2 == 0 || 1063 params.SADWindowSize >= std::min(leftsize.width, leftsize.height) ) 1064 CV_Error( Error::StsOutOfRange, "SADWindowSize must be odd, be within 5..255 and be not larger than image width or height" ); 1065 1066 if( params.numDisparities <= 0 || params.numDisparities % 16 != 0 ) 1067 CV_Error( Error::StsOutOfRange, "numDisparities must be positive and divisble by 16" ); 1068 1069 if( params.textureThreshold < 0 ) 1070 CV_Error( Error::StsOutOfRange, "texture threshold must be non-negative" ); 1071 1072 if( params.uniquenessRatio < 0 ) 1073 CV_Error( Error::StsOutOfRange, "uniqueness ratio must be non-negative" ); 1074 1075 int FILTERED = (params.minDisparity - 1) << DISPARITY_SHIFT; 1076 1077 if(ocl::useOpenCL() && disparr.isUMat() && params.textureThreshold == 0) 1078 { 1079 UMat left, right; 1080 if(ocl_prefiltering(leftarr, rightarr, left, right, ¶ms)) 1081 { 1082 if(ocl_stereobm(left, right, disparr, ¶ms)) 1083 { 1084 if( params.speckleRange >= 0 && params.speckleWindowSize > 0 ) 1085 filterSpeckles(disparr.getMat(), FILTERED, params.speckleWindowSize, params.speckleRange, slidingSumBuf); 1086 if (dtype == CV_32F) 1087 disparr.getUMat().convertTo(disparr, CV_32FC1, 1./(1 << DISPARITY_SHIFT), 0); 1088 CV_IMPL_ADD(CV_IMPL_OCL); 1089 return; 1090 } 1091 } 1092 } 1093 1094 Mat left0 = leftarr.getMat(), right0 = rightarr.getMat(); 1095 disparr.create(left0.size(), dtype); 1096 Mat disp0 = disparr.getMat(); 1097 1098 preFilteredImg0.create( left0.size(), CV_8U ); 1099 preFilteredImg1.create( left0.size(), CV_8U ); 1100 cost.create( left0.size(), CV_16S ); 1101 1102 Mat left = preFilteredImg0, right = preFilteredImg1; 1103 1104 int mindisp = params.minDisparity; 1105 int ndisp = params.numDisparities; 1106 1107 int width = left0.cols; 1108 int height = left0.rows; 1109 int lofs = std::max(ndisp - 1 + mindisp, 0); 1110 int rofs = -std::min(ndisp - 1 + mindisp, 0); 1111 int width1 = width - rofs - ndisp + 1; 1112 1113 if( lofs >= width || rofs >= width || width1 < 1 ) 1114 { 1115 disp0 = Scalar::all( FILTERED * ( disp0.type() < CV_32F ? 1 : 1./(1 << DISPARITY_SHIFT) ) ); 1116 return; 1117 } 1118 1119 Mat disp = disp0; 1120 if( dtype == CV_32F ) 1121 { 1122 dispbuf.create(disp0.size(), CV_16S); 1123 disp = dispbuf; 1124 } 1125 1126 int wsz = params.SADWindowSize; 1127 int bufSize0 = (int)((ndisp + 2)*sizeof(int)); 1128 bufSize0 += (int)((height+wsz+2)*ndisp*sizeof(int)); 1129 bufSize0 += (int)((height + wsz + 2)*sizeof(int)); 1130 bufSize0 += (int)((height+wsz+2)*ndisp*(wsz+2)*sizeof(uchar) + 256); 1131 1132 int bufSize1 = (int)((width + params.preFilterSize + 2) * sizeof(int) + 256); 1133 int bufSize2 = 0; 1134 if( params.speckleRange >= 0 && params.speckleWindowSize > 0 ) 1135 bufSize2 = width*height*(sizeof(Point_<short>) + sizeof(int) + sizeof(uchar)); 1136 1137 #if CV_SSE2 1138 bool useShorts = params.preFilterCap <= 31 && params.SADWindowSize <= 21 && checkHardwareSupport(CV_CPU_SSE2); 1139 #else 1140 const bool useShorts = false; 1141 #endif 1142 1143 const double SAD_overhead_coeff = 10.0; 1144 double N0 = 8000000 / (useShorts ? 1 : 4); // approx tbb's min number instructions reasonable for one thread 1145 double maxStripeSize = std::min(std::max(N0 / (width * ndisp), (wsz-1) * SAD_overhead_coeff), (double)height); 1146 int nstripes = cvCeil(height / maxStripeSize); 1147 int bufSize = std::max(bufSize0 * nstripes, std::max(bufSize1 * 2, bufSize2)); 1148 1149 if( slidingSumBuf.cols < bufSize ) 1150 slidingSumBuf.create( 1, bufSize, CV_8U ); 1151 1152 uchar *_buf = slidingSumBuf.ptr(); 1153 1154 parallel_for_(Range(0, 2), PrefilterInvoker(left0, right0, left, right, _buf, _buf + bufSize1, ¶ms), 1); 1155 1156 Rect validDisparityRect(0, 0, width, height), R1 = params.roi1, R2 = params.roi2; 1157 validDisparityRect = getValidDisparityROI(R1.area() > 0 ? Rect(0, 0, width, height) : validDisparityRect, 1158 R2.area() > 0 ? Rect(0, 0, width, height) : validDisparityRect, 1159 params.minDisparity, params.numDisparities, 1160 params.SADWindowSize); 1161 1162 parallel_for_(Range(0, nstripes), 1163 FindStereoCorrespInvoker(left, right, disp, ¶ms, nstripes, 1164 bufSize0, useShorts, validDisparityRect, 1165 slidingSumBuf, cost)); 1166 1167 if( params.speckleRange >= 0 && params.speckleWindowSize > 0 ) 1168 filterSpeckles(disp, FILTERED, params.speckleWindowSize, params.speckleRange, slidingSumBuf); 1169 1170 if (disp0.data != disp.data) 1171 disp.convertTo(disp0, disp0.type(), 1./(1 << DISPARITY_SHIFT), 0); 1172 } 1173 1174 int getMinDisparity() const { return params.minDisparity; } 1175 void setMinDisparity(int minDisparity) { params.minDisparity = minDisparity; } 1176 1177 int getNumDisparities() const { return params.numDisparities; } 1178 void setNumDisparities(int numDisparities) { params.numDisparities = numDisparities; } 1179 1180 int getBlockSize() const { return params.SADWindowSize; } 1181 void setBlockSize(int blockSize) { params.SADWindowSize = blockSize; } 1182 1183 int getSpeckleWindowSize() const { return params.speckleWindowSize; } 1184 void setSpeckleWindowSize(int speckleWindowSize) { params.speckleWindowSize = speckleWindowSize; } 1185 1186 int getSpeckleRange() const { return params.speckleRange; } 1187 void setSpeckleRange(int speckleRange) { params.speckleRange = speckleRange; } 1188 1189 int getDisp12MaxDiff() const { return params.disp12MaxDiff; } 1190 void setDisp12MaxDiff(int disp12MaxDiff) { params.disp12MaxDiff = disp12MaxDiff; } 1191 1192 int getPreFilterType() const { return params.preFilterType; } 1193 void setPreFilterType(int preFilterType) { params.preFilterType = preFilterType; } 1194 1195 int getPreFilterSize() const { return params.preFilterSize; } 1196 void setPreFilterSize(int preFilterSize) { params.preFilterSize = preFilterSize; } 1197 1198 int getPreFilterCap() const { return params.preFilterCap; } 1199 void setPreFilterCap(int preFilterCap) { params.preFilterCap = preFilterCap; } 1200 1201 int getTextureThreshold() const { return params.textureThreshold; } 1202 void setTextureThreshold(int textureThreshold) { params.textureThreshold = textureThreshold; } 1203 1204 int getUniquenessRatio() const { return params.uniquenessRatio; } 1205 void setUniquenessRatio(int uniquenessRatio) { params.uniquenessRatio = uniquenessRatio; } 1206 1207 int getSmallerBlockSize() const { return 0; } 1208 void setSmallerBlockSize(int) {} 1209 1210 Rect getROI1() const { return params.roi1; } 1211 void setROI1(Rect roi1) { params.roi1 = roi1; } 1212 1213 Rect getROI2() const { return params.roi2; } 1214 void setROI2(Rect roi2) { params.roi2 = roi2; } 1215 1216 void write(FileStorage& fs) const 1217 { 1218 fs << "name" << name_ 1219 << "minDisparity" << params.minDisparity 1220 << "numDisparities" << params.numDisparities 1221 << "blockSize" << params.SADWindowSize 1222 << "speckleWindowSize" << params.speckleWindowSize 1223 << "speckleRange" << params.speckleRange 1224 << "disp12MaxDiff" << params.disp12MaxDiff 1225 << "preFilterType" << params.preFilterType 1226 << "preFilterSize" << params.preFilterSize 1227 << "preFilterCap" << params.preFilterCap 1228 << "textureThreshold" << params.textureThreshold 1229 << "uniquenessRatio" << params.uniquenessRatio; 1230 } 1231 1232 void read(const FileNode& fn) 1233 { 1234 FileNode n = fn["name"]; 1235 CV_Assert( n.isString() && String(n) == name_ ); 1236 params.minDisparity = (int)fn["minDisparity"]; 1237 params.numDisparities = (int)fn["numDisparities"]; 1238 params.SADWindowSize = (int)fn["blockSize"]; 1239 params.speckleWindowSize = (int)fn["speckleWindowSize"]; 1240 params.speckleRange = (int)fn["speckleRange"]; 1241 params.disp12MaxDiff = (int)fn["disp12MaxDiff"]; 1242 params.preFilterType = (int)fn["preFilterType"]; 1243 params.preFilterSize = (int)fn["preFilterSize"]; 1244 params.preFilterCap = (int)fn["preFilterCap"]; 1245 params.textureThreshold = (int)fn["textureThreshold"]; 1246 params.uniquenessRatio = (int)fn["uniquenessRatio"]; 1247 params.roi1 = params.roi2 = Rect(); 1248 } 1249 1250 StereoBMParams params; 1251 Mat preFilteredImg0, preFilteredImg1, cost, dispbuf; 1252 Mat slidingSumBuf; 1253 1254 static const char* name_; 1255 }; 1256 1257 const char* StereoBMImpl::name_ = "StereoMatcher.BM"; 1258 1259 Ptr<StereoBM> StereoBM::create(int _numDisparities, int _SADWindowSize) 1260 { 1261 return makePtr<StereoBMImpl>(_numDisparities, _SADWindowSize); 1262 } 1263 1264 } 1265 1266 /* End of file. */ 1267