Home | History | Annotate | Download | only in src
      1 //M*//////////////////////////////////////////////////////////////////////////////////////
      2 //
      3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
      4 //
      5 //  By downloading, copying, installing or using the software you agree to this license.
      6 //  If you do not agree to this license, do not download, install,
      7 //  copy or use the software.
      8 //
      9 //
     10 //                          License Agreement
     11 //                For Open Source Computer Vision Library
     12 //
     13 // Copyright (C) 2000, Intel Corporation, all rights reserved.
     14 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
     15 // Third party copyrights are property of their respective owners.
     16 //
     17 // Redistribution and use in source and binary forms, with or without modification,
     18 // are permitted provided that the following conditions are met:
     19 //
     20 //   * Redistribution's of source code must retain the above copyright notice,
     21 //     this list of conditions and the following disclaimer.
     22 //
     23 //   * Redistribution's in binary form must reproduce the above copyright notice,
     24 //     this list of conditions and the following disclaimer in the documentation
     25 //     and/or other materials provided with the distribution.
     26 //
     27 //   * The name of the copyright holders may not be used to endorse or promote products
     28 //     derived from this software without specific prior written permission.
     29 //
     30 // This software is provided by the copyright holders and contributors "as is" and
     31 // any express or implied warranties, including, but not limited to, the implied
     32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
     33 // In no event shall the Intel Corporation or contributors be liable for any direct,
     34 // indirect, incidental, special, exemplary, or consequential damages
     35 // (including, but not limited to, procurement of substitute goods or services;
     36 // loss of use, data, or profits; or business interruption) however caused
     37 // and on any theory of liability, whether in contract, strict liability,
     38 // or tort (including negligence or otherwise) arising in any way out of
     39 // the use of this software, even if advised of the possibility of such damage.
     40 //
     41 //M*/
     42 
     43 /****************************************************************************************\
     44 *    Very fast SAD-based (Sum-of-Absolute-Diffrences) stereo correspondence algorithm.   *
     45 *    Contributed by Kurt Konolige                                                        *
     46 \****************************************************************************************/
     47 
     48 #include "precomp.hpp"
     49 #include <stdio.h>
     50 #include <limits>
     51 #include "opencl_kernels_calib3d.hpp"
     52 
     53 namespace cv
     54 {
     55 
     56 struct StereoBMParams
     57 {
     58     StereoBMParams(int _numDisparities=64, int _SADWindowSize=21)
     59     {
     60         preFilterType = StereoBM::PREFILTER_XSOBEL;
     61         preFilterSize = 9;
     62         preFilterCap = 31;
     63         SADWindowSize = _SADWindowSize;
     64         minDisparity = 0;
     65         numDisparities = _numDisparities > 0 ? _numDisparities : 64;
     66         textureThreshold = 10;
     67         uniquenessRatio = 15;
     68         speckleRange = speckleWindowSize = 0;
     69         roi1 = roi2 = Rect(0,0,0,0);
     70         disp12MaxDiff = -1;
     71         dispType = CV_16S;
     72     }
     73 
     74     int preFilterType;
     75     int preFilterSize;
     76     int preFilterCap;
     77     int SADWindowSize;
     78     int minDisparity;
     79     int numDisparities;
     80     int textureThreshold;
     81     int uniquenessRatio;
     82     int speckleRange;
     83     int speckleWindowSize;
     84     Rect roi1, roi2;
     85     int disp12MaxDiff;
     86     int dispType;
     87 };
     88 
     89 static bool ocl_prefilter_norm(InputArray _input, OutputArray _output, int winsize, int prefilterCap)
     90 {
     91     ocl::Kernel k("prefilter_norm", ocl::calib3d::stereobm_oclsrc, cv::format("-D WSZ=%d", winsize));
     92     if(k.empty())
     93         return false;
     94 
     95     int scale_g = winsize*winsize/8, scale_s = (1024 + scale_g)/(scale_g*2);
     96     scale_g *= scale_s;
     97 
     98     UMat input = _input.getUMat(), output;
     99     _output.create(input.size(), input.type());
    100     output = _output.getUMat();
    101 
    102     size_t globalThreads[3] = { input.cols, input.rows, 1 };
    103 
    104     k.args(ocl::KernelArg::PtrReadOnly(input), ocl::KernelArg::PtrWriteOnly(output), input.rows, input.cols,
    105         prefilterCap, scale_g, scale_s);
    106 
    107     return k.run(2, globalThreads, NULL, false);
    108 }
    109 
    110 static void prefilterNorm( const Mat& src, Mat& dst, int winsize, int ftzero, uchar* buf )
    111 {
    112     int x, y, wsz2 = winsize/2;
    113     int* vsum = (int*)alignPtr(buf + (wsz2 + 1)*sizeof(vsum[0]), 32);
    114     int scale_g = winsize*winsize/8, scale_s = (1024 + scale_g)/(scale_g*2);
    115     const int OFS = 256*5, TABSZ = OFS*2 + 256;
    116     uchar tab[TABSZ];
    117     const uchar* sptr = src.ptr();
    118     int srcstep = (int)src.step;
    119     Size size = src.size();
    120 
    121     scale_g *= scale_s;
    122 
    123     for( x = 0; x < TABSZ; x++ )
    124         tab[x] = (uchar)(x - OFS < -ftzero ? 0 : x - OFS > ftzero ? ftzero*2 : x - OFS + ftzero);
    125 
    126     for( x = 0; x < size.width; x++ )
    127         vsum[x] = (ushort)(sptr[x]*(wsz2 + 2));
    128 
    129     for( y = 1; y < wsz2; y++ )
    130     {
    131         for( x = 0; x < size.width; x++ )
    132             vsum[x] = (ushort)(vsum[x] + sptr[srcstep*y + x]);
    133     }
    134 
    135     for( y = 0; y < size.height; y++ )
    136     {
    137         const uchar* top = sptr + srcstep*MAX(y-wsz2-1,0);
    138         const uchar* bottom = sptr + srcstep*MIN(y+wsz2,size.height-1);
    139         const uchar* prev = sptr + srcstep*MAX(y-1,0);
    140         const uchar* curr = sptr + srcstep*y;
    141         const uchar* next = sptr + srcstep*MIN(y+1,size.height-1);
    142         uchar* dptr = dst.ptr<uchar>(y);
    143 
    144         for( x = 0; x < size.width; x++ )
    145             vsum[x] = (ushort)(vsum[x] + bottom[x] - top[x]);
    146 
    147         for( x = 0; x <= wsz2; x++ )
    148         {
    149             vsum[-x-1] = vsum[0];
    150             vsum[size.width+x] = vsum[size.width-1];
    151         }
    152 
    153         int sum = vsum[0]*(wsz2 + 1);
    154         for( x = 1; x <= wsz2; x++ )
    155             sum += vsum[x];
    156 
    157         int val = ((curr[0]*5 + curr[1] + prev[0] + next[0])*scale_g - sum*scale_s) >> 10;
    158         dptr[0] = tab[val + OFS];
    159 
    160         for( x = 1; x < size.width-1; x++ )
    161         {
    162             sum += vsum[x+wsz2] - vsum[x-wsz2-1];
    163             val = ((curr[x]*4 + curr[x-1] + curr[x+1] + prev[x] + next[x])*scale_g - sum*scale_s) >> 10;
    164             dptr[x] = tab[val + OFS];
    165         }
    166 
    167         sum += vsum[x+wsz2] - vsum[x-wsz2-1];
    168         val = ((curr[x]*5 + curr[x-1] + prev[x] + next[x])*scale_g - sum*scale_s) >> 10;
    169         dptr[x] = tab[val + OFS];
    170     }
    171 }
    172 
    173 static bool ocl_prefilter_xsobel(InputArray _input, OutputArray _output, int prefilterCap)
    174 {
    175     ocl::Kernel k("prefilter_xsobel", ocl::calib3d::stereobm_oclsrc);
    176     if(k.empty())
    177         return false;
    178 
    179     UMat input = _input.getUMat(), output;
    180     _output.create(input.size(), input.type());
    181     output = _output.getUMat();
    182 
    183     size_t globalThreads[3] = { input.cols, input.rows, 1 };
    184 
    185     k.args(ocl::KernelArg::PtrReadOnly(input), ocl::KernelArg::PtrWriteOnly(output), input.rows, input.cols, prefilterCap);
    186 
    187     return k.run(2, globalThreads, NULL, false);
    188 }
    189 
    190 static void
    191 prefilterXSobel( const Mat& src, Mat& dst, int ftzero )
    192 {
    193     int x, y;
    194     const int OFS = 256*4, TABSZ = OFS*2 + 256;
    195     uchar tab[TABSZ];
    196     Size size = src.size();
    197 
    198     for( x = 0; x < TABSZ; x++ )
    199         tab[x] = (uchar)(x - OFS < -ftzero ? 0 : x - OFS > ftzero ? ftzero*2 : x - OFS + ftzero);
    200     uchar val0 = tab[0 + OFS];
    201 
    202 #if CV_SSE2
    203     volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE2);
    204 #endif
    205 
    206     for( y = 0; y < size.height-1; y += 2 )
    207     {
    208         const uchar* srow1 = src.ptr<uchar>(y);
    209         const uchar* srow0 = y > 0 ? srow1 - src.step : size.height > 1 ? srow1 + src.step : srow1;
    210         const uchar* srow2 = y < size.height-1 ? srow1 + src.step : size.height > 1 ? srow1 - src.step : srow1;
    211         const uchar* srow3 = y < size.height-2 ? srow1 + src.step*2 : srow1;
    212         uchar* dptr0 = dst.ptr<uchar>(y);
    213         uchar* dptr1 = dptr0 + dst.step;
    214 
    215         dptr0[0] = dptr0[size.width-1] = dptr1[0] = dptr1[size.width-1] = val0;
    216         x = 1;
    217 
    218 #if CV_NEON
    219         int16x8_t ftz = vdupq_n_s16 ((short) ftzero);
    220         uint8x8_t ftz2 = vdup_n_u8 (cv::saturate_cast<uchar>(ftzero*2));
    221 
    222         for(; x <=size.width-9; x += 8 )
    223         {
    224             uint8x8_t c0 = vld1_u8 (srow0 + x - 1);
    225             uint8x8_t c1 = vld1_u8 (srow1 + x - 1);
    226             uint8x8_t d0 = vld1_u8 (srow0 + x + 1);
    227             uint8x8_t d1 = vld1_u8 (srow1 + x + 1);
    228 
    229             int16x8_t t0 = vreinterpretq_s16_u16 (vsubl_u8 (d0, c0));
    230             int16x8_t t1 = vreinterpretq_s16_u16 (vsubl_u8 (d1, c1));
    231 
    232             uint8x8_t c2 = vld1_u8 (srow2 + x - 1);
    233             uint8x8_t c3 = vld1_u8 (srow3 + x - 1);
    234             uint8x8_t d2 = vld1_u8 (srow2 + x + 1);
    235             uint8x8_t d3 = vld1_u8 (srow3 + x + 1);
    236 
    237             int16x8_t t2 = vreinterpretq_s16_u16 (vsubl_u8 (d2, c2));
    238             int16x8_t t3 = vreinterpretq_s16_u16 (vsubl_u8 (d3, c3));
    239 
    240             int16x8_t v0 = vaddq_s16 (vaddq_s16 (t2, t0), vaddq_s16 (t1, t1));
    241             int16x8_t v1 = vaddq_s16 (vaddq_s16 (t3, t1), vaddq_s16 (t2, t2));
    242 
    243 
    244             uint8x8_t v0_u8 = vqmovun_s16 (vaddq_s16 (v0, ftz));
    245             uint8x8_t v1_u8 = vqmovun_s16 (vaddq_s16 (v1, ftz));
    246             v0_u8 =  vmin_u8 (v0_u8, ftz2);
    247             v1_u8 =  vmin_u8 (v1_u8, ftz2);
    248             vqmovun_s16 (vaddq_s16 (v1, ftz));
    249 
    250             vst1_u8 (dptr0 + x, v0_u8);
    251             vst1_u8 (dptr1 + x, v1_u8);
    252         }
    253 #elif CV_SSE2
    254         if( useSIMD )
    255         {
    256             __m128i z = _mm_setzero_si128(), ftz = _mm_set1_epi16((short)ftzero),
    257             ftz2 = _mm_set1_epi8(cv::saturate_cast<uchar>(ftzero*2));
    258             for( ; x <= size.width-9; x += 8 )
    259             {
    260                 __m128i c0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow0 + x - 1)), z);
    261                 __m128i c1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow1 + x - 1)), z);
    262                 __m128i d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow0 + x + 1)), z);
    263                 __m128i d1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow1 + x + 1)), z);
    264 
    265                 d0 = _mm_sub_epi16(d0, c0);
    266                 d1 = _mm_sub_epi16(d1, c1);
    267 
    268                 __m128i c2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow2 + x - 1)), z);
    269                 __m128i c3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow3 + x - 1)), z);
    270                 __m128i d2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow2 + x + 1)), z);
    271                 __m128i d3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow3 + x + 1)), z);
    272 
    273                 d2 = _mm_sub_epi16(d2, c2);
    274                 d3 = _mm_sub_epi16(d3, c3);
    275 
    276                 __m128i v0 = _mm_add_epi16(d0, _mm_add_epi16(d2, _mm_add_epi16(d1, d1)));
    277                 __m128i v1 = _mm_add_epi16(d1, _mm_add_epi16(d3, _mm_add_epi16(d2, d2)));
    278                 v0 = _mm_packus_epi16(_mm_add_epi16(v0, ftz), _mm_add_epi16(v1, ftz));
    279                 v0 = _mm_min_epu8(v0, ftz2);
    280 
    281                 _mm_storel_epi64((__m128i*)(dptr0 + x), v0);
    282                 _mm_storel_epi64((__m128i*)(dptr1 + x), _mm_unpackhi_epi64(v0, v0));
    283             }
    284         }
    285 #endif
    286 
    287         for( ; x < size.width-1; x++ )
    288         {
    289             int d0 = srow0[x+1] - srow0[x-1], d1 = srow1[x+1] - srow1[x-1],
    290             d2 = srow2[x+1] - srow2[x-1], d3 = srow3[x+1] - srow3[x-1];
    291             int v0 = tab[d0 + d1*2 + d2 + OFS];
    292             int v1 = tab[d1 + d2*2 + d3 + OFS];
    293             dptr0[x] = (uchar)v0;
    294             dptr1[x] = (uchar)v1;
    295         }
    296     }
    297 
    298 #if CV_NEON
    299     uint8x16_t val0_16 = vdupq_n_u8 (val0);
    300 #endif
    301 
    302     for( ; y < size.height; y++ )
    303     {
    304         uchar* dptr = dst.ptr<uchar>(y);
    305         x = 0;
    306     #if CV_NEON
    307         for(; x <= size.width-16; x+=16 )
    308             vst1q_u8 (dptr + x, val0_16);
    309     #endif
    310         for(; x < size.width; x++ )
    311             dptr[x] = val0;
    312     }
    313 }
    314 
    315 
    316 static const int DISPARITY_SHIFT = 4;
    317 
    318 #if CV_SSE2
    319 static void findStereoCorrespondenceBM_SSE2( const Mat& left, const Mat& right,
    320                                             Mat& disp, Mat& cost, StereoBMParams& state,
    321                                             uchar* buf, int _dy0, int _dy1 )
    322 {
    323     const int ALIGN = 16;
    324     int x, y, d;
    325     int wsz = state.SADWindowSize, wsz2 = wsz/2;
    326     int dy0 = MIN(_dy0, wsz2+1), dy1 = MIN(_dy1, wsz2+1);
    327     int ndisp = state.numDisparities;
    328     int mindisp = state.minDisparity;
    329     int lofs = MAX(ndisp - 1 + mindisp, 0);
    330     int rofs = -MIN(ndisp - 1 + mindisp, 0);
    331     int width = left.cols, height = left.rows;
    332     int width1 = width - rofs - ndisp + 1;
    333     int ftzero = state.preFilterCap;
    334     int textureThreshold = state.textureThreshold;
    335     int uniquenessRatio = state.uniquenessRatio;
    336     short FILTERED = (short)((mindisp - 1) << DISPARITY_SHIFT);
    337 
    338     ushort *sad, *hsad0, *hsad, *hsad_sub;
    339     int *htext;
    340     uchar *cbuf0, *cbuf;
    341     const uchar* lptr0 = left.ptr() + lofs;
    342     const uchar* rptr0 = right.ptr() + rofs;
    343     const uchar *lptr, *lptr_sub, *rptr;
    344     short* dptr = disp.ptr<short>();
    345     int sstep = (int)left.step;
    346     int dstep = (int)(disp.step/sizeof(dptr[0]));
    347     int cstep = (height + dy0 + dy1)*ndisp;
    348     short costbuf = 0;
    349     int coststep = cost.data ? (int)(cost.step/sizeof(costbuf)) : 0;
    350     const int TABSZ = 256;
    351     uchar tab[TABSZ];
    352     const __m128i d0_8 = _mm_setr_epi16(0,1,2,3,4,5,6,7), dd_8 = _mm_set1_epi16(8);
    353 
    354     sad = (ushort*)alignPtr(buf + sizeof(sad[0]), ALIGN);
    355     hsad0 = (ushort*)alignPtr(sad + ndisp + 1 + dy0*ndisp, ALIGN);
    356     htext = (int*)alignPtr((int*)(hsad0 + (height+dy1)*ndisp) + wsz2 + 2, ALIGN);
    357     cbuf0 = (uchar*)alignPtr((uchar*)(htext + height + wsz2 + 2) + dy0*ndisp, ALIGN);
    358 
    359     for( x = 0; x < TABSZ; x++ )
    360         tab[x] = (uchar)std::abs(x - ftzero);
    361 
    362     // initialize buffers
    363     memset( hsad0 - dy0*ndisp, 0, (height + dy0 + dy1)*ndisp*sizeof(hsad0[0]) );
    364     memset( htext - wsz2 - 1, 0, (height + wsz + 1)*sizeof(htext[0]) );
    365 
    366     for( x = -wsz2-1; x < wsz2; x++ )
    367     {
    368         hsad = hsad0 - dy0*ndisp; cbuf = cbuf0 + (x + wsz2 + 1)*cstep - dy0*ndisp;
    369         lptr = lptr0 + MIN(MAX(x, -lofs), width-lofs-1) - dy0*sstep;
    370         rptr = rptr0 + MIN(MAX(x, -rofs), width-rofs-1) - dy0*sstep;
    371 
    372         for( y = -dy0; y < height + dy1; y++, hsad += ndisp, cbuf += ndisp, lptr += sstep, rptr += sstep )
    373         {
    374             int lval = lptr[0];
    375             __m128i lv = _mm_set1_epi8((char)lval), z = _mm_setzero_si128();
    376             for( d = 0; d < ndisp; d += 16 )
    377             {
    378                 __m128i rv = _mm_loadu_si128((const __m128i*)(rptr + d));
    379                 __m128i hsad_l = _mm_load_si128((__m128i*)(hsad + d));
    380                 __m128i hsad_h = _mm_load_si128((__m128i*)(hsad + d + 8));
    381                 __m128i diff = _mm_adds_epu8(_mm_subs_epu8(lv, rv), _mm_subs_epu8(rv, lv));
    382                 _mm_store_si128((__m128i*)(cbuf + d), diff);
    383                 hsad_l = _mm_add_epi16(hsad_l, _mm_unpacklo_epi8(diff,z));
    384                 hsad_h = _mm_add_epi16(hsad_h, _mm_unpackhi_epi8(diff,z));
    385                 _mm_store_si128((__m128i*)(hsad + d), hsad_l);
    386                 _mm_store_si128((__m128i*)(hsad + d + 8), hsad_h);
    387             }
    388             htext[y] += tab[lval];
    389         }
    390     }
    391 
    392     // initialize the left and right borders of the disparity map
    393     for( y = 0; y < height; y++ )
    394     {
    395         for( x = 0; x < lofs; x++ )
    396             dptr[y*dstep + x] = FILTERED;
    397         for( x = lofs + width1; x < width; x++ )
    398             dptr[y*dstep + x] = FILTERED;
    399     }
    400     dptr += lofs;
    401 
    402     for( x = 0; x < width1; x++, dptr++ )
    403     {
    404         short* costptr = cost.data ? cost.ptr<short>() + lofs + x : &costbuf;
    405         int x0 = x - wsz2 - 1, x1 = x + wsz2;
    406         const uchar* cbuf_sub = cbuf0 + ((x0 + wsz2 + 1) % (wsz + 1))*cstep - dy0*ndisp;
    407         cbuf = cbuf0 + ((x1 + wsz2 + 1) % (wsz + 1))*cstep - dy0*ndisp;
    408         hsad = hsad0 - dy0*ndisp;
    409         lptr_sub = lptr0 + MIN(MAX(x0, -lofs), width-1-lofs) - dy0*sstep;
    410         lptr = lptr0 + MIN(MAX(x1, -lofs), width-1-lofs) - dy0*sstep;
    411         rptr = rptr0 + MIN(MAX(x1, -rofs), width-1-rofs) - dy0*sstep;
    412 
    413         for( y = -dy0; y < height + dy1; y++, cbuf += ndisp, cbuf_sub += ndisp,
    414             hsad += ndisp, lptr += sstep, lptr_sub += sstep, rptr += sstep )
    415         {
    416             int lval = lptr[0];
    417             __m128i lv = _mm_set1_epi8((char)lval), z = _mm_setzero_si128();
    418             for( d = 0; d < ndisp; d += 16 )
    419             {
    420                 __m128i rv = _mm_loadu_si128((const __m128i*)(rptr + d));
    421                 __m128i hsad_l = _mm_load_si128((__m128i*)(hsad + d));
    422                 __m128i hsad_h = _mm_load_si128((__m128i*)(hsad + d + 8));
    423                 __m128i cbs = _mm_load_si128((const __m128i*)(cbuf_sub + d));
    424                 __m128i diff = _mm_adds_epu8(_mm_subs_epu8(lv, rv), _mm_subs_epu8(rv, lv));
    425                 __m128i diff_h = _mm_sub_epi16(_mm_unpackhi_epi8(diff, z), _mm_unpackhi_epi8(cbs, z));
    426                 _mm_store_si128((__m128i*)(cbuf + d), diff);
    427                 diff = _mm_sub_epi16(_mm_unpacklo_epi8(diff, z), _mm_unpacklo_epi8(cbs, z));
    428                 hsad_h = _mm_add_epi16(hsad_h, diff_h);
    429                 hsad_l = _mm_add_epi16(hsad_l, diff);
    430                 _mm_store_si128((__m128i*)(hsad + d), hsad_l);
    431                 _mm_store_si128((__m128i*)(hsad + d + 8), hsad_h);
    432             }
    433             htext[y] += tab[lval] - tab[lptr_sub[0]];
    434         }
    435 
    436         // fill borders
    437         for( y = dy1; y <= wsz2; y++ )
    438             htext[height+y] = htext[height+dy1-1];
    439         for( y = -wsz2-1; y < -dy0; y++ )
    440             htext[y] = htext[-dy0];
    441 
    442         // initialize sums
    443         for( d = 0; d < ndisp; d++ )
    444             sad[d] = (ushort)(hsad0[d-ndisp*dy0]*(wsz2 + 2 - dy0));
    445 
    446         hsad = hsad0 + (1 - dy0)*ndisp;
    447         for( y = 1 - dy0; y < wsz2; y++, hsad += ndisp )
    448             for( d = 0; d < ndisp; d += 16 )
    449             {
    450                 __m128i s0 = _mm_load_si128((__m128i*)(sad + d));
    451                 __m128i s1 = _mm_load_si128((__m128i*)(sad + d + 8));
    452                 __m128i t0 = _mm_load_si128((__m128i*)(hsad + d));
    453                 __m128i t1 = _mm_load_si128((__m128i*)(hsad + d + 8));
    454                 s0 = _mm_add_epi16(s0, t0);
    455                 s1 = _mm_add_epi16(s1, t1);
    456                 _mm_store_si128((__m128i*)(sad + d), s0);
    457                 _mm_store_si128((__m128i*)(sad + d + 8), s1);
    458             }
    459         int tsum = 0;
    460         for( y = -wsz2-1; y < wsz2; y++ )
    461             tsum += htext[y];
    462 
    463         // finally, start the real processing
    464         for( y = 0; y < height; y++ )
    465         {
    466             int minsad = INT_MAX, mind = -1;
    467             hsad = hsad0 + MIN(y + wsz2, height+dy1-1)*ndisp;
    468             hsad_sub = hsad0 + MAX(y - wsz2 - 1, -dy0)*ndisp;
    469             __m128i minsad8 = _mm_set1_epi16(SHRT_MAX);
    470             __m128i mind8 = _mm_set1_epi16(0), d8 = d0_8, mask;
    471 
    472             for( d = 0; d < ndisp; d += 16 )
    473             {
    474                 __m128i u0 = _mm_load_si128((__m128i*)(hsad_sub + d));
    475                 __m128i u1 = _mm_load_si128((__m128i*)(hsad + d));
    476 
    477                 __m128i v0 = _mm_load_si128((__m128i*)(hsad_sub + d + 8));
    478                 __m128i v1 = _mm_load_si128((__m128i*)(hsad + d + 8));
    479 
    480                 __m128i usad8 = _mm_load_si128((__m128i*)(sad + d));
    481                 __m128i vsad8 = _mm_load_si128((__m128i*)(sad + d + 8));
    482 
    483                 u1 = _mm_sub_epi16(u1, u0);
    484                 v1 = _mm_sub_epi16(v1, v0);
    485                 usad8 = _mm_add_epi16(usad8, u1);
    486                 vsad8 = _mm_add_epi16(vsad8, v1);
    487 
    488                 mask = _mm_cmpgt_epi16(minsad8, usad8);
    489                 minsad8 = _mm_min_epi16(minsad8, usad8);
    490                 mind8 = _mm_max_epi16(mind8, _mm_and_si128(mask, d8));
    491 
    492                 _mm_store_si128((__m128i*)(sad + d), usad8);
    493                 _mm_store_si128((__m128i*)(sad + d + 8), vsad8);
    494 
    495                 mask = _mm_cmpgt_epi16(minsad8, vsad8);
    496                 minsad8 = _mm_min_epi16(minsad8, vsad8);
    497 
    498                 d8 = _mm_add_epi16(d8, dd_8);
    499                 mind8 = _mm_max_epi16(mind8, _mm_and_si128(mask, d8));
    500                 d8 = _mm_add_epi16(d8, dd_8);
    501             }
    502 
    503             tsum += htext[y + wsz2] - htext[y - wsz2 - 1];
    504             if( tsum < textureThreshold )
    505             {
    506                 dptr[y*dstep] = FILTERED;
    507                 continue;
    508             }
    509 
    510             ushort CV_DECL_ALIGNED(16) minsad_buf[8], mind_buf[8];
    511             _mm_store_si128((__m128i*)minsad_buf, minsad8);
    512             _mm_store_si128((__m128i*)mind_buf, mind8);
    513             for( d = 0; d < 8; d++ )
    514                 if(minsad > (int)minsad_buf[d] || (minsad == (int)minsad_buf[d] && mind > mind_buf[d]))
    515                 {
    516                     minsad = minsad_buf[d];
    517                     mind = mind_buf[d];
    518                 }
    519 
    520             if( uniquenessRatio > 0 )
    521             {
    522                 int thresh = minsad + (minsad * uniquenessRatio/100);
    523                 __m128i thresh8 = _mm_set1_epi16((short)(thresh + 1));
    524                 __m128i d1 = _mm_set1_epi16((short)(mind-1)), d2 = _mm_set1_epi16((short)(mind+1));
    525                 __m128i dd_16 = _mm_add_epi16(dd_8, dd_8);
    526                 d8 = _mm_sub_epi16(d0_8, dd_16);
    527 
    528                 for( d = 0; d < ndisp; d += 16 )
    529                 {
    530                     __m128i usad8 = _mm_load_si128((__m128i*)(sad + d));
    531                     __m128i vsad8 = _mm_load_si128((__m128i*)(sad + d + 8));
    532                     mask = _mm_cmpgt_epi16( thresh8, _mm_min_epi16(usad8,vsad8));
    533                     d8 = _mm_add_epi16(d8, dd_16);
    534                     if( !_mm_movemask_epi8(mask) )
    535                         continue;
    536                     mask = _mm_cmpgt_epi16( thresh8, usad8);
    537                     mask = _mm_and_si128(mask, _mm_or_si128(_mm_cmpgt_epi16(d1,d8), _mm_cmpgt_epi16(d8,d2)));
    538                     if( _mm_movemask_epi8(mask) )
    539                         break;
    540                     __m128i t8 = _mm_add_epi16(d8, dd_8);
    541                     mask = _mm_cmpgt_epi16( thresh8, vsad8);
    542                     mask = _mm_and_si128(mask, _mm_or_si128(_mm_cmpgt_epi16(d1,t8), _mm_cmpgt_epi16(t8,d2)));
    543                     if( _mm_movemask_epi8(mask) )
    544                         break;
    545                 }
    546                 if( d < ndisp )
    547                 {
    548                     dptr[y*dstep] = FILTERED;
    549                     continue;
    550                 }
    551             }
    552 
    553             if( 0 < mind && mind < ndisp - 1 )
    554             {
    555                 int p = sad[mind+1], n = sad[mind-1];
    556                 d = p + n - 2*sad[mind] + std::abs(p - n);
    557                 dptr[y*dstep] = (short)(((ndisp - mind - 1 + mindisp)*256 + (d != 0 ? (p-n)*256/d : 0) + 15) >> 4);
    558             }
    559             else
    560                 dptr[y*dstep] = (short)((ndisp - mind - 1 + mindisp)*16);
    561             costptr[y*coststep] = sad[mind];
    562         }
    563     }
    564 }
    565 #endif
    566 
    567 static void
    568 findStereoCorrespondenceBM( const Mat& left, const Mat& right,
    569                            Mat& disp, Mat& cost, const StereoBMParams& state,
    570                            uchar* buf, int _dy0, int _dy1 )
    571 {
    572 
    573     const int ALIGN = 16;
    574     int x, y, d;
    575     int wsz = state.SADWindowSize, wsz2 = wsz/2;
    576     int dy0 = MIN(_dy0, wsz2+1), dy1 = MIN(_dy1, wsz2+1);
    577     int ndisp = state.numDisparities;
    578     int mindisp = state.minDisparity;
    579     int lofs = MAX(ndisp - 1 + mindisp, 0);
    580     int rofs = -MIN(ndisp - 1 + mindisp, 0);
    581     int width = left.cols, height = left.rows;
    582     int width1 = width - rofs - ndisp + 1;
    583     int ftzero = state.preFilterCap;
    584     int textureThreshold = state.textureThreshold;
    585     int uniquenessRatio = state.uniquenessRatio;
    586     short FILTERED = (short)((mindisp - 1) << DISPARITY_SHIFT);
    587 
    588 #if CV_NEON
    589     CV_Assert (ndisp % 8 == 0);
    590     int32_t d0_4_temp [4];
    591     for (int i = 0; i < 4; i ++)
    592         d0_4_temp[i] = i;
    593     int32x4_t d0_4 = vld1q_s32 (d0_4_temp);
    594     int32x4_t dd_4 = vdupq_n_s32 (4);
    595 #endif
    596 
    597     int *sad, *hsad0, *hsad, *hsad_sub, *htext;
    598     uchar *cbuf0, *cbuf;
    599     const uchar* lptr0 = left.ptr() + lofs;
    600     const uchar* rptr0 = right.ptr() + rofs;
    601     const uchar *lptr, *lptr_sub, *rptr;
    602     short* dptr = disp.ptr<short>();
    603     int sstep = (int)left.step;
    604     int dstep = (int)(disp.step/sizeof(dptr[0]));
    605     int cstep = (height+dy0+dy1)*ndisp;
    606     int costbuf = 0;
    607     int coststep = cost.data ? (int)(cost.step/sizeof(costbuf)) : 0;
    608     const int TABSZ = 256;
    609     uchar tab[TABSZ];
    610 
    611     sad = (int*)alignPtr(buf + sizeof(sad[0]), ALIGN);
    612     hsad0 = (int*)alignPtr(sad + ndisp + 1 + dy0*ndisp, ALIGN);
    613     htext = (int*)alignPtr((int*)(hsad0 + (height+dy1)*ndisp) + wsz2 + 2, ALIGN);
    614     cbuf0 = (uchar*)alignPtr((uchar*)(htext + height + wsz2 + 2) + dy0*ndisp, ALIGN);
    615 
    616     for( x = 0; x < TABSZ; x++ )
    617         tab[x] = (uchar)std::abs(x - ftzero);
    618 
    619     // initialize buffers
    620     memset( hsad0 - dy0*ndisp, 0, (height + dy0 + dy1)*ndisp*sizeof(hsad0[0]) );
    621     memset( htext - wsz2 - 1, 0, (height + wsz + 1)*sizeof(htext[0]) );
    622 
    623     for( x = -wsz2-1; x < wsz2; x++ )
    624     {
    625         hsad = hsad0 - dy0*ndisp; cbuf = cbuf0 + (x + wsz2 + 1)*cstep - dy0*ndisp;
    626         lptr = lptr0 + std::min(std::max(x, -lofs), width-lofs-1) - dy0*sstep;
    627         rptr = rptr0 + std::min(std::max(x, -rofs), width-rofs-1) - dy0*sstep;
    628         for( y = -dy0; y < height + dy1; y++, hsad += ndisp, cbuf += ndisp, lptr += sstep, rptr += sstep )
    629         {
    630             int lval = lptr[0];
    631         #if CV_NEON
    632             int16x8_t lv = vdupq_n_s16 ((int16_t)lval);
    633 
    634             for( d = 0; d < ndisp; d += 8 )
    635             {
    636                 int16x8_t rv = vreinterpretq_s16_u16 (vmovl_u8 (vld1_u8 (rptr + d)));
    637                 int32x4_t hsad_l = vld1q_s32 (hsad + d);
    638                 int32x4_t hsad_h = vld1q_s32 (hsad + d + 4);
    639                 int16x8_t diff = vabdq_s16 (lv, rv);
    640                 vst1_u8 (cbuf + d, vmovn_u16(vreinterpretq_u16_s16(diff)));
    641                 hsad_l = vaddq_s32 (hsad_l, vmovl_s16(vget_low_s16 (diff)));
    642                 hsad_h = vaddq_s32 (hsad_h, vmovl_s16(vget_high_s16 (diff)));
    643                 vst1q_s32 ((hsad + d), hsad_l);
    644                 vst1q_s32 ((hsad + d + 4), hsad_h);
    645             }
    646         #else
    647             for( d = 0; d < ndisp; d++ )
    648             {
    649                 int diff = std::abs(lval - rptr[d]);
    650                 cbuf[d] = (uchar)diff;
    651                 hsad[d] = (int)(hsad[d] + diff);
    652             }
    653         #endif
    654             htext[y] += tab[lval];
    655         }
    656     }
    657 
    658     // initialize the left and right borders of the disparity map
    659     for( y = 0; y < height; y++ )
    660     {
    661         for( x = 0; x < lofs; x++ )
    662             dptr[y*dstep + x] = FILTERED;
    663         for( x = lofs + width1; x < width; x++ )
    664             dptr[y*dstep + x] = FILTERED;
    665     }
    666     dptr += lofs;
    667 
    668     for( x = 0; x < width1; x++, dptr++ )
    669     {
    670         int* costptr = cost.data ? cost.ptr<int>() + lofs + x : &costbuf;
    671         int x0 = x - wsz2 - 1, x1 = x + wsz2;
    672         const uchar* cbuf_sub = cbuf0 + ((x0 + wsz2 + 1) % (wsz + 1))*cstep - dy0*ndisp;
    673         cbuf = cbuf0 + ((x1 + wsz2 + 1) % (wsz + 1))*cstep - dy0*ndisp;
    674         hsad = hsad0 - dy0*ndisp;
    675         lptr_sub = lptr0 + MIN(MAX(x0, -lofs), width-1-lofs) - dy0*sstep;
    676         lptr = lptr0 + MIN(MAX(x1, -lofs), width-1-lofs) - dy0*sstep;
    677         rptr = rptr0 + MIN(MAX(x1, -rofs), width-1-rofs) - dy0*sstep;
    678 
    679         for( y = -dy0; y < height + dy1; y++, cbuf += ndisp, cbuf_sub += ndisp,
    680             hsad += ndisp, lptr += sstep, lptr_sub += sstep, rptr += sstep )
    681         {
    682             int lval = lptr[0];
    683         #if CV_NEON
    684             int16x8_t lv = vdupq_n_s16 ((int16_t)lval);
    685             for( d = 0; d < ndisp; d += 8 )
    686             {
    687                 int16x8_t rv = vreinterpretq_s16_u16 (vmovl_u8 (vld1_u8 (rptr + d)));
    688                 int32x4_t hsad_l = vld1q_s32 (hsad + d);
    689                 int32x4_t hsad_h = vld1q_s32 (hsad + d + 4);
    690                 int16x8_t cbs = vreinterpretq_s16_u16 (vmovl_u8 (vld1_u8 (cbuf_sub + d)));
    691                 int16x8_t diff = vabdq_s16 (lv, rv);
    692                 int32x4_t diff_h = vsubl_s16 (vget_high_s16 (diff), vget_high_s16 (cbs));
    693                 int32x4_t diff_l = vsubl_s16 (vget_low_s16 (diff), vget_low_s16 (cbs));
    694                 vst1_u8 (cbuf + d, vmovn_u16(vreinterpretq_u16_s16(diff)));
    695                 hsad_h = vaddq_s32 (hsad_h, diff_h);
    696                 hsad_l = vaddq_s32 (hsad_l, diff_l);
    697                 vst1q_s32 ((hsad + d), hsad_l);
    698                 vst1q_s32 ((hsad + d + 4), hsad_h);
    699             }
    700         #else
    701             for( d = 0; d < ndisp; d++ )
    702             {
    703                 int diff = std::abs(lval - rptr[d]);
    704                 cbuf[d] = (uchar)diff;
    705                 hsad[d] = hsad[d] + diff - cbuf_sub[d];
    706             }
    707         #endif
    708             htext[y] += tab[lval] - tab[lptr_sub[0]];
    709         }
    710 
    711         // fill borders
    712         for( y = dy1; y <= wsz2; y++ )
    713             htext[height+y] = htext[height+dy1-1];
    714         for( y = -wsz2-1; y < -dy0; y++ )
    715             htext[y] = htext[-dy0];
    716 
    717         // initialize sums
    718         for( d = 0; d < ndisp; d++ )
    719             sad[d] = (int)(hsad0[d-ndisp*dy0]*(wsz2 + 2 - dy0));
    720 
    721         hsad = hsad0 + (1 - dy0)*ndisp;
    722         for( y = 1 - dy0; y < wsz2; y++, hsad += ndisp )
    723         {
    724         #if CV_NEON
    725             for( d = 0; d <= ndisp-8; d += 8 )
    726             {
    727                 int32x4_t s0 = vld1q_s32 (sad + d);
    728                 int32x4_t s1 = vld1q_s32 (sad + d + 4);
    729                 int32x4_t t0 = vld1q_s32 (hsad + d);
    730                 int32x4_t t1 = vld1q_s32 (hsad + d + 4);
    731                 s0 = vaddq_s32 (s0, t0);
    732                 s1 = vaddq_s32 (s1, t1);
    733                 vst1q_s32 (sad + d, s0);
    734                 vst1q_s32 (sad + d + 4, s1);
    735             }
    736         #else
    737             for( d = 0; d < ndisp; d++ )
    738                 sad[d] = (int)(sad[d] + hsad[d]);
    739         #endif
    740         }
    741         int tsum = 0;
    742         for( y = -wsz2-1; y < wsz2; y++ )
    743             tsum += htext[y];
    744 
    745         // finally, start the real processing
    746         for( y = 0; y < height; y++ )
    747         {
    748             int minsad = INT_MAX, mind = -1;
    749             hsad = hsad0 + MIN(y + wsz2, height+dy1-1)*ndisp;
    750             hsad_sub = hsad0 + MAX(y - wsz2 - 1, -dy0)*ndisp;
    751         #if CV_NEON
    752             int32x4_t minsad4 = vdupq_n_s32 (INT_MAX);
    753             int32x4_t mind4 = vdupq_n_s32(0), d4 = d0_4;
    754 
    755             for( d = 0; d <= ndisp-8; d += 8 )
    756             {
    757                 int32x4_t u0 = vld1q_s32 (hsad_sub + d);
    758                 int32x4_t u1 = vld1q_s32 (hsad + d);
    759 
    760                 int32x4_t v0 = vld1q_s32 (hsad_sub + d + 4);
    761                 int32x4_t v1 = vld1q_s32 (hsad + d + 4);
    762 
    763                 int32x4_t usad4 = vld1q_s32(sad + d);
    764                 int32x4_t vsad4 = vld1q_s32(sad + d + 4);
    765 
    766                 u1 = vsubq_s32 (u1, u0);
    767                 v1 = vsubq_s32 (v1, v0);
    768                 usad4 = vaddq_s32 (usad4, u1);
    769                 vsad4 = vaddq_s32 (vsad4, v1);
    770 
    771                 uint32x4_t mask = vcgtq_s32 (minsad4, usad4);
    772                 minsad4 = vminq_s32 (minsad4, usad4);
    773                 mind4 = vbslq_s32(mask, d4, mind4);
    774 
    775                 vst1q_s32 (sad + d, usad4);
    776                 vst1q_s32 (sad + d + 4, vsad4);
    777                 d4 = vaddq_s32 (d4, dd_4);
    778 
    779                 mask = vcgtq_s32 (minsad4, vsad4);
    780                 minsad4 = vminq_s32 (minsad4, vsad4);
    781                 mind4 = vbslq_s32(mask, d4, mind4);
    782 
    783                 d4 = vaddq_s32 (d4, dd_4);
    784 
    785             }
    786             int32x2_t mind4_h = vget_high_s32 (mind4);
    787             int32x2_t mind4_l = vget_low_s32 (mind4);
    788             int32x2_t minsad4_h = vget_high_s32 (minsad4);
    789             int32x2_t minsad4_l = vget_low_s32 (minsad4);
    790 
    791             uint32x2_t mask = vorr_u32 (vclt_s32 (minsad4_h, minsad4_l), vand_u32 (vceq_s32 (minsad4_h, minsad4_l), vclt_s32 (mind4_h, mind4_l)));
    792             mind4_h = vbsl_s32 (mask, mind4_h, mind4_l);
    793             minsad4_h = vbsl_s32 (mask, minsad4_h, minsad4_l);
    794 
    795             mind4_l = vext_s32 (mind4_h,mind4_h,1);
    796             minsad4_l = vext_s32 (minsad4_h,minsad4_h,1);
    797 
    798             mask = vorr_u32 (vclt_s32 (minsad4_h, minsad4_l), vand_u32 (vceq_s32 (minsad4_h, minsad4_l), vclt_s32 (mind4_h, mind4_l)));
    799             mind4_h = vbsl_s32 (mask, mind4_h, mind4_l);
    800             minsad4_h = vbsl_s32 (mask, minsad4_h, minsad4_l);
    801 
    802             mind = (int) vget_lane_s32 (mind4_h, 0);
    803             minsad = sad[mind];
    804 
    805         #else
    806             for( d = 0; d < ndisp; d++ )
    807             {
    808                 int currsad = sad[d] + hsad[d] - hsad_sub[d];
    809                 sad[d] = currsad;
    810                 if( currsad < minsad )
    811                 {
    812                     minsad = currsad;
    813                     mind = d;
    814                 }
    815             }
    816         #endif
    817 
    818             tsum += htext[y + wsz2] - htext[y - wsz2 - 1];
    819             if( tsum < textureThreshold )
    820             {
    821                 dptr[y*dstep] = FILTERED;
    822                 continue;
    823             }
    824 
    825             if( uniquenessRatio > 0 )
    826             {
    827                 int thresh = minsad + (minsad * uniquenessRatio/100);
    828                 for( d = 0; d < ndisp; d++ )
    829                 {
    830                     if( (d < mind-1 || d > mind+1) && sad[d] <= thresh)
    831                         break;
    832                 }
    833                 if( d < ndisp )
    834                 {
    835                     dptr[y*dstep] = FILTERED;
    836                     continue;
    837                 }
    838             }
    839 
    840             {
    841                 sad[-1] = sad[1];
    842                 sad[ndisp] = sad[ndisp-2];
    843                 int p = sad[mind+1], n = sad[mind-1];
    844                 d = p + n - 2*sad[mind] + std::abs(p - n);
    845                 dptr[y*dstep] = (short)(((ndisp - mind - 1 + mindisp)*256 + (d != 0 ? (p-n)*256/d : 0) + 15) >> 4);
    846                 costptr[y*coststep] = sad[mind];
    847             }
    848         }
    849     }
    850 }
    851 
    852 static bool ocl_prefiltering(InputArray left0, InputArray right0, OutputArray left, OutputArray right, StereoBMParams* state)
    853 {
    854     if( state->preFilterType == StereoBM::PREFILTER_NORMALIZED_RESPONSE )
    855     {
    856         if(!ocl_prefilter_norm( left0, left, state->preFilterSize, state->preFilterCap))
    857             return false;
    858         if(!ocl_prefilter_norm( right0, right, state->preFilterSize, state->preFilterCap))
    859             return false;
    860     }
    861     else
    862     {
    863         if(!ocl_prefilter_xsobel( left0, left, state->preFilterCap ))
    864             return false;
    865         if(!ocl_prefilter_xsobel( right0, right, state->preFilterCap))
    866             return false;
    867     }
    868     return true;
    869 }
    870 
    871 struct PrefilterInvoker : public ParallelLoopBody
    872 {
    873     PrefilterInvoker(const Mat& left0, const Mat& right0, Mat& left, Mat& right,
    874                      uchar* buf0, uchar* buf1, StereoBMParams* _state)
    875     {
    876         imgs0[0] = &left0; imgs0[1] = &right0;
    877         imgs[0] = &left; imgs[1] = &right;
    878         buf[0] = buf0; buf[1] = buf1;
    879         state = _state;
    880     }
    881 
    882     void operator()( const Range& range ) const
    883     {
    884         for( int i = range.start; i < range.end; i++ )
    885         {
    886             if( state->preFilterType == StereoBM::PREFILTER_NORMALIZED_RESPONSE )
    887                 prefilterNorm( *imgs0[i], *imgs[i], state->preFilterSize, state->preFilterCap, buf[i] );
    888             else
    889                 prefilterXSobel( *imgs0[i], *imgs[i], state->preFilterCap );
    890         }
    891     }
    892 
    893     const Mat* imgs0[2];
    894     Mat* imgs[2];
    895     uchar* buf[2];
    896     StereoBMParams* state;
    897 };
    898 
    899 static bool ocl_stereobm( InputArray _left, InputArray _right,
    900                        OutputArray _disp, StereoBMParams* state)
    901 {
    902     int ndisp = state->numDisparities;
    903     int mindisp = state->minDisparity;
    904     int wsz = state->SADWindowSize;
    905     int wsz2 = wsz/2;
    906 
    907     ocl::Device devDef = ocl::Device::getDefault();
    908     int sizeX = devDef.isIntel() ? 32 : std::max(11, 27 - devDef.maxComputeUnits()),
    909         sizeY = sizeX - 1,
    910         N = ndisp * 2;
    911 
    912     cv::String opt = cv::format("-D DEFINE_KERNEL_STEREOBM -D MIN_DISP=%d -D NUM_DISP=%d"
    913                                 " -D BLOCK_SIZE_X=%d -D BLOCK_SIZE_Y=%d -D WSZ=%d",
    914                                 mindisp, ndisp,
    915                                 sizeX, sizeY, wsz);
    916     ocl::Kernel k("stereoBM", ocl::calib3d::stereobm_oclsrc, opt);
    917     if(k.empty())
    918         return false;
    919 
    920     UMat left = _left.getUMat(), right = _right.getUMat();
    921     int cols = left.cols, rows = left.rows;
    922 
    923     _disp.create(_left.size(), CV_16S);
    924     _disp.setTo((mindisp - 1) << 4);
    925     Rect roi = Rect(Point(wsz2 + mindisp + ndisp - 1, wsz2), Point(cols-wsz2-mindisp, rows-wsz2) );
    926     UMat disp = (_disp.getUMat())(roi);
    927 
    928     int globalX = (disp.cols + sizeX - 1) / sizeX,
    929         globalY = (disp.rows + sizeY - 1) / sizeY;
    930     size_t globalThreads[3] = {N, globalX, globalY};
    931     size_t localThreads[3]  = {N, 1, 1};
    932 
    933     int idx = 0;
    934     idx = k.set(idx, ocl::KernelArg::PtrReadOnly(left));
    935     idx = k.set(idx, ocl::KernelArg::PtrReadOnly(right));
    936     idx = k.set(idx, ocl::KernelArg::WriteOnlyNoSize(disp));
    937     idx = k.set(idx, rows);
    938     idx = k.set(idx, cols);
    939     idx = k.set(idx, state->textureThreshold);
    940     idx = k.set(idx, state->uniquenessRatio);
    941     return k.run(3, globalThreads, localThreads, false);
    942 }
    943 
    944 struct FindStereoCorrespInvoker : public ParallelLoopBody
    945 {
    946     FindStereoCorrespInvoker( const Mat& _left, const Mat& _right,
    947                              Mat& _disp, StereoBMParams* _state,
    948                              int _nstripes, size_t _stripeBufSize,
    949                              bool _useShorts, Rect _validDisparityRect,
    950                              Mat& _slidingSumBuf, Mat& _cost )
    951     {
    952         left = &_left; right = &_right;
    953         disp = &_disp; state = _state;
    954         nstripes = _nstripes; stripeBufSize = _stripeBufSize;
    955         useShorts = _useShorts;
    956         validDisparityRect = _validDisparityRect;
    957         slidingSumBuf = &_slidingSumBuf;
    958         cost = &_cost;
    959     }
    960 
    961     void operator()( const Range& range ) const
    962     {
    963         int cols = left->cols, rows = left->rows;
    964         int _row0 = std::min(cvRound(range.start * rows / nstripes), rows);
    965         int _row1 = std::min(cvRound(range.end * rows / nstripes), rows);
    966         uchar *ptr = slidingSumBuf->ptr() + range.start * stripeBufSize;
    967         int FILTERED = (state->minDisparity - 1)*16;
    968 
    969         Rect roi = validDisparityRect & Rect(0, _row0, cols, _row1 - _row0);
    970         if( roi.height == 0 )
    971             return;
    972         int row0 = roi.y;
    973         int row1 = roi.y + roi.height;
    974 
    975         Mat part;
    976         if( row0 > _row0 )
    977         {
    978             part = disp->rowRange(_row0, row0);
    979             part = Scalar::all(FILTERED);
    980         }
    981         if( _row1 > row1 )
    982         {
    983             part = disp->rowRange(row1, _row1);
    984             part = Scalar::all(FILTERED);
    985         }
    986 
    987         Mat left_i = left->rowRange(row0, row1);
    988         Mat right_i = right->rowRange(row0, row1);
    989         Mat disp_i = disp->rowRange(row0, row1);
    990         Mat cost_i = state->disp12MaxDiff >= 0 ? cost->rowRange(row0, row1) : Mat();
    991 
    992 #if CV_SSE2
    993         if( useShorts )
    994             findStereoCorrespondenceBM_SSE2( left_i, right_i, disp_i, cost_i, *state, ptr, row0, rows - row1 );
    995         else
    996 #endif
    997             findStereoCorrespondenceBM( left_i, right_i, disp_i, cost_i, *state, ptr, row0, rows - row1 );
    998 
    999         if( state->disp12MaxDiff >= 0 )
   1000             validateDisparity( disp_i, cost_i, state->minDisparity, state->numDisparities, state->disp12MaxDiff );
   1001 
   1002         if( roi.x > 0 )
   1003         {
   1004             part = disp_i.colRange(0, roi.x);
   1005             part = Scalar::all(FILTERED);
   1006         }
   1007         if( roi.x + roi.width < cols )
   1008         {
   1009             part = disp_i.colRange(roi.x + roi.width, cols);
   1010             part = Scalar::all(FILTERED);
   1011         }
   1012     }
   1013 
   1014 protected:
   1015     const Mat *left, *right;
   1016     Mat* disp, *slidingSumBuf, *cost;
   1017     StereoBMParams *state;
   1018 
   1019     int nstripes;
   1020     size_t stripeBufSize;
   1021     bool useShorts;
   1022     Rect validDisparityRect;
   1023 };
   1024 
   1025 class StereoBMImpl : public StereoBM
   1026 {
   1027 public:
   1028     StereoBMImpl()
   1029     {
   1030         params = StereoBMParams();
   1031     }
   1032 
   1033     StereoBMImpl( int _numDisparities, int _SADWindowSize )
   1034     {
   1035         params = StereoBMParams(_numDisparities, _SADWindowSize);
   1036     }
   1037 
   1038     void compute( InputArray leftarr, InputArray rightarr, OutputArray disparr )
   1039     {
   1040         int dtype = disparr.fixedType() ? disparr.type() : params.dispType;
   1041         Size leftsize = leftarr.size();
   1042 
   1043         if (leftarr.size() != rightarr.size())
   1044             CV_Error( Error::StsUnmatchedSizes, "All the images must have the same size" );
   1045 
   1046         if (leftarr.type() != CV_8UC1 || rightarr.type() != CV_8UC1)
   1047             CV_Error( Error::StsUnsupportedFormat, "Both input images must have CV_8UC1" );
   1048 
   1049         if (dtype != CV_16SC1 && dtype != CV_32FC1)
   1050             CV_Error( Error::StsUnsupportedFormat, "Disparity image must have CV_16SC1 or CV_32FC1 format" );
   1051 
   1052         if( params.preFilterType != PREFILTER_NORMALIZED_RESPONSE &&
   1053             params.preFilterType != PREFILTER_XSOBEL )
   1054             CV_Error( Error::StsOutOfRange, "preFilterType must be = CV_STEREO_BM_NORMALIZED_RESPONSE" );
   1055 
   1056         if( params.preFilterSize < 5 || params.preFilterSize > 255 || params.preFilterSize % 2 == 0 )
   1057             CV_Error( Error::StsOutOfRange, "preFilterSize must be odd and be within 5..255" );
   1058 
   1059         if( params.preFilterCap < 1 || params.preFilterCap > 63 )
   1060             CV_Error( Error::StsOutOfRange, "preFilterCap must be within 1..63" );
   1061 
   1062         if( params.SADWindowSize < 5 || params.SADWindowSize > 255 || params.SADWindowSize % 2 == 0 ||
   1063             params.SADWindowSize >= std::min(leftsize.width, leftsize.height) )
   1064             CV_Error( Error::StsOutOfRange, "SADWindowSize must be odd, be within 5..255 and be not larger than image width or height" );
   1065 
   1066         if( params.numDisparities <= 0 || params.numDisparities % 16 != 0 )
   1067             CV_Error( Error::StsOutOfRange, "numDisparities must be positive and divisble by 16" );
   1068 
   1069         if( params.textureThreshold < 0 )
   1070             CV_Error( Error::StsOutOfRange, "texture threshold must be non-negative" );
   1071 
   1072         if( params.uniquenessRatio < 0 )
   1073             CV_Error( Error::StsOutOfRange, "uniqueness ratio must be non-negative" );
   1074 
   1075         int FILTERED = (params.minDisparity - 1) << DISPARITY_SHIFT;
   1076 
   1077         if(ocl::useOpenCL() && disparr.isUMat() && params.textureThreshold == 0)
   1078         {
   1079             UMat left, right;
   1080             if(ocl_prefiltering(leftarr, rightarr, left, right, &params))
   1081             {
   1082                 if(ocl_stereobm(left, right, disparr, &params))
   1083                 {
   1084                     if( params.speckleRange >= 0 && params.speckleWindowSize > 0 )
   1085                         filterSpeckles(disparr.getMat(), FILTERED, params.speckleWindowSize, params.speckleRange, slidingSumBuf);
   1086                     if (dtype == CV_32F)
   1087                         disparr.getUMat().convertTo(disparr, CV_32FC1, 1./(1 << DISPARITY_SHIFT), 0);
   1088                     CV_IMPL_ADD(CV_IMPL_OCL);
   1089                     return;
   1090                 }
   1091             }
   1092         }
   1093 
   1094         Mat left0 = leftarr.getMat(), right0 = rightarr.getMat();
   1095         disparr.create(left0.size(), dtype);
   1096         Mat disp0 = disparr.getMat();
   1097 
   1098         preFilteredImg0.create( left0.size(), CV_8U );
   1099         preFilteredImg1.create( left0.size(), CV_8U );
   1100         cost.create( left0.size(), CV_16S );
   1101 
   1102         Mat left = preFilteredImg0, right = preFilteredImg1;
   1103 
   1104         int mindisp = params.minDisparity;
   1105         int ndisp = params.numDisparities;
   1106 
   1107         int width = left0.cols;
   1108         int height = left0.rows;
   1109         int lofs = std::max(ndisp - 1 + mindisp, 0);
   1110         int rofs = -std::min(ndisp - 1 + mindisp, 0);
   1111         int width1 = width - rofs - ndisp + 1;
   1112 
   1113         if( lofs >= width || rofs >= width || width1 < 1 )
   1114         {
   1115             disp0 = Scalar::all( FILTERED * ( disp0.type() < CV_32F ? 1 : 1./(1 << DISPARITY_SHIFT) ) );
   1116             return;
   1117         }
   1118 
   1119         Mat disp = disp0;
   1120         if( dtype == CV_32F )
   1121         {
   1122             dispbuf.create(disp0.size(), CV_16S);
   1123             disp = dispbuf;
   1124         }
   1125 
   1126         int wsz = params.SADWindowSize;
   1127         int bufSize0 = (int)((ndisp + 2)*sizeof(int));
   1128         bufSize0 += (int)((height+wsz+2)*ndisp*sizeof(int));
   1129         bufSize0 += (int)((height + wsz + 2)*sizeof(int));
   1130         bufSize0 += (int)((height+wsz+2)*ndisp*(wsz+2)*sizeof(uchar) + 256);
   1131 
   1132         int bufSize1 = (int)((width + params.preFilterSize + 2) * sizeof(int) + 256);
   1133         int bufSize2 = 0;
   1134         if( params.speckleRange >= 0 && params.speckleWindowSize > 0 )
   1135             bufSize2 = width*height*(sizeof(Point_<short>) + sizeof(int) + sizeof(uchar));
   1136 
   1137 #if CV_SSE2
   1138         bool useShorts = params.preFilterCap <= 31 && params.SADWindowSize <= 21 && checkHardwareSupport(CV_CPU_SSE2);
   1139 #else
   1140         const bool useShorts = false;
   1141 #endif
   1142 
   1143         const double SAD_overhead_coeff = 10.0;
   1144         double N0 = 8000000 / (useShorts ? 1 : 4);  // approx tbb's min number instructions reasonable for one thread
   1145         double maxStripeSize = std::min(std::max(N0 / (width * ndisp), (wsz-1) * SAD_overhead_coeff), (double)height);
   1146         int nstripes = cvCeil(height / maxStripeSize);
   1147         int bufSize = std::max(bufSize0 * nstripes, std::max(bufSize1 * 2, bufSize2));
   1148 
   1149         if( slidingSumBuf.cols < bufSize )
   1150             slidingSumBuf.create( 1, bufSize, CV_8U );
   1151 
   1152         uchar *_buf = slidingSumBuf.ptr();
   1153 
   1154         parallel_for_(Range(0, 2), PrefilterInvoker(left0, right0, left, right, _buf, _buf + bufSize1, &params), 1);
   1155 
   1156         Rect validDisparityRect(0, 0, width, height), R1 = params.roi1, R2 = params.roi2;
   1157         validDisparityRect = getValidDisparityROI(R1.area() > 0 ? Rect(0, 0, width, height) : validDisparityRect,
   1158                                                   R2.area() > 0 ? Rect(0, 0, width, height) : validDisparityRect,
   1159                                                   params.minDisparity, params.numDisparities,
   1160                                                   params.SADWindowSize);
   1161 
   1162         parallel_for_(Range(0, nstripes),
   1163                       FindStereoCorrespInvoker(left, right, disp, &params, nstripes,
   1164                                                bufSize0, useShorts, validDisparityRect,
   1165                                                slidingSumBuf, cost));
   1166 
   1167         if( params.speckleRange >= 0 && params.speckleWindowSize > 0 )
   1168             filterSpeckles(disp, FILTERED, params.speckleWindowSize, params.speckleRange, slidingSumBuf);
   1169 
   1170         if (disp0.data != disp.data)
   1171             disp.convertTo(disp0, disp0.type(), 1./(1 << DISPARITY_SHIFT), 0);
   1172     }
   1173 
   1174     int getMinDisparity() const { return params.minDisparity; }
   1175     void setMinDisparity(int minDisparity) { params.minDisparity = minDisparity; }
   1176 
   1177     int getNumDisparities() const { return params.numDisparities; }
   1178     void setNumDisparities(int numDisparities) { params.numDisparities = numDisparities; }
   1179 
   1180     int getBlockSize() const { return params.SADWindowSize; }
   1181     void setBlockSize(int blockSize) { params.SADWindowSize = blockSize; }
   1182 
   1183     int getSpeckleWindowSize() const { return params.speckleWindowSize; }
   1184     void setSpeckleWindowSize(int speckleWindowSize) { params.speckleWindowSize = speckleWindowSize; }
   1185 
   1186     int getSpeckleRange() const { return params.speckleRange; }
   1187     void setSpeckleRange(int speckleRange) { params.speckleRange = speckleRange; }
   1188 
   1189     int getDisp12MaxDiff() const { return params.disp12MaxDiff; }
   1190     void setDisp12MaxDiff(int disp12MaxDiff) { params.disp12MaxDiff = disp12MaxDiff; }
   1191 
   1192     int getPreFilterType() const { return params.preFilterType; }
   1193     void setPreFilterType(int preFilterType) { params.preFilterType = preFilterType; }
   1194 
   1195     int getPreFilterSize() const { return params.preFilterSize; }
   1196     void setPreFilterSize(int preFilterSize) { params.preFilterSize = preFilterSize; }
   1197 
   1198     int getPreFilterCap() const { return params.preFilterCap; }
   1199     void setPreFilterCap(int preFilterCap) { params.preFilterCap = preFilterCap; }
   1200 
   1201     int getTextureThreshold() const { return params.textureThreshold; }
   1202     void setTextureThreshold(int textureThreshold) { params.textureThreshold = textureThreshold; }
   1203 
   1204     int getUniquenessRatio() const { return params.uniquenessRatio; }
   1205     void setUniquenessRatio(int uniquenessRatio) { params.uniquenessRatio = uniquenessRatio; }
   1206 
   1207     int getSmallerBlockSize() const { return 0; }
   1208     void setSmallerBlockSize(int) {}
   1209 
   1210     Rect getROI1() const { return params.roi1; }
   1211     void setROI1(Rect roi1) { params.roi1 = roi1; }
   1212 
   1213     Rect getROI2() const { return params.roi2; }
   1214     void setROI2(Rect roi2) { params.roi2 = roi2; }
   1215 
   1216     void write(FileStorage& fs) const
   1217     {
   1218         fs << "name" << name_
   1219         << "minDisparity" << params.minDisparity
   1220         << "numDisparities" << params.numDisparities
   1221         << "blockSize" << params.SADWindowSize
   1222         << "speckleWindowSize" << params.speckleWindowSize
   1223         << "speckleRange" << params.speckleRange
   1224         << "disp12MaxDiff" << params.disp12MaxDiff
   1225         << "preFilterType" << params.preFilterType
   1226         << "preFilterSize" << params.preFilterSize
   1227         << "preFilterCap" << params.preFilterCap
   1228         << "textureThreshold" << params.textureThreshold
   1229         << "uniquenessRatio" << params.uniquenessRatio;
   1230     }
   1231 
   1232     void read(const FileNode& fn)
   1233     {
   1234         FileNode n = fn["name"];
   1235         CV_Assert( n.isString() && String(n) == name_ );
   1236         params.minDisparity = (int)fn["minDisparity"];
   1237         params.numDisparities = (int)fn["numDisparities"];
   1238         params.SADWindowSize = (int)fn["blockSize"];
   1239         params.speckleWindowSize = (int)fn["speckleWindowSize"];
   1240         params.speckleRange = (int)fn["speckleRange"];
   1241         params.disp12MaxDiff = (int)fn["disp12MaxDiff"];
   1242         params.preFilterType = (int)fn["preFilterType"];
   1243         params.preFilterSize = (int)fn["preFilterSize"];
   1244         params.preFilterCap = (int)fn["preFilterCap"];
   1245         params.textureThreshold = (int)fn["textureThreshold"];
   1246         params.uniquenessRatio = (int)fn["uniquenessRatio"];
   1247         params.roi1 = params.roi2 = Rect();
   1248     }
   1249 
   1250     StereoBMParams params;
   1251     Mat preFilteredImg0, preFilteredImg1, cost, dispbuf;
   1252     Mat slidingSumBuf;
   1253 
   1254     static const char* name_;
   1255 };
   1256 
   1257 const char* StereoBMImpl::name_ = "StereoMatcher.BM";
   1258 
   1259 Ptr<StereoBM> StereoBM::create(int _numDisparities, int _SADWindowSize)
   1260 {
   1261     return makePtr<StereoBMImpl>(_numDisparities, _SADWindowSize);
   1262 }
   1263 
   1264 }
   1265 
   1266 /* End of file. */
   1267