Home | History | Annotate | Download | only in src
      1 /*M///////////////////////////////////////////////////////////////////////////////////////
      2 //
      3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
      4 //
      5 //  By downloading, copying, installing or using the software you agree to this license.
      6 //  If you do not agree to this license, do not download, install,
      7 //  copy or use the software.
      8 //
      9 //
     10 //                           License Agreement
     11 //                For Open Source Computer Vision Library
     12 //
     13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
     14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
     15 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
     16 // Third party copyrights are property of their respective owners.
     17 //
     18 // Redistribution and use in source and binary forms, with or without modification,
     19 // are permitted provided that the following conditions are met:
     20 //
     21 //   * Redistribution's of source code must retain the above copyright notice,
     22 //     this list of conditions and the following disclaimer.
     23 //
     24 //   * Redistribution's in binary form must reproduce the above copyright notice,
     25 //     this list of conditions and the following disclaimer in the documentation
     26 //     and/or other materials provided with the distribution.
     27 //
     28 //   * The name of the copyright holders may not be used to endorse or promote products
     29 //     derived from this software without specific prior written permission.
     30 //
     31 // This software is provided by the copyright holders and contributors "as is" and
     32 // any express or implied warranties, including, but not limited to, the implied
     33 // warranties of merchantability and fitness for a particular purpose are disclaimed.
     34 // In no event shall the Intel Corporation or contributors be liable for any direct,
     35 // indirect, incidental, special, exemplary, or consequential damages
     36 // (including, but not limited to, procurement of substitute goods or services;
     37 // loss of use, data, or profits; or business interruption) however caused
     38 // and on any theory of liability, whether in contract, strict liability,
     39 // or tort (including negligence or otherwise) arising in any way out of
     40 // the use of this software, even if advised of the possibility of such damage.
     41 //
     42 //M*/
     43 
     44 /*
     45  This is a variation of
     46  "Stereo Processing by Semiglobal Matching and Mutual Information"
     47  by Heiko Hirschmuller.
     48 
     49  We match blocks rather than individual pixels, thus the algorithm is called
     50  SGBM (Semi-global block matching)
     51  */
     52 
     53 #include "precomp.hpp"
     54 #include <limits.h>
     55 
     56 namespace cv
     57 {
     58 
     59 typedef uchar PixType;
     60 typedef short CostType;
     61 typedef short DispType;
     62 
     63 enum { NR = 16, NR2 = NR/2 };
     64 
     65 
     66 struct StereoSGBMParams
     67 {
     68     StereoSGBMParams()
     69     {
     70         minDisparity = numDisparities = 0;
     71         SADWindowSize = 0;
     72         P1 = P2 = 0;
     73         disp12MaxDiff = 0;
     74         preFilterCap = 0;
     75         uniquenessRatio = 0;
     76         speckleWindowSize = 0;
     77         speckleRange = 0;
     78         mode = StereoSGBM::MODE_SGBM;
     79     }
     80 
     81     StereoSGBMParams( int _minDisparity, int _numDisparities, int _SADWindowSize,
     82                       int _P1, int _P2, int _disp12MaxDiff, int _preFilterCap,
     83                       int _uniquenessRatio, int _speckleWindowSize, int _speckleRange,
     84                       int _mode )
     85     {
     86         minDisparity = _minDisparity;
     87         numDisparities = _numDisparities;
     88         SADWindowSize = _SADWindowSize;
     89         P1 = _P1;
     90         P2 = _P2;
     91         disp12MaxDiff = _disp12MaxDiff;
     92         preFilterCap = _preFilterCap;
     93         uniquenessRatio = _uniquenessRatio;
     94         speckleWindowSize = _speckleWindowSize;
     95         speckleRange = _speckleRange;
     96         mode = _mode;
     97     }
     98 
     99     int minDisparity;
    100     int numDisparities;
    101     int SADWindowSize;
    102     int preFilterCap;
    103     int uniquenessRatio;
    104     int P1;
    105     int P2;
    106     int speckleWindowSize;
    107     int speckleRange;
    108     int disp12MaxDiff;
    109     int mode;
    110 };
    111 
    112 /*
    113  For each pixel row1[x], max(-maxD, 0) <= minX <= x < maxX <= width - max(0, -minD),
    114  and for each disparity minD<=d<maxD the function
    115  computes the cost (cost[(x-minX)*(maxD - minD) + (d - minD)]), depending on the difference between
    116  row1[x] and row2[x-d]. The subpixel algorithm from
    117  "Depth Discontinuities by Pixel-to-Pixel Stereo" by Stan Birchfield and C. Tomasi
    118  is used, hence the suffix BT.
    119 
    120  the temporary buffer should contain width2*2 elements
    121  */
    122 static void calcPixelCostBT( const Mat& img1, const Mat& img2, int y,
    123                             int minD, int maxD, CostType* cost,
    124                             PixType* buffer, const PixType* tab,
    125                             int tabOfs, int )
    126 {
    127     int x, c, width = img1.cols, cn = img1.channels();
    128     int minX1 = std::max(-maxD, 0), maxX1 = width + std::min(minD, 0);
    129     int minX2 = std::max(minX1 - maxD, 0), maxX2 = std::min(maxX1 - minD, width);
    130     int D = maxD - minD, width1 = maxX1 - minX1, width2 = maxX2 - minX2;
    131     const PixType *row1 = img1.ptr<PixType>(y), *row2 = img2.ptr<PixType>(y);
    132     PixType *prow1 = buffer + width2*2, *prow2 = prow1 + width*cn*2;
    133 
    134     tab += tabOfs;
    135 
    136     for( c = 0; c < cn*2; c++ )
    137     {
    138         prow1[width*c] = prow1[width*c + width-1] =
    139         prow2[width*c] = prow2[width*c + width-1] = tab[0];
    140     }
    141 
    142     int n1 = y > 0 ? -(int)img1.step : 0, s1 = y < img1.rows-1 ? (int)img1.step : 0;
    143     int n2 = y > 0 ? -(int)img2.step : 0, s2 = y < img2.rows-1 ? (int)img2.step : 0;
    144 
    145     if( cn == 1 )
    146     {
    147         for( x = 1; x < width-1; x++ )
    148         {
    149             prow1[x] = tab[(row1[x+1] - row1[x-1])*2 + row1[x+n1+1] - row1[x+n1-1] + row1[x+s1+1] - row1[x+s1-1]];
    150             prow2[width-1-x] = tab[(row2[x+1] - row2[x-1])*2 + row2[x+n2+1] - row2[x+n2-1] + row2[x+s2+1] - row2[x+s2-1]];
    151 
    152             prow1[x+width] = row1[x];
    153             prow2[width-1-x+width] = row2[x];
    154         }
    155     }
    156     else
    157     {
    158         for( x = 1; x < width-1; x++ )
    159         {
    160             prow1[x] = tab[(row1[x*3+3] - row1[x*3-3])*2 + row1[x*3+n1+3] - row1[x*3+n1-3] + row1[x*3+s1+3] - row1[x*3+s1-3]];
    161             prow1[x+width] = tab[(row1[x*3+4] - row1[x*3-2])*2 + row1[x*3+n1+4] - row1[x*3+n1-2] + row1[x*3+s1+4] - row1[x*3+s1-2]];
    162             prow1[x+width*2] = tab[(row1[x*3+5] - row1[x*3-1])*2 + row1[x*3+n1+5] - row1[x*3+n1-1] + row1[x*3+s1+5] - row1[x*3+s1-1]];
    163 
    164             prow2[width-1-x] = tab[(row2[x*3+3] - row2[x*3-3])*2 + row2[x*3+n2+3] - row2[x*3+n2-3] + row2[x*3+s2+3] - row2[x*3+s2-3]];
    165             prow2[width-1-x+width] = tab[(row2[x*3+4] - row2[x*3-2])*2 + row2[x*3+n2+4] - row2[x*3+n2-2] + row2[x*3+s2+4] - row2[x*3+s2-2]];
    166             prow2[width-1-x+width*2] = tab[(row2[x*3+5] - row2[x*3-1])*2 + row2[x*3+n2+5] - row2[x*3+n2-1] + row2[x*3+s2+5] - row2[x*3+s2-1]];
    167 
    168             prow1[x+width*3] = row1[x*3];
    169             prow1[x+width*4] = row1[x*3+1];
    170             prow1[x+width*5] = row1[x*3+2];
    171 
    172             prow2[width-1-x+width*3] = row2[x*3];
    173             prow2[width-1-x+width*4] = row2[x*3+1];
    174             prow2[width-1-x+width*5] = row2[x*3+2];
    175         }
    176     }
    177 
    178     memset( cost, 0, width1*D*sizeof(cost[0]) );
    179 
    180     buffer -= minX2;
    181     cost -= minX1*D + minD; // simplify the cost indices inside the loop
    182 
    183 #if CV_SSE2
    184     volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE2);
    185 #endif
    186 
    187 #if 1
    188     for( c = 0; c < cn*2; c++, prow1 += width, prow2 += width )
    189     {
    190         int diff_scale = c < cn ? 0 : 2;
    191 
    192         // precompute
    193         //   v0 = min(row2[x-1/2], row2[x], row2[x+1/2]) and
    194         //   v1 = max(row2[x-1/2], row2[x], row2[x+1/2]) and
    195         for( x = minX2; x < maxX2; x++ )
    196         {
    197             int v = prow2[x];
    198             int vl = x > 0 ? (v + prow2[x-1])/2 : v;
    199             int vr = x < width-1 ? (v + prow2[x+1])/2 : v;
    200             int v0 = std::min(vl, vr); v0 = std::min(v0, v);
    201             int v1 = std::max(vl, vr); v1 = std::max(v1, v);
    202             buffer[x] = (PixType)v0;
    203             buffer[x + width2] = (PixType)v1;
    204         }
    205 
    206         for( x = minX1; x < maxX1; x++ )
    207         {
    208             int u = prow1[x];
    209             int ul = x > 0 ? (u + prow1[x-1])/2 : u;
    210             int ur = x < width-1 ? (u + prow1[x+1])/2 : u;
    211             int u0 = std::min(ul, ur); u0 = std::min(u0, u);
    212             int u1 = std::max(ul, ur); u1 = std::max(u1, u);
    213 
    214         #if CV_SSE2
    215             if( useSIMD )
    216             {
    217                 __m128i _u = _mm_set1_epi8((char)u), _u0 = _mm_set1_epi8((char)u0);
    218                 __m128i _u1 = _mm_set1_epi8((char)u1), z = _mm_setzero_si128();
    219                 __m128i ds = _mm_cvtsi32_si128(diff_scale);
    220 
    221                 for( int d = minD; d < maxD; d += 16 )
    222                 {
    223                     __m128i _v = _mm_loadu_si128((const __m128i*)(prow2 + width-x-1 + d));
    224                     __m128i _v0 = _mm_loadu_si128((const __m128i*)(buffer + width-x-1 + d));
    225                     __m128i _v1 = _mm_loadu_si128((const __m128i*)(buffer + width-x-1 + d + width2));
    226                     __m128i c0 = _mm_max_epu8(_mm_subs_epu8(_u, _v1), _mm_subs_epu8(_v0, _u));
    227                     __m128i c1 = _mm_max_epu8(_mm_subs_epu8(_v, _u1), _mm_subs_epu8(_u0, _v));
    228                     __m128i diff = _mm_min_epu8(c0, c1);
    229 
    230                     c0 = _mm_load_si128((__m128i*)(cost + x*D + d));
    231                     c1 = _mm_load_si128((__m128i*)(cost + x*D + d + 8));
    232 
    233                     _mm_store_si128((__m128i*)(cost + x*D + d), _mm_adds_epi16(c0, _mm_srl_epi16(_mm_unpacklo_epi8(diff,z), ds)));
    234                     _mm_store_si128((__m128i*)(cost + x*D + d + 8), _mm_adds_epi16(c1, _mm_srl_epi16(_mm_unpackhi_epi8(diff,z), ds)));
    235                 }
    236             }
    237             else
    238         #endif
    239             {
    240                 for( int d = minD; d < maxD; d++ )
    241                 {
    242                     int v = prow2[width-x-1 + d];
    243                     int v0 = buffer[width-x-1 + d];
    244                     int v1 = buffer[width-x-1 + d + width2];
    245                     int c0 = std::max(0, u - v1); c0 = std::max(c0, v0 - u);
    246                     int c1 = std::max(0, v - u1); c1 = std::max(c1, u0 - v);
    247 
    248                     cost[x*D + d] = (CostType)(cost[x*D+d] + (std::min(c0, c1) >> diff_scale));
    249                 }
    250             }
    251         }
    252     }
    253 #else
    254     for( c = 0; c < cn*2; c++, prow1 += width, prow2 += width )
    255     {
    256         for( x = minX1; x < maxX1; x++ )
    257         {
    258             int u = prow1[x];
    259         #if CV_SSE2
    260             if( useSIMD )
    261             {
    262                 __m128i _u = _mm_set1_epi8(u), z = _mm_setzero_si128();
    263 
    264                 for( int d = minD; d < maxD; d += 16 )
    265                 {
    266                     __m128i _v = _mm_loadu_si128((const __m128i*)(prow2 + width-1-x + d));
    267                     __m128i diff = _mm_adds_epu8(_mm_subs_epu8(_u,_v), _mm_subs_epu8(_v,_u));
    268                     __m128i c0 = _mm_load_si128((__m128i*)(cost + x*D + d));
    269                     __m128i c1 = _mm_load_si128((__m128i*)(cost + x*D + d + 8));
    270 
    271                     _mm_store_si128((__m128i*)(cost + x*D + d), _mm_adds_epi16(c0, _mm_unpacklo_epi8(diff,z)));
    272                     _mm_store_si128((__m128i*)(cost + x*D + d + 8), _mm_adds_epi16(c1, _mm_unpackhi_epi8(diff,z)));
    273                 }
    274             }
    275             else
    276         #endif
    277             {
    278                 for( int d = minD; d < maxD; d++ )
    279                 {
    280                     int v = prow2[width-1-x + d];
    281                     cost[x*D + d] = (CostType)(cost[x*D + d] + (CostType)std::abs(u - v));
    282                 }
    283             }
    284         }
    285     }
    286 #endif
    287 }
    288 
    289 
    290 /*
    291  computes disparity for "roi" in img1 w.r.t. img2 and write it to disp1buf.
    292  that is, disp1buf(x, y)=d means that img1(x+roi.x, y+roi.y) ~ img2(x+roi.x-d, y+roi.y).
    293  minD <= d < maxD.
    294  disp2full is the reverse disparity map, that is:
    295  disp2full(x+roi.x,y+roi.y)=d means that img2(x+roi.x, y+roi.y) ~ img1(x+roi.x+d, y+roi.y)
    296 
    297  note that disp1buf will have the same size as the roi and
    298  disp2full will have the same size as img1 (or img2).
    299  On exit disp2buf is not the final disparity, it is an intermediate result that becomes
    300  final after all the tiles are processed.
    301 
    302  the disparity in disp1buf is written with sub-pixel accuracy
    303  (4 fractional bits, see StereoSGBM::DISP_SCALE),
    304  using quadratic interpolation, while the disparity in disp2buf
    305  is written as is, without interpolation.
    306 
    307  disp2cost also has the same size as img1 (or img2).
    308  It contains the minimum current cost, used to find the best disparity, corresponding to the minimal cost.
    309  */
    310 static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
    311                                  Mat& disp1, const StereoSGBMParams& params,
    312                                  Mat& buffer )
    313 {
    314 #if CV_SSE2
    315     static const uchar LSBTab[] =
    316     {
    317         0, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
    318         5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
    319         6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
    320         5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
    321         7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
    322         5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
    323         6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
    324         5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0
    325     };
    326 
    327     volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE2);
    328 #endif
    329 
    330     const int ALIGN = 16;
    331     const int DISP_SHIFT = StereoMatcher::DISP_SHIFT;
    332     const int DISP_SCALE = (1 << DISP_SHIFT);
    333     const CostType MAX_COST = SHRT_MAX;
    334 
    335     int minD = params.minDisparity, maxD = minD + params.numDisparities;
    336     Size SADWindowSize;
    337     SADWindowSize.width = SADWindowSize.height = params.SADWindowSize > 0 ? params.SADWindowSize : 5;
    338     int ftzero = std::max(params.preFilterCap, 15) | 1;
    339     int uniquenessRatio = params.uniquenessRatio >= 0 ? params.uniquenessRatio : 10;
    340     int disp12MaxDiff = params.disp12MaxDiff > 0 ? params.disp12MaxDiff : 1;
    341     int P1 = params.P1 > 0 ? params.P1 : 2, P2 = std::max(params.P2 > 0 ? params.P2 : 5, P1+1);
    342     int k, width = disp1.cols, height = disp1.rows;
    343     int minX1 = std::max(-maxD, 0), maxX1 = width + std::min(minD, 0);
    344     int D = maxD - minD, width1 = maxX1 - minX1;
    345     int INVALID_DISP = minD - 1, INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE;
    346     int SW2 = SADWindowSize.width/2, SH2 = SADWindowSize.height/2;
    347     bool fullDP = params.mode == StereoSGBM::MODE_HH;
    348     int npasses = fullDP ? 2 : 1;
    349     const int TAB_OFS = 256*4, TAB_SIZE = 256 + TAB_OFS*2;
    350     PixType clipTab[TAB_SIZE];
    351 
    352     for( k = 0; k < TAB_SIZE; k++ )
    353         clipTab[k] = (PixType)(std::min(std::max(k - TAB_OFS, -ftzero), ftzero) + ftzero);
    354 
    355     if( minX1 >= maxX1 )
    356     {
    357         disp1 = Scalar::all(INVALID_DISP_SCALED);
    358         return;
    359     }
    360 
    361     CV_Assert( D % 16 == 0 );
    362 
    363     // NR - the number of directions. the loop on x below that computes Lr assumes that NR == 8.
    364     // if you change NR, please, modify the loop as well.
    365     int D2 = D+16, NRD2 = NR2*D2;
    366 
    367     // the number of L_r(.,.) and min_k L_r(.,.) lines in the buffer:
    368     // for 8-way dynamic programming we need the current row and
    369     // the previous row, i.e. 2 rows in total
    370     const int NLR = 2;
    371     const int LrBorder = NLR - 1;
    372 
    373     // for each possible stereo match (img1(x,y) <=> img2(x-d,y))
    374     // we keep pixel difference cost (C) and the summary cost over NR directions (S).
    375     // we also keep all the partial costs for the previous line L_r(x,d) and also min_k L_r(x, k)
    376     size_t costBufSize = width1*D;
    377     size_t CSBufSize = costBufSize*(fullDP ? height : 1);
    378     size_t minLrSize = (width1 + LrBorder*2)*NR2, LrSize = minLrSize*D2;
    379     int hsumBufNRows = SH2*2 + 2;
    380     size_t totalBufSize = (LrSize + minLrSize)*NLR*sizeof(CostType) + // minLr[] and Lr[]
    381     costBufSize*(hsumBufNRows + 1)*sizeof(CostType) + // hsumBuf, pixdiff
    382     CSBufSize*2*sizeof(CostType) + // C, S
    383     width*16*img1.channels()*sizeof(PixType) + // temp buffer for computing per-pixel cost
    384     width*(sizeof(CostType) + sizeof(DispType)) + 1024; // disp2cost + disp2
    385 
    386     if( buffer.empty() || !buffer.isContinuous() ||
    387         buffer.cols*buffer.rows*buffer.elemSize() < totalBufSize )
    388         buffer.create(1, (int)totalBufSize, CV_8U);
    389 
    390     // summary cost over different (nDirs) directions
    391     CostType* Cbuf = (CostType*)alignPtr(buffer.ptr(), ALIGN);
    392     CostType* Sbuf = Cbuf + CSBufSize;
    393     CostType* hsumBuf = Sbuf + CSBufSize;
    394     CostType* pixDiff = hsumBuf + costBufSize*hsumBufNRows;
    395 
    396     CostType* disp2cost = pixDiff + costBufSize + (LrSize + minLrSize)*NLR;
    397     DispType* disp2ptr = (DispType*)(disp2cost + width);
    398     PixType* tempBuf = (PixType*)(disp2ptr + width);
    399 
    400     // add P2 to every C(x,y). it saves a few operations in the inner loops
    401     for( k = 0; k < width1*D; k++ )
    402         Cbuf[k] = (CostType)P2;
    403 
    404     for( int pass = 1; pass <= npasses; pass++ )
    405     {
    406         int x1, y1, x2, y2, dx, dy;
    407 
    408         if( pass == 1 )
    409         {
    410             y1 = 0; y2 = height; dy = 1;
    411             x1 = 0; x2 = width1; dx = 1;
    412         }
    413         else
    414         {
    415             y1 = height-1; y2 = -1; dy = -1;
    416             x1 = width1-1; x2 = -1; dx = -1;
    417         }
    418 
    419         CostType *Lr[NLR]={0}, *minLr[NLR]={0};
    420 
    421         for( k = 0; k < NLR; k++ )
    422         {
    423             // shift Lr[k] and minLr[k] pointers, because we allocated them with the borders,
    424             // and will occasionally use negative indices with the arrays
    425             // we need to shift Lr[k] pointers by 1, to give the space for d=-1.
    426             // however, then the alignment will be imperfect, i.e. bad for SSE,
    427             // thus we shift the pointers by 8 (8*sizeof(short) == 16 - ideal alignment)
    428             Lr[k] = pixDiff + costBufSize + LrSize*k + NRD2*LrBorder + 8;
    429             memset( Lr[k] - LrBorder*NRD2 - 8, 0, LrSize*sizeof(CostType) );
    430             minLr[k] = pixDiff + costBufSize + LrSize*NLR + minLrSize*k + NR2*LrBorder;
    431             memset( minLr[k] - LrBorder*NR2, 0, minLrSize*sizeof(CostType) );
    432         }
    433 
    434         for( int y = y1; y != y2; y += dy )
    435         {
    436             int x, d;
    437             DispType* disp1ptr = disp1.ptr<DispType>(y);
    438             CostType* C = Cbuf + (!fullDP ? 0 : y*costBufSize);
    439             CostType* S = Sbuf + (!fullDP ? 0 : y*costBufSize);
    440 
    441             if( pass == 1 ) // compute C on the first pass, and reuse it on the second pass, if any.
    442             {
    443                 int dy1 = y == 0 ? 0 : y + SH2, dy2 = y == 0 ? SH2 : dy1;
    444 
    445                 for( k = dy1; k <= dy2; k++ )
    446                 {
    447                     CostType* hsumAdd = hsumBuf + (std::min(k, height-1) % hsumBufNRows)*costBufSize;
    448 
    449                     if( k < height )
    450                     {
    451                         calcPixelCostBT( img1, img2, k, minD, maxD, pixDiff, tempBuf, clipTab, TAB_OFS, ftzero );
    452 
    453                         memset(hsumAdd, 0, D*sizeof(CostType));
    454                         for( x = 0; x <= SW2*D; x += D )
    455                         {
    456                             int scale = x == 0 ? SW2 + 1 : 1;
    457                             for( d = 0; d < D; d++ )
    458                                 hsumAdd[d] = (CostType)(hsumAdd[d] + pixDiff[x + d]*scale);
    459                         }
    460 
    461                         if( y > 0 )
    462                         {
    463                             const CostType* hsumSub = hsumBuf + (std::max(y - SH2 - 1, 0) % hsumBufNRows)*costBufSize;
    464                             const CostType* Cprev = !fullDP || y == 0 ? C : C - costBufSize;
    465 
    466                             for( x = D; x < width1*D; x += D )
    467                             {
    468                                 const CostType* pixAdd = pixDiff + std::min(x + SW2*D, (width1-1)*D);
    469                                 const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*D, 0);
    470 
    471                             #if CV_SSE2
    472                                 if( useSIMD )
    473                                 {
    474                                     for( d = 0; d < D; d += 8 )
    475                                     {
    476                                         __m128i hv = _mm_load_si128((const __m128i*)(hsumAdd + x - D + d));
    477                                         __m128i Cx = _mm_load_si128((__m128i*)(Cprev + x + d));
    478                                         hv = _mm_adds_epi16(_mm_subs_epi16(hv,
    479                                                                            _mm_load_si128((const __m128i*)(pixSub + d))),
    480                                                             _mm_load_si128((const __m128i*)(pixAdd + d)));
    481                                         Cx = _mm_adds_epi16(_mm_subs_epi16(Cx,
    482                                                                            _mm_load_si128((const __m128i*)(hsumSub + x + d))),
    483                                                             hv);
    484                                         _mm_store_si128((__m128i*)(hsumAdd + x + d), hv);
    485                                         _mm_store_si128((__m128i*)(C + x + d), Cx);
    486                                     }
    487                                 }
    488                                 else
    489                             #endif
    490                                 {
    491                                     for( d = 0; d < D; d++ )
    492                                     {
    493                                         int hv = hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]);
    494                                         C[x + d] = (CostType)(Cprev[x + d] + hv - hsumSub[x + d]);
    495                                     }
    496                                 }
    497                             }
    498                         }
    499                         else
    500                         {
    501                             for( x = D; x < width1*D; x += D )
    502                             {
    503                                 const CostType* pixAdd = pixDiff + std::min(x + SW2*D, (width1-1)*D);
    504                                 const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*D, 0);
    505 
    506                                 for( d = 0; d < D; d++ )
    507                                     hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]);
    508                             }
    509                         }
    510                     }
    511 
    512                     if( y == 0 )
    513                     {
    514                         int scale = k == 0 ? SH2 + 1 : 1;
    515                         for( x = 0; x < width1*D; x++ )
    516                             C[x] = (CostType)(C[x] + hsumAdd[x]*scale);
    517                     }
    518                 }
    519 
    520                 // also, clear the S buffer
    521                 for( k = 0; k < width1*D; k++ )
    522                     S[k] = 0;
    523             }
    524 
    525             // clear the left and the right borders
    526             memset( Lr[0] - NRD2*LrBorder - 8, 0, NRD2*LrBorder*sizeof(CostType) );
    527             memset( Lr[0] + width1*NRD2 - 8, 0, NRD2*LrBorder*sizeof(CostType) );
    528             memset( minLr[0] - NR2*LrBorder, 0, NR2*LrBorder*sizeof(CostType) );
    529             memset( minLr[0] + width1*NR2, 0, NR2*LrBorder*sizeof(CostType) );
    530 
    531             /*
    532              [formula 13 in the paper]
    533              compute L_r(p, d) = C(p, d) +
    534              min(L_r(p-r, d),
    535              L_r(p-r, d-1) + P1,
    536              L_r(p-r, d+1) + P1,
    537              min_k L_r(p-r, k) + P2) - min_k L_r(p-r, k)
    538              where p = (x,y), r is one of the directions.
    539              we process all the directions at once:
    540              0: r=(-dx, 0)
    541              1: r=(-1, -dy)
    542              2: r=(0, -dy)
    543              3: r=(1, -dy)
    544              4: r=(-2, -dy)
    545              5: r=(-1, -dy*2)
    546              6: r=(1, -dy*2)
    547              7: r=(2, -dy)
    548              */
    549             for( x = x1; x != x2; x += dx )
    550             {
    551                 int xm = x*NR2, xd = xm*D2;
    552 
    553                 int delta0 = minLr[0][xm - dx*NR2] + P2, delta1 = minLr[1][xm - NR2 + 1] + P2;
    554                 int delta2 = minLr[1][xm + 2] + P2, delta3 = minLr[1][xm + NR2 + 3] + P2;
    555 
    556                 CostType* Lr_p0 = Lr[0] + xd - dx*NRD2;
    557                 CostType* Lr_p1 = Lr[1] + xd - NRD2 + D2;
    558                 CostType* Lr_p2 = Lr[1] + xd + D2*2;
    559                 CostType* Lr_p3 = Lr[1] + xd + NRD2 + D2*3;
    560 
    561                 Lr_p0[-1] = Lr_p0[D] = Lr_p1[-1] = Lr_p1[D] =
    562                 Lr_p2[-1] = Lr_p2[D] = Lr_p3[-1] = Lr_p3[D] = MAX_COST;
    563 
    564                 CostType* Lr_p = Lr[0] + xd;
    565                 const CostType* Cp = C + x*D;
    566                 CostType* Sp = S + x*D;
    567 
    568             #if CV_SSE2
    569                 if( useSIMD )
    570                 {
    571                     __m128i _P1 = _mm_set1_epi16((short)P1);
    572 
    573                     __m128i _delta0 = _mm_set1_epi16((short)delta0);
    574                     __m128i _delta1 = _mm_set1_epi16((short)delta1);
    575                     __m128i _delta2 = _mm_set1_epi16((short)delta2);
    576                     __m128i _delta3 = _mm_set1_epi16((short)delta3);
    577                     __m128i _minL0 = _mm_set1_epi16((short)MAX_COST);
    578 
    579                     for( d = 0; d < D; d += 8 )
    580                     {
    581                         __m128i Cpd = _mm_load_si128((const __m128i*)(Cp + d));
    582                         __m128i L0, L1, L2, L3;
    583 
    584                         L0 = _mm_load_si128((const __m128i*)(Lr_p0 + d));
    585                         L1 = _mm_load_si128((const __m128i*)(Lr_p1 + d));
    586                         L2 = _mm_load_si128((const __m128i*)(Lr_p2 + d));
    587                         L3 = _mm_load_si128((const __m128i*)(Lr_p3 + d));
    588 
    589                         L0 = _mm_min_epi16(L0, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p0 + d - 1)), _P1));
    590                         L0 = _mm_min_epi16(L0, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p0 + d + 1)), _P1));
    591 
    592                         L1 = _mm_min_epi16(L1, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p1 + d - 1)), _P1));
    593                         L1 = _mm_min_epi16(L1, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p1 + d + 1)), _P1));
    594 
    595                         L2 = _mm_min_epi16(L2, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p2 + d - 1)), _P1));
    596                         L2 = _mm_min_epi16(L2, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p2 + d + 1)), _P1));
    597 
    598                         L3 = _mm_min_epi16(L3, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p3 + d - 1)), _P1));
    599                         L3 = _mm_min_epi16(L3, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p3 + d + 1)), _P1));
    600 
    601                         L0 = _mm_min_epi16(L0, _delta0);
    602                         L0 = _mm_adds_epi16(_mm_subs_epi16(L0, _delta0), Cpd);
    603 
    604                         L1 = _mm_min_epi16(L1, _delta1);
    605                         L1 = _mm_adds_epi16(_mm_subs_epi16(L1, _delta1), Cpd);
    606 
    607                         L2 = _mm_min_epi16(L2, _delta2);
    608                         L2 = _mm_adds_epi16(_mm_subs_epi16(L2, _delta2), Cpd);
    609 
    610                         L3 = _mm_min_epi16(L3, _delta3);
    611                         L3 = _mm_adds_epi16(_mm_subs_epi16(L3, _delta3), Cpd);
    612 
    613                         _mm_store_si128( (__m128i*)(Lr_p + d), L0);
    614                         _mm_store_si128( (__m128i*)(Lr_p + d + D2), L1);
    615                         _mm_store_si128( (__m128i*)(Lr_p + d + D2*2), L2);
    616                         _mm_store_si128( (__m128i*)(Lr_p + d + D2*3), L3);
    617 
    618                         __m128i t0 = _mm_min_epi16(_mm_unpacklo_epi16(L0, L2), _mm_unpackhi_epi16(L0, L2));
    619                         __m128i t1 = _mm_min_epi16(_mm_unpacklo_epi16(L1, L3), _mm_unpackhi_epi16(L1, L3));
    620                         t0 = _mm_min_epi16(_mm_unpacklo_epi16(t0, t1), _mm_unpackhi_epi16(t0, t1));
    621                         _minL0 = _mm_min_epi16(_minL0, t0);
    622 
    623                         __m128i Sval = _mm_load_si128((const __m128i*)(Sp + d));
    624 
    625                         L0 = _mm_adds_epi16(L0, L1);
    626                         L2 = _mm_adds_epi16(L2, L3);
    627                         Sval = _mm_adds_epi16(Sval, L0);
    628                         Sval = _mm_adds_epi16(Sval, L2);
    629 
    630                         _mm_store_si128((__m128i*)(Sp + d), Sval);
    631                     }
    632 
    633                     _minL0 = _mm_min_epi16(_minL0, _mm_srli_si128(_minL0, 8));
    634                     _mm_storel_epi64((__m128i*)&minLr[0][xm], _minL0);
    635                 }
    636                 else
    637             #endif
    638                 {
    639                     int minL0 = MAX_COST, minL1 = MAX_COST, minL2 = MAX_COST, minL3 = MAX_COST;
    640 
    641                     for( d = 0; d < D; d++ )
    642                     {
    643                         int Cpd = Cp[d], L0, L1, L2, L3;
    644 
    645                         L0 = Cpd + std::min((int)Lr_p0[d], std::min(Lr_p0[d-1] + P1, std::min(Lr_p0[d+1] + P1, delta0))) - delta0;
    646                         L1 = Cpd + std::min((int)Lr_p1[d], std::min(Lr_p1[d-1] + P1, std::min(Lr_p1[d+1] + P1, delta1))) - delta1;
    647                         L2 = Cpd + std::min((int)Lr_p2[d], std::min(Lr_p2[d-1] + P1, std::min(Lr_p2[d+1] + P1, delta2))) - delta2;
    648                         L3 = Cpd + std::min((int)Lr_p3[d], std::min(Lr_p3[d-1] + P1, std::min(Lr_p3[d+1] + P1, delta3))) - delta3;
    649 
    650                         Lr_p[d] = (CostType)L0;
    651                         minL0 = std::min(minL0, L0);
    652 
    653                         Lr_p[d + D2] = (CostType)L1;
    654                         minL1 = std::min(minL1, L1);
    655 
    656                         Lr_p[d + D2*2] = (CostType)L2;
    657                         minL2 = std::min(minL2, L2);
    658 
    659                         Lr_p[d + D2*3] = (CostType)L3;
    660                         minL3 = std::min(minL3, L3);
    661 
    662                         Sp[d] = saturate_cast<CostType>(Sp[d] + L0 + L1 + L2 + L3);
    663                     }
    664                     minLr[0][xm] = (CostType)minL0;
    665                     minLr[0][xm+1] = (CostType)minL1;
    666                     minLr[0][xm+2] = (CostType)minL2;
    667                     minLr[0][xm+3] = (CostType)minL3;
    668                 }
    669             }
    670 
    671             if( pass == npasses )
    672             {
    673                 for( x = 0; x < width; x++ )
    674                 {
    675                     disp1ptr[x] = disp2ptr[x] = (DispType)INVALID_DISP_SCALED;
    676                     disp2cost[x] = MAX_COST;
    677                 }
    678 
    679                 for( x = width1 - 1; x >= 0; x-- )
    680                 {
    681                     CostType* Sp = S + x*D;
    682                     int minS = MAX_COST, bestDisp = -1;
    683 
    684                     if( npasses == 1 )
    685                     {
    686                         int xm = x*NR2, xd = xm*D2;
    687 
    688                         int minL0 = MAX_COST;
    689                         int delta0 = minLr[0][xm + NR2] + P2;
    690                         CostType* Lr_p0 = Lr[0] + xd + NRD2;
    691                         Lr_p0[-1] = Lr_p0[D] = MAX_COST;
    692                         CostType* Lr_p = Lr[0] + xd;
    693 
    694                         const CostType* Cp = C + x*D;
    695 
    696                     #if CV_SSE2
    697                         if( useSIMD )
    698                         {
    699                             __m128i _P1 = _mm_set1_epi16((short)P1);
    700                             __m128i _delta0 = _mm_set1_epi16((short)delta0);
    701 
    702                             __m128i _minL0 = _mm_set1_epi16((short)minL0);
    703                             __m128i _minS = _mm_set1_epi16(MAX_COST), _bestDisp = _mm_set1_epi16(-1);
    704                             __m128i _d8 = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7), _8 = _mm_set1_epi16(8);
    705 
    706                             for( d = 0; d < D; d += 8 )
    707                             {
    708                                 __m128i Cpd = _mm_load_si128((const __m128i*)(Cp + d)), L0;
    709 
    710                                 L0 = _mm_load_si128((const __m128i*)(Lr_p0 + d));
    711                                 L0 = _mm_min_epi16(L0, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p0 + d - 1)), _P1));
    712                                 L0 = _mm_min_epi16(L0, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p0 + d + 1)), _P1));
    713                                 L0 = _mm_min_epi16(L0, _delta0);
    714                                 L0 = _mm_adds_epi16(_mm_subs_epi16(L0, _delta0), Cpd);
    715 
    716                                 _mm_store_si128((__m128i*)(Lr_p + d), L0);
    717                                 _minL0 = _mm_min_epi16(_minL0, L0);
    718                                 L0 = _mm_adds_epi16(L0, *(__m128i*)(Sp + d));
    719                                 _mm_store_si128((__m128i*)(Sp + d), L0);
    720 
    721                                 __m128i mask = _mm_cmpgt_epi16(_minS, L0);
    722                                 _minS = _mm_min_epi16(_minS, L0);
    723                                 _bestDisp = _mm_xor_si128(_bestDisp, _mm_and_si128(_mm_xor_si128(_bestDisp,_d8), mask));
    724                                 _d8 = _mm_adds_epi16(_d8, _8);
    725                             }
    726 
    727                             short CV_DECL_ALIGNED(16) bestDispBuf[8];
    728                             _mm_store_si128((__m128i*)bestDispBuf, _bestDisp);
    729 
    730                             _minL0 = _mm_min_epi16(_minL0, _mm_srli_si128(_minL0, 8));
    731                             _minL0 = _mm_min_epi16(_minL0, _mm_srli_si128(_minL0, 4));
    732                             _minL0 = _mm_min_epi16(_minL0, _mm_srli_si128(_minL0, 2));
    733 
    734                             __m128i qS = _mm_min_epi16(_minS, _mm_srli_si128(_minS, 8));
    735                             qS = _mm_min_epi16(qS, _mm_srli_si128(qS, 4));
    736                             qS = _mm_min_epi16(qS, _mm_srli_si128(qS, 2));
    737 
    738                             minLr[0][xm] = (CostType)_mm_cvtsi128_si32(_minL0);
    739                             minS = (CostType)_mm_cvtsi128_si32(qS);
    740 
    741                             qS = _mm_shuffle_epi32(_mm_unpacklo_epi16(qS, qS), 0);
    742                             qS = _mm_cmpeq_epi16(_minS, qS);
    743                             int idx = _mm_movemask_epi8(_mm_packs_epi16(qS, qS)) & 255;
    744 
    745                             bestDisp = bestDispBuf[LSBTab[idx]];
    746                         }
    747                         else
    748                     #endif
    749                         {
    750                             for( d = 0; d < D; d++ )
    751                             {
    752                                 int L0 = Cp[d] + std::min((int)Lr_p0[d], std::min(Lr_p0[d-1] + P1, std::min(Lr_p0[d+1] + P1, delta0))) - delta0;
    753 
    754                                 Lr_p[d] = (CostType)L0;
    755                                 minL0 = std::min(minL0, L0);
    756 
    757                                 int Sval = Sp[d] = saturate_cast<CostType>(Sp[d] + L0);
    758                                 if( Sval < minS )
    759                                 {
    760                                     minS = Sval;
    761                                     bestDisp = d;
    762                                 }
    763                             }
    764                             minLr[0][xm] = (CostType)minL0;
    765                         }
    766                     }
    767                     else
    768                     {
    769                         for( d = 0; d < D; d++ )
    770                         {
    771                             int Sval = Sp[d];
    772                             if( Sval < minS )
    773                             {
    774                                 minS = Sval;
    775                                 bestDisp = d;
    776                             }
    777                         }
    778                     }
    779 
    780                     for( d = 0; d < D; d++ )
    781                     {
    782                         if( Sp[d]*(100 - uniquenessRatio) < minS*100 && std::abs(bestDisp - d) > 1 )
    783                             break;
    784                     }
    785                     if( d < D )
    786                         continue;
    787                     d = bestDisp;
    788                     int _x2 = x + minX1 - d - minD;
    789                     if( disp2cost[_x2] > minS )
    790                     {
    791                         disp2cost[_x2] = (CostType)minS;
    792                         disp2ptr[_x2] = (DispType)(d + minD);
    793                     }
    794 
    795                     if( 0 < d && d < D-1 )
    796                     {
    797                         // do subpixel quadratic interpolation:
    798                         //   fit parabola into (x1=d-1, y1=Sp[d-1]), (x2=d, y2=Sp[d]), (x3=d+1, y3=Sp[d+1])
    799                         //   then find minimum of the parabola.
    800                         int denom2 = std::max(Sp[d-1] + Sp[d+1] - 2*Sp[d], 1);
    801                         d = d*DISP_SCALE + ((Sp[d-1] - Sp[d+1])*DISP_SCALE + denom2)/(denom2*2);
    802                     }
    803                     else
    804                         d *= DISP_SCALE;
    805                     disp1ptr[x + minX1] = (DispType)(d + minD*DISP_SCALE);
    806                 }
    807 
    808                 for( x = minX1; x < maxX1; x++ )
    809                 {
    810                     // we round the computed disparity both towards -inf and +inf and check
    811                     // if either of the corresponding disparities in disp2 is consistent.
    812                     // This is to give the computed disparity a chance to look valid if it is.
    813                     int d1 = disp1ptr[x];
    814                     if( d1 == INVALID_DISP_SCALED )
    815                         continue;
    816                     int _d = d1 >> DISP_SHIFT;
    817                     int d_ = (d1 + DISP_SCALE-1) >> DISP_SHIFT;
    818                     int _x = x - _d, x_ = x - d_;
    819                     if( 0 <= _x && _x < width && disp2ptr[_x] >= minD && std::abs(disp2ptr[_x] - _d) > disp12MaxDiff &&
    820                        0 <= x_ && x_ < width && disp2ptr[x_] >= minD && std::abs(disp2ptr[x_] - d_) > disp12MaxDiff )
    821                         disp1ptr[x] = (DispType)INVALID_DISP_SCALED;
    822                 }
    823             }
    824 
    825             // now shift the cyclic buffers
    826             std::swap( Lr[0], Lr[1] );
    827             std::swap( minLr[0], minLr[1] );
    828         }
    829     }
    830 }
    831 
    832 class StereoSGBMImpl : public StereoSGBM
    833 {
    834 public:
    835     StereoSGBMImpl()
    836     {
    837         params = StereoSGBMParams();
    838     }
    839 
    840     StereoSGBMImpl( int _minDisparity, int _numDisparities, int _SADWindowSize,
    841                     int _P1, int _P2, int _disp12MaxDiff, int _preFilterCap,
    842                     int _uniquenessRatio, int _speckleWindowSize, int _speckleRange,
    843                     int _mode )
    844     {
    845         params = StereoSGBMParams( _minDisparity, _numDisparities, _SADWindowSize,
    846                                    _P1, _P2, _disp12MaxDiff, _preFilterCap,
    847                                    _uniquenessRatio, _speckleWindowSize, _speckleRange,
    848                                    _mode );
    849     }
    850 
    851     void compute( InputArray leftarr, InputArray rightarr, OutputArray disparr )
    852     {
    853         Mat left = leftarr.getMat(), right = rightarr.getMat();
    854         CV_Assert( left.size() == right.size() && left.type() == right.type() &&
    855                    left.depth() == CV_8U );
    856 
    857         disparr.create( left.size(), CV_16S );
    858         Mat disp = disparr.getMat();
    859 
    860         computeDisparitySGBM( left, right, disp, params, buffer );
    861         medianBlur(disp, disp, 3);
    862 
    863         if( params.speckleWindowSize > 0 )
    864             filterSpeckles(disp, (params.minDisparity - 1)*StereoMatcher::DISP_SCALE, params.speckleWindowSize,
    865                            StereoMatcher::DISP_SCALE*params.speckleRange, buffer);
    866     }
    867 
    868     int getMinDisparity() const { return params.minDisparity; }
    869     void setMinDisparity(int minDisparity) { params.minDisparity = minDisparity; }
    870 
    871     int getNumDisparities() const { return params.numDisparities; }
    872     void setNumDisparities(int numDisparities) { params.numDisparities = numDisparities; }
    873 
    874     int getBlockSize() const { return params.SADWindowSize; }
    875     void setBlockSize(int blockSize) { params.SADWindowSize = blockSize; }
    876 
    877     int getSpeckleWindowSize() const { return params.speckleWindowSize; }
    878     void setSpeckleWindowSize(int speckleWindowSize) { params.speckleWindowSize = speckleWindowSize; }
    879 
    880     int getSpeckleRange() const { return params.speckleRange; }
    881     void setSpeckleRange(int speckleRange) { params.speckleRange = speckleRange; }
    882 
    883     int getDisp12MaxDiff() const { return params.disp12MaxDiff; }
    884     void setDisp12MaxDiff(int disp12MaxDiff) { params.disp12MaxDiff = disp12MaxDiff; }
    885 
    886     int getPreFilterCap() const { return params.preFilterCap; }
    887     void setPreFilterCap(int preFilterCap) { params.preFilterCap = preFilterCap; }
    888 
    889     int getUniquenessRatio() const { return params.uniquenessRatio; }
    890     void setUniquenessRatio(int uniquenessRatio) { params.uniquenessRatio = uniquenessRatio; }
    891 
    892     int getP1() const { return params.P1; }
    893     void setP1(int P1) { params.P1 = P1; }
    894 
    895     int getP2() const { return params.P2; }
    896     void setP2(int P2) { params.P2 = P2; }
    897 
    898     int getMode() const { return params.mode; }
    899     void setMode(int mode) { params.mode = mode; }
    900 
    901     void write(FileStorage& fs) const
    902     {
    903         fs << "name" << name_
    904         << "minDisparity" << params.minDisparity
    905         << "numDisparities" << params.numDisparities
    906         << "blockSize" << params.SADWindowSize
    907         << "speckleWindowSize" << params.speckleWindowSize
    908         << "speckleRange" << params.speckleRange
    909         << "disp12MaxDiff" << params.disp12MaxDiff
    910         << "preFilterCap" << params.preFilterCap
    911         << "uniquenessRatio" << params.uniquenessRatio
    912         << "P1" << params.P1
    913         << "P2" << params.P2
    914         << "mode" << params.mode;
    915     }
    916 
    917     void read(const FileNode& fn)
    918     {
    919         FileNode n = fn["name"];
    920         CV_Assert( n.isString() && String(n) == name_ );
    921         params.minDisparity = (int)fn["minDisparity"];
    922         params.numDisparities = (int)fn["numDisparities"];
    923         params.SADWindowSize = (int)fn["blockSize"];
    924         params.speckleWindowSize = (int)fn["speckleWindowSize"];
    925         params.speckleRange = (int)fn["speckleRange"];
    926         params.disp12MaxDiff = (int)fn["disp12MaxDiff"];
    927         params.preFilterCap = (int)fn["preFilterCap"];
    928         params.uniquenessRatio = (int)fn["uniquenessRatio"];
    929         params.P1 = (int)fn["P1"];
    930         params.P2 = (int)fn["P2"];
    931         params.mode = (int)fn["mode"];
    932     }
    933 
    934     StereoSGBMParams params;
    935     Mat buffer;
    936     static const char* name_;
    937 };
    938 
    939 const char* StereoSGBMImpl::name_ = "StereoMatcher.SGBM";
    940 
    941 
    942 Ptr<StereoSGBM> StereoSGBM::create(int minDisparity, int numDisparities, int SADWindowSize,
    943                                  int P1, int P2, int disp12MaxDiff,
    944                                  int preFilterCap, int uniquenessRatio,
    945                                  int speckleWindowSize, int speckleRange,
    946                                  int mode)
    947 {
    948     return Ptr<StereoSGBM>(
    949         new StereoSGBMImpl(minDisparity, numDisparities, SADWindowSize,
    950                            P1, P2, disp12MaxDiff,
    951                            preFilterCap, uniquenessRatio,
    952                            speckleWindowSize, speckleRange,
    953                            mode));
    954 }
    955 
    956 Rect getValidDisparityROI( Rect roi1, Rect roi2,
    957                           int minDisparity,
    958                           int numberOfDisparities,
    959                           int SADWindowSize )
    960 {
    961     int SW2 = SADWindowSize/2;
    962     int minD = minDisparity, maxD = minDisparity + numberOfDisparities - 1;
    963 
    964     int xmin = std::max(roi1.x, roi2.x + maxD) + SW2;
    965     int xmax = std::min(roi1.x + roi1.width, roi2.x + roi2.width - minD) - SW2;
    966     int ymin = std::max(roi1.y, roi2.y) + SW2;
    967     int ymax = std::min(roi1.y + roi1.height, roi2.y + roi2.height) - SW2;
    968 
    969     Rect r(xmin, ymin, xmax - xmin, ymax - ymin);
    970 
    971     return r.width > 0 && r.height > 0 ? r : Rect();
    972 }
    973 
    974 typedef cv::Point_<short> Point2s;
    975 
    976 template <typename T>
    977 void filterSpecklesImpl(cv::Mat& img, int newVal, int maxSpeckleSize, int maxDiff, cv::Mat& _buf)
    978 {
    979     using namespace cv;
    980 
    981     int width = img.cols, height = img.rows, npixels = width*height;
    982     size_t bufSize = npixels*(int)(sizeof(Point2s) + sizeof(int) + sizeof(uchar));
    983     if( !_buf.isContinuous() || _buf.empty() || _buf.cols*_buf.rows*_buf.elemSize() < bufSize )
    984         _buf.create(1, (int)bufSize, CV_8U);
    985 
    986     uchar* buf = _buf.ptr();
    987     int i, j, dstep = (int)(img.step/sizeof(T));
    988     int* labels = (int*)buf;
    989     buf += npixels*sizeof(labels[0]);
    990     Point2s* wbuf = (Point2s*)buf;
    991     buf += npixels*sizeof(wbuf[0]);
    992     uchar* rtype = (uchar*)buf;
    993     int curlabel = 0;
    994 
    995     // clear out label assignments
    996     memset(labels, 0, npixels*sizeof(labels[0]));
    997 
    998     for( i = 0; i < height; i++ )
    999     {
   1000         T* ds = img.ptr<T>(i);
   1001         int* ls = labels + width*i;
   1002 
   1003         for( j = 0; j < width; j++ )
   1004         {
   1005             if( ds[j] != newVal )   // not a bad disparity
   1006             {
   1007                 if( ls[j] )     // has a label, check for bad label
   1008                 {
   1009                     if( rtype[ls[j]] ) // small region, zero out disparity
   1010                         ds[j] = (T)newVal;
   1011                 }
   1012                 // no label, assign and propagate
   1013                 else
   1014                 {
   1015                     Point2s* ws = wbuf; // initialize wavefront
   1016                     Point2s p((short)j, (short)i);  // current pixel
   1017                     curlabel++; // next label
   1018                     int count = 0;  // current region size
   1019                     ls[j] = curlabel;
   1020 
   1021                     // wavefront propagation
   1022                     while( ws >= wbuf ) // wavefront not empty
   1023                     {
   1024                         count++;
   1025                         // put neighbors onto wavefront
   1026                         T* dpp = &img.at<T>(p.y, p.x);
   1027                         T dp = *dpp;
   1028                         int* lpp = labels + width*p.y + p.x;
   1029 
   1030                         if( p.y < height-1 && !lpp[+width] && dpp[+dstep] != newVal && std::abs(dp - dpp[+dstep]) <= maxDiff )
   1031                         {
   1032                             lpp[+width] = curlabel;
   1033                             *ws++ = Point2s(p.x, p.y+1);
   1034                         }
   1035 
   1036                         if( p.y > 0 && !lpp[-width] && dpp[-dstep] != newVal && std::abs(dp - dpp[-dstep]) <= maxDiff )
   1037                         {
   1038                             lpp[-width] = curlabel;
   1039                             *ws++ = Point2s(p.x, p.y-1);
   1040                         }
   1041 
   1042                         if( p.x < width-1 && !lpp[+1] && dpp[+1] != newVal && std::abs(dp - dpp[+1]) <= maxDiff )
   1043                         {
   1044                             lpp[+1] = curlabel;
   1045                             *ws++ = Point2s(p.x+1, p.y);
   1046                         }
   1047 
   1048                         if( p.x > 0 && !lpp[-1] && dpp[-1] != newVal && std::abs(dp - dpp[-1]) <= maxDiff )
   1049                         {
   1050                             lpp[-1] = curlabel;
   1051                             *ws++ = Point2s(p.x-1, p.y);
   1052                         }
   1053 
   1054                         // pop most recent and propagate
   1055                         // NB: could try least recent, maybe better convergence
   1056                         p = *--ws;
   1057                     }
   1058 
   1059                     // assign label type
   1060                     if( count <= maxSpeckleSize )   // speckle region
   1061                     {
   1062                         rtype[ls[j]] = 1;   // small region label
   1063                         ds[j] = (T)newVal;
   1064                     }
   1065                     else
   1066                         rtype[ls[j]] = 0;   // large region label
   1067                 }
   1068             }
   1069         }
   1070     }
   1071 }
   1072 
   1073 }
   1074 
   1075 void cv::filterSpeckles( InputOutputArray _img, double _newval, int maxSpeckleSize,
   1076                          double _maxDiff, InputOutputArray __buf )
   1077 {
   1078     Mat img = _img.getMat();
   1079     int type = img.type();
   1080     Mat temp, &_buf = __buf.needed() ? __buf.getMatRef() : temp;
   1081     CV_Assert( type == CV_8UC1 || type == CV_16SC1 );
   1082 
   1083     int newVal = cvRound(_newval), maxDiff = cvRound(_maxDiff);
   1084 
   1085 #if IPP_VERSION_X100 >= 801
   1086     CV_IPP_CHECK()
   1087     {
   1088         Ipp32s bufsize = 0;
   1089         IppiSize roisize = { img.cols, img.rows };
   1090         IppDataType datatype = type == CV_8UC1 ? ipp8u : ipp16s;
   1091 
   1092         if (!__buf.needed() && (type == CV_8UC1 || type == CV_16SC1))
   1093         {
   1094             IppStatus status = ippiMarkSpecklesGetBufferSize(roisize, datatype, CV_MAT_CN(type), &bufsize);
   1095             Ipp8u * buffer = ippsMalloc_8u(bufsize);
   1096 
   1097             if ((int)status >= 0)
   1098             {
   1099                 if (type == CV_8UC1)
   1100                     status = ippiMarkSpeckles_8u_C1IR(img.ptr<Ipp8u>(), (int)img.step, roisize,
   1101                                                       (Ipp8u)newVal, maxSpeckleSize, (Ipp8u)maxDiff, ippiNormL1, buffer);
   1102                 else
   1103                     status = ippiMarkSpeckles_16s_C1IR(img.ptr<Ipp16s>(), (int)img.step, roisize,
   1104                                                        (Ipp16s)newVal, maxSpeckleSize, (Ipp16s)maxDiff, ippiNormL1, buffer);
   1105             }
   1106 
   1107             if (status >= 0)
   1108             {
   1109                 CV_IMPL_ADD(CV_IMPL_IPP);
   1110                 return;
   1111             }
   1112             setIppErrorStatus();
   1113         }
   1114     }
   1115 #endif
   1116 
   1117     if (type == CV_8UC1)
   1118         filterSpecklesImpl<uchar>(img, newVal, maxSpeckleSize, maxDiff, _buf);
   1119     else
   1120         filterSpecklesImpl<short>(img, newVal, maxSpeckleSize, maxDiff, _buf);
   1121 }
   1122 
   1123 void cv::validateDisparity( InputOutputArray _disp, InputArray _cost, int minDisparity,
   1124                             int numberOfDisparities, int disp12MaxDiff )
   1125 {
   1126     Mat disp = _disp.getMat(), cost = _cost.getMat();
   1127     int cols = disp.cols, rows = disp.rows;
   1128     int minD = minDisparity, maxD = minDisparity + numberOfDisparities;
   1129     int x, minX1 = std::max(maxD, 0), maxX1 = cols + std::min(minD, 0);
   1130     AutoBuffer<int> _disp2buf(cols*2);
   1131     int* disp2buf = _disp2buf;
   1132     int* disp2cost = disp2buf + cols;
   1133     const int DISP_SHIFT = 4, DISP_SCALE = 1 << DISP_SHIFT;
   1134     int INVALID_DISP = minD - 1, INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE;
   1135     int costType = cost.type();
   1136 
   1137     disp12MaxDiff *= DISP_SCALE;
   1138 
   1139     CV_Assert( numberOfDisparities > 0 && disp.type() == CV_16S &&
   1140               (costType == CV_16S || costType == CV_32S) &&
   1141               disp.size() == cost.size() );
   1142 
   1143     for( int y = 0; y < rows; y++ )
   1144     {
   1145         short* dptr = disp.ptr<short>(y);
   1146 
   1147         for( x = 0; x < cols; x++ )
   1148         {
   1149             disp2buf[x] = INVALID_DISP_SCALED;
   1150             disp2cost[x] = INT_MAX;
   1151         }
   1152 
   1153         if( costType == CV_16S )
   1154         {
   1155             const short* cptr = cost.ptr<short>(y);
   1156 
   1157             for( x = minX1; x < maxX1; x++ )
   1158             {
   1159                 int d = dptr[x], c = cptr[x];
   1160                 int x2 = x - ((d + DISP_SCALE/2) >> DISP_SHIFT);
   1161 
   1162                 if( disp2cost[x2] > c )
   1163                 {
   1164                     disp2cost[x2] = c;
   1165                     disp2buf[x2] = d;
   1166                 }
   1167             }
   1168         }
   1169         else
   1170         {
   1171             const int* cptr = cost.ptr<int>(y);
   1172 
   1173             for( x = minX1; x < maxX1; x++ )
   1174             {
   1175                 int d = dptr[x], c = cptr[x];
   1176                 int x2 = x - ((d + DISP_SCALE/2) >> DISP_SHIFT);
   1177 
   1178                 if( disp2cost[x2] < c )
   1179                 {
   1180                     disp2cost[x2] = c;
   1181                     disp2buf[x2] = d;
   1182                 }
   1183             }
   1184         }
   1185 
   1186         for( x = minX1; x < maxX1; x++ )
   1187         {
   1188             // we round the computed disparity both towards -inf and +inf and check
   1189             // if either of the corresponding disparities in disp2 is consistent.
   1190             // This is to give the computed disparity a chance to look valid if it is.
   1191             int d = dptr[x];
   1192             if( d == INVALID_DISP_SCALED )
   1193                 continue;
   1194             int d0 = d >> DISP_SHIFT;
   1195             int d1 = (d + DISP_SCALE-1) >> DISP_SHIFT;
   1196             int x0 = x - d0, x1 = x - d1;
   1197             if( (0 <= x0 && x0 < cols && disp2buf[x0] > INVALID_DISP_SCALED && std::abs(disp2buf[x0] - d) > disp12MaxDiff) &&
   1198                 (0 <= x1 && x1 < cols && disp2buf[x1] > INVALID_DISP_SCALED && std::abs(disp2buf[x1] - d) > disp12MaxDiff) )
   1199                 dptr[x] = (short)INVALID_DISP_SCALED;
   1200         }
   1201     }
   1202 }
   1203