/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistributions of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistributions in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#include "precomp.hpp"
#include "opencl_kernels_imgproc.hpp"

/****************************************************************************************\
                                    Base Image Filter
\****************************************************************************************/

#if IPP_VERSION_X100 >= 701
#define USE_IPP_SEP_FILTERS 1
#else
#undef USE_IPP_SEP_FILTERS
#endif

namespace cv
{

BaseRowFilter::BaseRowFilter() { ksize = anchor = -1; }
BaseRowFilter::~BaseRowFilter() {}

BaseColumnFilter::BaseColumnFilter() { ksize = anchor = -1; }
BaseColumnFilter::~BaseColumnFilter() {}
void BaseColumnFilter::reset() {}

BaseFilter::BaseFilter() { ksize = Size(-1,-1); anchor = Point(-1,-1); }
BaseFilter::~BaseFilter() {}
void BaseFilter::reset() {}

FilterEngine::FilterEngine()
{
    srcType = dstType = bufType = -1;
    rowBorderType = columnBorderType = BORDER_REPLICATE;
    bufStep = startY = startY0 = endY = rowCount = dstY = 0;
    maxWidth = 0;

    wholeSize = Size(-1,-1);
}


FilterEngine::FilterEngine( const Ptr<BaseFilter>& _filter2D,
                            const Ptr<BaseRowFilter>& _rowFilter,
                            const Ptr<BaseColumnFilter>& _columnFilter,
                            int _srcType, int _dstType, int _bufType,
                            int _rowBorderType, int _columnBorderType,
                            const Scalar& _borderValue )
{
    init(_filter2D, _rowFilter, _columnFilter, _srcType, _dstType, _bufType,
         _rowBorderType, _columnBorderType, _borderValue);
}

FilterEngine::~FilterEngine()
{
}


void FilterEngine::init( const Ptr<BaseFilter>& _filter2D,
                         const Ptr<BaseRowFilter>& _rowFilter,
                         const Ptr<BaseColumnFilter>& _columnFilter,
                         int _srcType, int _dstType, int _bufType,
                         int _rowBorderType, int _columnBorderType,
                         const Scalar& _borderValue )
{
    _srcType = CV_MAT_TYPE(_srcType);
    _bufType = CV_MAT_TYPE(_bufType);
    _dstType = CV_MAT_TYPE(_dstType);

    srcType = _srcType;
    int srcElemSize = (int)getElemSize(srcType);
    dstType = _dstType;
    bufType = _bufType;

    filter2D = _filter2D;
    rowFilter = _rowFilter;
    columnFilter = _columnFilter;

    if( _columnBorderType < 0 )
        _columnBorderType = _rowBorderType;

    rowBorderType = _rowBorderType;
    columnBorderType = _columnBorderType;

    CV_Assert( columnBorderType != BORDER_WRAP );

    if( isSeparable() )
    {
        CV_Assert( rowFilter && columnFilter );
        ksize = Size(rowFilter->ksize, columnFilter->ksize);
        anchor = Point(rowFilter->anchor, columnFilter->anchor);
    }
    else
    {
        CV_Assert( bufType == srcType );
        ksize = filter2D->ksize;
        anchor = filter2D->anchor;
    }

    CV_Assert( 0 <= anchor.x && anchor.x < ksize.width &&
               0 <= anchor.y && anchor.y < ksize.height );

    borderElemSize = srcElemSize/(CV_MAT_DEPTH(srcType) >= CV_32S ? sizeof(int) : 1);
    int borderLength = std::max(ksize.width - 1, 1);
    borderTab.resize(borderLength*borderElemSize);

    maxWidth = bufStep = 0;
    constBorderRow.clear();

    if( rowBorderType == BORDER_CONSTANT || columnBorderType == BORDER_CONSTANT )
    {
        constBorderValue.resize(srcElemSize*borderLength);
        int srcType1 = CV_MAKETYPE(CV_MAT_DEPTH(srcType), MIN(CV_MAT_CN(srcType), 4));
        scalarToRawData(_borderValue, &constBorderValue[0], srcType1,
                        borderLength*CV_MAT_CN(srcType));
    }

    wholeSize = Size(-1,-1);
}
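
// Illustrative state after init() (assumed example values, not from the
// original source): for a separable 3x3 kernel anchored at (1,1) on CV_8UC1
// input,
//   ksize == Size(3,3), anchor == Point(1,1),
//   borderElemSize == 1 (8-bit depth), borderLength == max(3-1,1) == 2,
// so borderTab holds 2 source offsets, which start() later fills in order to
// synthesize the missing column at each end of every row.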

#define VEC_ALIGN CV_MALLOC_ALIGN

int FilterEngine::start(Size _wholeSize, Rect _roi, int _maxBufRows)
{
    int i, j;

    wholeSize = _wholeSize;
    roi = _roi;
    CV_Assert( roi.x >= 0 && roi.y >= 0 && roi.width >= 0 && roi.height >= 0 &&
        roi.x + roi.width <= wholeSize.width &&
        roi.y + roi.height <= wholeSize.height );

    int esz = (int)getElemSize(srcType);
    int bufElemSize = (int)getElemSize(bufType);
    const uchar* constVal = !constBorderValue.empty() ? &constBorderValue[0] : 0;

    if( _maxBufRows < 0 )
        _maxBufRows = ksize.height + 3;
    _maxBufRows = std::max(_maxBufRows, std::max(anchor.y, ksize.height-anchor.y-1)*2+1);

    if( maxWidth < roi.width || _maxBufRows != (int)rows.size() )
    {
        rows.resize(_maxBufRows);
        maxWidth = std::max(maxWidth, roi.width);
        int cn = CV_MAT_CN(srcType);
        srcRow.resize(esz*(maxWidth + ksize.width - 1));
        if( columnBorderType == BORDER_CONSTANT )
        {
            constBorderRow.resize(getElemSize(bufType)*(maxWidth + ksize.width - 1 + VEC_ALIGN));
            uchar *dst = alignPtr(&constBorderRow[0], VEC_ALIGN), *tdst;
            int n = (int)constBorderValue.size(), N;
            N = (maxWidth + ksize.width - 1)*esz;
            tdst = isSeparable() ? &srcRow[0] : dst;

            for( i = 0; i < N; i += n )
            {
                n = std::min( n, N - i );
                for(j = 0; j < n; j++)
                    tdst[i+j] = constVal[j];
            }

            if( isSeparable() )
                (*rowFilter)(&srcRow[0], dst, maxWidth, cn);
        }

        int maxBufStep = bufElemSize*(int)alignSize(maxWidth +
            (!isSeparable() ? ksize.width - 1 : 0),VEC_ALIGN);
        ringBuf.resize(maxBufStep*rows.size()+VEC_ALIGN);
    }

    // adjust bufstep so that the used part of the ring buffer stays compact in memory
    bufStep = bufElemSize*(int)alignSize(roi.width + (!isSeparable() ? ksize.width - 1 : 0),16);

    dx1 = std::max(anchor.x - roi.x, 0);
    dx2 = std::max(ksize.width - anchor.x - 1 + roi.x + roi.width - wholeSize.width, 0);

    // recompute border tables
    if( dx1 > 0 || dx2 > 0 )
    {
        if( rowBorderType == BORDER_CONSTANT )
        {
            int nr = isSeparable() ? 1 : (int)rows.size();
            for( i = 0; i < nr; i++ )
            {
                uchar* dst = isSeparable() ? &srcRow[0] : alignPtr(&ringBuf[0],VEC_ALIGN) + bufStep*i;
                memcpy( dst, constVal, dx1*esz );
                memcpy( dst + (roi.width + ksize.width - 1 - dx2)*esz, constVal, dx2*esz );
            }
        }
        else
        {
            int xofs1 = std::min(roi.x, anchor.x) - roi.x;

            int btab_esz = borderElemSize, wholeWidth = wholeSize.width;
            int* btab = (int*)&borderTab[0];

            for( i = 0; i < dx1; i++ )
            {
                int p0 = (borderInterpolate(i-dx1, wholeWidth, rowBorderType) + xofs1)*btab_esz;
                for( j = 0; j < btab_esz; j++ )
                    btab[i*btab_esz + j] = p0 + j;
            }

            for( i = 0; i < dx2; i++ )
            {
                int p0 = (borderInterpolate(wholeWidth + i, wholeWidth, rowBorderType) + xofs1)*btab_esz;
                for( j = 0; j < btab_esz; j++ )
                    btab[(i + dx1)*btab_esz + j] = p0 + j;
            }
        }
    }

    rowCount = dstY = 0;
    startY = startY0 = std::max(roi.y - anchor.y, 0);
    endY = std::min(roi.y + roi.height + ksize.height - anchor.y - 1, wholeSize.height);
    if( columnFilter )
        columnFilter->reset();
    if( filter2D )
        filter2D->reset();

    return startY;
}
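
// Worked example (assumed values, for illustration only): with a 5x5 kernel
// anchored at (2,2), roi == Rect(0,0,width,height) and a sufficiently tall
// image of height H, the code above yields
//   startY = max(0 - 2, 0)              = 0
//   endY   = min(height + 5 - 2 - 1, H) = min(height + 2, H)
// i.e. the engine reads two extra source rows below the ROI, while the two
// missing rows above it are synthesized via borderInterpolate()/constBorderRow.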


int FilterEngine::start(const Mat& src, const Rect& _srcRoi,
                        bool isolated, int maxBufRows)
{
    Rect srcRoi = _srcRoi;

    if( srcRoi == Rect(0,0,-1,-1) )
        srcRoi = Rect(0,0,src.cols,src.rows);

    CV_Assert( srcRoi.x >= 0 && srcRoi.y >= 0 &&
        srcRoi.width >= 0 && srcRoi.height >= 0 &&
        srcRoi.x + srcRoi.width <= src.cols &&
        srcRoi.y + srcRoi.height <= src.rows );

    Point ofs;
    Size wsz(src.cols, src.rows);
    if( !isolated )
        src.locateROI( wsz, ofs );
    start( wsz, srcRoi + ofs, maxBufRows );

    return startY - ofs.y;
}


int FilterEngine::remainingInputRows() const
{
    return endY - startY - rowCount;
}

int FilterEngine::remainingOutputRows() const
{
    return roi.height - dstY;
}

int FilterEngine::proceed( const uchar* src, int srcstep, int count,
                           uchar* dst, int dststep )
{
    CV_Assert( wholeSize.width > 0 && wholeSize.height > 0 );

    const int *btab = &borderTab[0];
    int esz = (int)getElemSize(srcType), btab_esz = borderElemSize;
    uchar** brows = &rows[0];
    int bufRows = (int)rows.size();
    int cn = CV_MAT_CN(bufType);
    int width = roi.width, kwidth = ksize.width;
    int kheight = ksize.height, ay = anchor.y;
    int _dx1 = dx1, _dx2 = dx2;
    int width1 = roi.width + kwidth - 1;
    int xofs1 = std::min(roi.x, anchor.x);
    bool isSep = isSeparable();
    bool makeBorder = (_dx1 > 0 || _dx2 > 0) && rowBorderType != BORDER_CONSTANT;
    int dy = 0, i = 0;

    src -= xofs1*esz;
    count = std::min(count, remainingInputRows());

    CV_Assert( src && dst && count > 0 );

    for(;; dst += dststep*i, dy += i)
    {
        int dcount = bufRows - ay - startY - rowCount + roi.y;
        dcount = dcount > 0 ? dcount : bufRows - kheight + 1;
        dcount = std::min(dcount, count);
        count -= dcount;
        for( ; dcount-- > 0; src += srcstep )
        {
            int bi = (startY - startY0 + rowCount) % bufRows;
            uchar* brow = alignPtr(&ringBuf[0], VEC_ALIGN) + bi*bufStep;
            uchar* row = isSep ? &srcRow[0] : brow;

            if( ++rowCount > bufRows )
            {
                --rowCount;
                ++startY;
            }

            memcpy( row + _dx1*esz, src, (width1 - _dx2 - _dx1)*esz );

            if( makeBorder )
            {
                if( btab_esz*(int)sizeof(int) == esz )
                {
                    const int* isrc = (const int*)src;
                    int* irow = (int*)row;

                    for( i = 0; i < _dx1*btab_esz; i++ )
                        irow[i] = isrc[btab[i]];
                    for( i = 0; i < _dx2*btab_esz; i++ )
                        irow[i + (width1 - _dx2)*btab_esz] = isrc[btab[i+_dx1*btab_esz]];
                }
                else
                {
                    for( i = 0; i < _dx1*esz; i++ )
                        row[i] = src[btab[i]];
                    for( i = 0; i < _dx2*esz; i++ )
                        row[i + (width1 - _dx2)*esz] = src[btab[i+_dx1*esz]];
                }
            }

            if( isSep )
                (*rowFilter)(row, brow, width, CV_MAT_CN(srcType));
        }

        int max_i = std::min(bufRows, roi.height - (dstY + dy) + (kheight - 1));
        for( i = 0; i < max_i; i++ )
        {
            int srcY = borderInterpolate(dstY + dy + i + roi.y - ay,
                            wholeSize.height, columnBorderType);
            if( srcY < 0 ) // can happen only with constant border type
                brows[i] = alignPtr(&constBorderRow[0], VEC_ALIGN);
            else
            {
                CV_Assert( srcY >= startY );
                if( srcY >= startY + rowCount )
                    break;
                int bi = (srcY - startY0) % bufRows;
                brows[i] = alignPtr(&ringBuf[0], VEC_ALIGN) + bi*bufStep;
            }
        }
        if( i < kheight )
            break;
        i -= kheight - 1;
        if( isSeparable() )
            (*columnFilter)((const uchar**)brows, dst, dststep, i, roi.width*cn);
        else
            (*filter2D)((const uchar**)brows, dst, dststep, i, roi.width, cn);
    }

    dstY += dy;
    CV_Assert( dstY <= roi.height );
    return dy;
}
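
// Streaming sketch (an assumed caller pattern, not code from this file):
// proceed() may be fed the input in arbitrary row chunks; it buffers rows in
// the ring buffer until a full kernel window is available and returns how many
// output rows it produced. The names y0, chunk, srcdata, dstdata, srcstep and
// dststep below are hypothetical:
//
//     uchar* dstp = dstdata;
//     for( int y = y0; engine.remainingInputRows() > 0; )
//     {
//         int n = std::min(chunk, engine.remainingInputRows());
//         int outRows = engine.proceed(srcdata + y*srcstep, srcstep, n,
//                                      dstp, dststep);
//         y += n;
//         dstp += outRows*dststep;
//     }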


void FilterEngine::apply(const Mat& src, Mat& dst,
    const Rect& _srcRoi, Point dstOfs, bool isolated)
{
    CV_Assert( src.type() == srcType && dst.type() == dstType );

    Rect srcRoi = _srcRoi;
    if( srcRoi == Rect(0,0,-1,-1) )
        srcRoi = Rect(0,0,src.cols,src.rows);

    if( srcRoi.area() == 0 )
        return;

    CV_Assert( dstOfs.x >= 0 && dstOfs.y >= 0 &&
        dstOfs.x + srcRoi.width <= dst.cols &&
        dstOfs.y + srcRoi.height <= dst.rows );

    int y = start(src, srcRoi, isolated);
    proceed( src.ptr() + y*src.step + srcRoi.x*src.elemSize(),
             (int)src.step, endY - startY,
             dst.ptr(dstOfs.y) +
             dstOfs.x*dst.elemSize(), (int)dst.step );
}
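
// Typical end-to-end use, as a minimal sketch (createSeparableLinearFilter is
// defined further down in this file; src and dst are assumed CV_8UC1 images
// of equal size):
//
//     Mat rowK = getGaussianKernel(5, -1, CV_32F);
//     Mat colK = rowK;                              // same kernel both ways
//     Ptr<FilterEngine> f = createSeparableLinearFilter(src.type(), dst.type(),
//                                                       rowK, colK);
//     f->apply(src, dst);                           // whole-image ROI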

}

/****************************************************************************************\
*                                 Separable linear filter                                *
\****************************************************************************************/

int cv::getKernelType(InputArray filter_kernel, Point anchor)
{
    Mat _kernel = filter_kernel.getMat();
    CV_Assert( _kernel.channels() == 1 );
    int i, sz = _kernel.rows*_kernel.cols;

    Mat kernel;
    _kernel.convertTo(kernel, CV_64F);

    const double* coeffs = kernel.ptr<double>();
    double sum = 0;
    int type = KERNEL_SMOOTH + KERNEL_INTEGER;
    if( (_kernel.rows == 1 || _kernel.cols == 1) &&
        anchor.x*2 + 1 == _kernel.cols &&
        anchor.y*2 + 1 == _kernel.rows )
        type |= (KERNEL_SYMMETRICAL + KERNEL_ASYMMETRICAL);

    for( i = 0; i < sz; i++ )
    {
        double a = coeffs[i], b = coeffs[sz - i - 1];
        if( a != b )
            type &= ~KERNEL_SYMMETRICAL;
        if( a != -b )
            type &= ~KERNEL_ASYMMETRICAL;
        if( a < 0 )
            type &= ~KERNEL_SMOOTH;
        if( a != saturate_cast<int>(a) )
            type &= ~KERNEL_INTEGER;
        sum += a;
    }

    if( fabs(sum - 1) > FLT_EPSILON*(fabs(sum) + 1) )
        type &= ~KERNEL_SMOOTH;
    return type;
}
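
// Two worked examples (illustrative, not from the original source):
//   kernel = {0.25, 0.5, 0.25}, anchor = (1,0)
//     -> KERNEL_SMOOTH | KERNEL_SYMMETRICAL
//        (coefficients non-negative and sum to 1; a == b for every mirrored
//        pair, but a != -b kills KERNEL_ASYMMETRICAL and the fractional
//        values kill KERNEL_INTEGER)
//   kernel = {-1, 0, 1}, anchor = (1,0)
//     -> KERNEL_ASYMMETRICAL | KERNEL_INTEGER
//        (a == -b for every mirrored pair; the negative coefficient kills
//        KERNEL_SMOOTH and a != b kills KERNEL_SYMMETRICAL)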


namespace cv
{

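// The *NoVec functors below are no-op fallbacks: their operator() reports that
// 0 pixels were processed, so the scalar loops in the row/column filters handle
// the whole line whenever no vectorized implementation is available.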
struct RowNoVec
{
    RowNoVec() {}
    RowNoVec(const Mat&) {}
    int operator()(const uchar*, uchar*, int, int) const { return 0; }
};

struct ColumnNoVec
{
    ColumnNoVec() {}
    ColumnNoVec(const Mat&, int, int, double) {}
    int operator()(const uchar**, uchar*, int) const { return 0; }
};

struct SymmRowSmallNoVec
{
    SymmRowSmallNoVec() {}
    SymmRowSmallNoVec(const Mat&, int) {}
    int operator()(const uchar*, uchar*, int, int) const { return 0; }
};

struct SymmColumnSmallNoVec
{
    SymmColumnSmallNoVec() {}
    SymmColumnSmallNoVec(const Mat&, int, int, double) {}
    int operator()(const uchar**, uchar*, int) const { return 0; }
};

struct FilterNoVec
{
    FilterNoVec() {}
    FilterNoVec(const Mat&, int, double) {}
    int operator()(const uchar**, uchar*, int) const { return 0; }
};


#if CV_SSE2

///////////////////////////////////// 8u-16s & 8u-8u //////////////////////////////////

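// Note on the SSE2 idiom used throughout this block: SSE2 has no widening
// 16x16 -> 32-bit multiply, so the code pairs _mm_mullo_epi16 (low 16 bits of
// each product) with _mm_mulhi_epi16 (high 16 bits) and interleaves the two
// results via _mm_unpacklo_epi16/_mm_unpackhi_epi16 to reconstruct the full
// 32-bit products before accumulating them with _mm_add_epi32. This is why the
// kernel coefficients must fit into 16 bits (the smallValues check).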
struct RowVec_8u32s
{
    RowVec_8u32s() { smallValues = false; }
    RowVec_8u32s( const Mat& _kernel )
    {
        kernel = _kernel;
        smallValues = true;
        int k, ksize = kernel.rows + kernel.cols - 1;
        for( k = 0; k < ksize; k++ )
        {
            int v = kernel.ptr<int>()[k];
            if( v < SHRT_MIN || v > SHRT_MAX )
            {
                smallValues = false;
                break;
            }
        }
    }

    int operator()(const uchar* _src, uchar* _dst, int width, int cn) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE2) )
            return 0;

        int i = 0, k, _ksize = kernel.rows + kernel.cols - 1;
        int* dst = (int*)_dst;
        const int* _kx = kernel.ptr<int>();
        width *= cn;

        if( smallValues )
        {
            for( ; i <= width - 16; i += 16 )
            {
                const uchar* src = _src + i;
                __m128i f, z = _mm_setzero_si128(), s0 = z, s1 = z, s2 = z, s3 = z;
                __m128i x0, x1, x2, x3;

                for( k = 0; k < _ksize; k++, src += cn )
                {
                    f = _mm_cvtsi32_si128(_kx[k]);
                    f = _mm_shuffle_epi32(f, 0);
                    f = _mm_packs_epi32(f, f);

                    x0 = _mm_loadu_si128((const __m128i*)src);
                    x2 = _mm_unpackhi_epi8(x0, z);
                    x0 = _mm_unpacklo_epi8(x0, z);
                    x1 = _mm_mulhi_epi16(x0, f);
                    x3 = _mm_mulhi_epi16(x2, f);
                    x0 = _mm_mullo_epi16(x0, f);
                    x2 = _mm_mullo_epi16(x2, f);

                    s0 = _mm_add_epi32(s0, _mm_unpacklo_epi16(x0, x1));
                    s1 = _mm_add_epi32(s1, _mm_unpackhi_epi16(x0, x1));
                    s2 = _mm_add_epi32(s2, _mm_unpacklo_epi16(x2, x3));
                    s3 = _mm_add_epi32(s3, _mm_unpackhi_epi16(x2, x3));
                }

                _mm_store_si128((__m128i*)(dst + i), s0);
                _mm_store_si128((__m128i*)(dst + i + 4), s1);
                _mm_store_si128((__m128i*)(dst + i + 8), s2);
                _mm_store_si128((__m128i*)(dst + i + 12), s3);
            }

            for( ; i <= width - 4; i += 4 )
            {
                const uchar* src = _src + i;
                __m128i f, z = _mm_setzero_si128(), s0 = z, x0, x1;

                for( k = 0; k < _ksize; k++, src += cn )
                {
                    f = _mm_cvtsi32_si128(_kx[k]);
                    f = _mm_shuffle_epi32(f, 0);
                    f = _mm_packs_epi32(f, f);

                    x0 = _mm_cvtsi32_si128(*(const int*)src);
                    x0 = _mm_unpacklo_epi8(x0, z);
                    x1 = _mm_mulhi_epi16(x0, f);
                    x0 = _mm_mullo_epi16(x0, f);
                    s0 = _mm_add_epi32(s0, _mm_unpacklo_epi16(x0, x1));
                }
                _mm_store_si128((__m128i*)(dst + i), s0);
            }
        }
        return i;
    }

    Mat kernel;
    bool smallValues;
};


struct SymmRowSmallVec_8u32s
{
    SymmRowSmallVec_8u32s() { smallValues = false; }
    SymmRowSmallVec_8u32s( const Mat& _kernel, int _symmetryType )
    {
        kernel = _kernel;
        symmetryType = _symmetryType;
        smallValues = true;
        int k, ksize = kernel.rows + kernel.cols - 1;
        for( k = 0; k < ksize; k++ )
        {
            int v = kernel.ptr<int>()[k];
            if( v < SHRT_MIN || v > SHRT_MAX )
            {
                smallValues = false;
                break;
            }
        }
    }

    int operator()(const uchar* src, uchar* _dst, int width, int cn) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE2) )
            return 0;

        int i = 0, j, k, _ksize = kernel.rows + kernel.cols - 1;
        int* dst = (int*)_dst;
        bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
        const int* kx = kernel.ptr<int>() + _ksize/2;
        if( !smallValues )
            return 0;

        src += (_ksize/2)*cn;
        width *= cn;

        __m128i z = _mm_setzero_si128();
        if( symmetrical )
        {
            if( _ksize == 1 )
                return 0;
            if( _ksize == 3 )
            {
                if( kx[0] == 2 && kx[1] == 1 )
                    for( ; i <= width - 16; i += 16, src += 16 )
                    {
                        __m128i x0, x1, x2, y0, y1, y2;
                        x0 = _mm_loadu_si128((__m128i*)(src - cn));
                        x1 = _mm_loadu_si128((__m128i*)src);
                        x2 = _mm_loadu_si128((__m128i*)(src + cn));
                        y0 = _mm_unpackhi_epi8(x0, z);
                        x0 = _mm_unpacklo_epi8(x0, z);
                        y1 = _mm_unpackhi_epi8(x1, z);
                        x1 = _mm_unpacklo_epi8(x1, z);
                        y2 = _mm_unpackhi_epi8(x2, z);
                        x2 = _mm_unpacklo_epi8(x2, z);
                        x0 = _mm_add_epi16(x0, _mm_add_epi16(_mm_add_epi16(x1, x1), x2));
                        y0 = _mm_add_epi16(y0, _mm_add_epi16(_mm_add_epi16(y1, y1), y2));
                        _mm_store_si128((__m128i*)(dst + i), _mm_unpacklo_epi16(x0, z));
                        _mm_store_si128((__m128i*)(dst + i + 4), _mm_unpackhi_epi16(x0, z));
                        _mm_store_si128((__m128i*)(dst + i + 8), _mm_unpacklo_epi16(y0, z));
                        _mm_store_si128((__m128i*)(dst + i + 12), _mm_unpackhi_epi16(y0, z));
                    }
                else if( kx[0] == -2 && kx[1] == 1 )
                    for( ; i <= width - 16; i += 16, src += 16 )
                    {
                        __m128i x0, x1, x2, y0, y1, y2;
                        x0 = _mm_loadu_si128((__m128i*)(src - cn));
                        x1 = _mm_loadu_si128((__m128i*)src);
                        x2 = _mm_loadu_si128((__m128i*)(src + cn));
                        y0 = _mm_unpackhi_epi8(x0, z);
                        x0 = _mm_unpacklo_epi8(x0, z);
                        y1 = _mm_unpackhi_epi8(x1, z);
                        x1 = _mm_unpacklo_epi8(x1, z);
                        y2 = _mm_unpackhi_epi8(x2, z);
                        x2 = _mm_unpacklo_epi8(x2, z);
                        x0 = _mm_add_epi16(x0, _mm_sub_epi16(x2, _mm_add_epi16(x1, x1)));
                        y0 = _mm_add_epi16(y0, _mm_sub_epi16(y2, _mm_add_epi16(y1, y1)));
                        _mm_store_si128((__m128i*)(dst + i), _mm_srai_epi32(_mm_unpacklo_epi16(x0, x0),16));
                        _mm_store_si128((__m128i*)(dst + i + 4), _mm_srai_epi32(_mm_unpackhi_epi16(x0, x0),16));
                        _mm_store_si128((__m128i*)(dst + i + 8), _mm_srai_epi32(_mm_unpacklo_epi16(y0, y0),16));
                        _mm_store_si128((__m128i*)(dst + i + 12), _mm_srai_epi32(_mm_unpackhi_epi16(y0, y0),16));
                    }
                else
                {
                    __m128i k0 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[0]), 0),
                            k1 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[1]), 0);
                    k0 = _mm_packs_epi32(k0, k0);
                    k1 = _mm_packs_epi32(k1, k1);

                    for( ; i <= width - 16; i += 16, src += 16 )
                    {
                        __m128i x0, x1, x2, y0, y1, t0, t1, z0, z1, z2, z3;
                        x0 = _mm_loadu_si128((__m128i*)(src - cn));
                        x1 = _mm_loadu_si128((__m128i*)src);
                        x2 = _mm_loadu_si128((__m128i*)(src + cn));
                        y0 = _mm_add_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x2, z));
                        x0 = _mm_add_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x2, z));
                        y1 = _mm_unpackhi_epi8(x1, z);
                        x1 = _mm_unpacklo_epi8(x1, z);

                        t1 = _mm_mulhi_epi16(x1, k0);
                        t0 = _mm_mullo_epi16(x1, k0);
                        x2 = _mm_mulhi_epi16(x0, k1);
                        x0 = _mm_mullo_epi16(x0, k1);
                        z0 = _mm_unpacklo_epi16(t0, t1);
                        z1 = _mm_unpackhi_epi16(t0, t1);
                        z0 = _mm_add_epi32(z0, _mm_unpacklo_epi16(x0, x2));
                        z1 = _mm_add_epi32(z1, _mm_unpackhi_epi16(x0, x2));

                        t1 = _mm_mulhi_epi16(y1, k0);
                        t0 = _mm_mullo_epi16(y1, k0);
                        y1 = _mm_mulhi_epi16(y0, k1);
                        y0 = _mm_mullo_epi16(y0, k1);
                        z2 = _mm_unpacklo_epi16(t0, t1);
                        z3 = _mm_unpackhi_epi16(t0, t1);
                        z2 = _mm_add_epi32(z2, _mm_unpacklo_epi16(y0, y1));
                        z3 = _mm_add_epi32(z3, _mm_unpackhi_epi16(y0, y1));
                        _mm_store_si128((__m128i*)(dst + i), z0);
                        _mm_store_si128((__m128i*)(dst + i + 4), z1);
                        _mm_store_si128((__m128i*)(dst + i + 8), z2);
                        _mm_store_si128((__m128i*)(dst + i + 12), z3);
                    }
                }
            }
            else if( _ksize == 5 )
            {
                if( kx[0] == -2 && kx[1] == 0 && kx[2] == 1 )
                    for( ; i <= width - 16; i += 16, src += 16 )
                    {
                        __m128i x0, x1, x2, y0, y1, y2;
                        x0 = _mm_loadu_si128((__m128i*)(src - cn*2));
                        x1 = _mm_loadu_si128((__m128i*)src);
                        x2 = _mm_loadu_si128((__m128i*)(src + cn*2));
                        y0 = _mm_unpackhi_epi8(x0, z);
                        x0 = _mm_unpacklo_epi8(x0, z);
                        y1 = _mm_unpackhi_epi8(x1, z);
                        x1 = _mm_unpacklo_epi8(x1, z);
                        y2 = _mm_unpackhi_epi8(x2, z);
                        x2 = _mm_unpacklo_epi8(x2, z);
                        x0 = _mm_add_epi16(x0, _mm_sub_epi16(x2, _mm_add_epi16(x1, x1)));
                        y0 = _mm_add_epi16(y0, _mm_sub_epi16(y2, _mm_add_epi16(y1, y1)));
                        _mm_store_si128((__m128i*)(dst + i), _mm_srai_epi32(_mm_unpacklo_epi16(x0, x0),16));
                        _mm_store_si128((__m128i*)(dst + i + 4), _mm_srai_epi32(_mm_unpackhi_epi16(x0, x0),16));
                        _mm_store_si128((__m128i*)(dst + i + 8), _mm_srai_epi32(_mm_unpacklo_epi16(y0, y0),16));
                        _mm_store_si128((__m128i*)(dst + i + 12), _mm_srai_epi32(_mm_unpackhi_epi16(y0, y0),16));
                    }
                else
                {
                    __m128i k0 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[0]), 0),
                            k1 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[1]), 0),
                            k2 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[2]), 0);
                    k0 = _mm_packs_epi32(k0, k0);
                    k1 = _mm_packs_epi32(k1, k1);
                    k2 = _mm_packs_epi32(k2, k2);

                    for( ; i <= width - 16; i += 16, src += 16 )
                    {
                        __m128i x0, x1, x2, y0, y1, t0, t1, z0, z1, z2, z3;
                        x0 = _mm_loadu_si128((__m128i*)(src - cn));
                        x1 = _mm_loadu_si128((__m128i*)src);
                        x2 = _mm_loadu_si128((__m128i*)(src + cn));
                        y0 = _mm_add_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x2, z));
                        x0 = _mm_add_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x2, z));
                        y1 = _mm_unpackhi_epi8(x1, z);
                        x1 = _mm_unpacklo_epi8(x1, z);

                        t1 = _mm_mulhi_epi16(x1, k0);
                        t0 = _mm_mullo_epi16(x1, k0);
                        x2 = _mm_mulhi_epi16(x0, k1);
                        x0 = _mm_mullo_epi16(x0, k1);
                        z0 = _mm_unpacklo_epi16(t0, t1);
                        z1 = _mm_unpackhi_epi16(t0, t1);
                        z0 = _mm_add_epi32(z0, _mm_unpacklo_epi16(x0, x2));
                        z1 = _mm_add_epi32(z1, _mm_unpackhi_epi16(x0, x2));

                        t1 = _mm_mulhi_epi16(y1, k0);
                        t0 = _mm_mullo_epi16(y1, k0);
                        y1 = _mm_mulhi_epi16(y0, k1);
                        y0 = _mm_mullo_epi16(y0, k1);
                        z2 = _mm_unpacklo_epi16(t0, t1);
                        z3 = _mm_unpackhi_epi16(t0, t1);
                        z2 = _mm_add_epi32(z2, _mm_unpacklo_epi16(y0, y1));
                        z3 = _mm_add_epi32(z3, _mm_unpackhi_epi16(y0, y1));

                        x0 = _mm_loadu_si128((__m128i*)(src - cn*2));
                        x1 = _mm_loadu_si128((__m128i*)(src + cn*2));
                        y1 = _mm_add_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x1, z));
                        y0 = _mm_add_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x1, z));

                        t1 = _mm_mulhi_epi16(y0, k2);
                        t0 = _mm_mullo_epi16(y0, k2);
                        y0 = _mm_mullo_epi16(y1, k2);
                        y1 = _mm_mulhi_epi16(y1, k2);
                        z0 = _mm_add_epi32(z0, _mm_unpacklo_epi16(t0, t1));
                        z1 = _mm_add_epi32(z1, _mm_unpackhi_epi16(t0, t1));
                        z2 = _mm_add_epi32(z2, _mm_unpacklo_epi16(y0, y1));
                        z3 = _mm_add_epi32(z3, _mm_unpackhi_epi16(y0, y1));

                        _mm_store_si128((__m128i*)(dst + i), z0);
                        _mm_store_si128((__m128i*)(dst + i + 4), z1);
                        _mm_store_si128((__m128i*)(dst + i + 8), z2);
                        _mm_store_si128((__m128i*)(dst + i + 12), z3);
                    }
                }
            }
        }
        else
        {
            if( _ksize == 3 )
            {
                if( kx[0] == 0 && kx[1] == 1 )
                    for( ; i <= width - 16; i += 16, src += 16 )
                    {
                        __m128i x0, x1, y0;
                        x0 = _mm_loadu_si128((__m128i*)(src + cn));
                        x1 = _mm_loadu_si128((__m128i*)(src - cn));
                        y0 = _mm_sub_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x1, z));
                        x0 = _mm_sub_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x1, z));
                        _mm_store_si128((__m128i*)(dst + i), _mm_srai_epi32(_mm_unpacklo_epi16(x0, x0),16));
                        _mm_store_si128((__m128i*)(dst + i + 4), _mm_srai_epi32(_mm_unpackhi_epi16(x0, x0),16));
                        _mm_store_si128((__m128i*)(dst + i + 8), _mm_srai_epi32(_mm_unpacklo_epi16(y0, y0),16));
                        _mm_store_si128((__m128i*)(dst + i + 12), _mm_srai_epi32(_mm_unpackhi_epi16(y0, y0),16));
                    }
                else
                {
                    __m128i k1 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[1]), 0);
                    k1 = _mm_packs_epi32(k1, k1);

                    for( ; i <= width - 16; i += 16, src += 16 )
                    {
                        __m128i x0, x1, y0, y1, z0, z1, z2, z3;
                        x0 = _mm_loadu_si128((__m128i*)(src + cn));
                        x1 = _mm_loadu_si128((__m128i*)(src - cn));
                        y0 = _mm_sub_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x1, z));
                        x0 = _mm_sub_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x1, z));

                        x1 = _mm_mulhi_epi16(x0, k1);
                        x0 = _mm_mullo_epi16(x0, k1);
                        z0 = _mm_unpacklo_epi16(x0, x1);
                        z1 = _mm_unpackhi_epi16(x0, x1);

                        y1 = _mm_mulhi_epi16(y0, k1);
                        y0 = _mm_mullo_epi16(y0, k1);
                        z2 = _mm_unpacklo_epi16(y0, y1);
                        z3 = _mm_unpackhi_epi16(y0, y1);
                        _mm_store_si128((__m128i*)(dst + i), z0);
                        _mm_store_si128((__m128i*)(dst + i + 4), z1);
                        _mm_store_si128((__m128i*)(dst + i + 8), z2);
                        _mm_store_si128((__m128i*)(dst + i + 12), z3);
                    }
                }
            }
            else if( _ksize == 5 )
            {
                __m128i k0 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[0]), 0),
                        k1 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[1]), 0),
                        k2 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[2]), 0);
                k0 = _mm_packs_epi32(k0, k0);
                k1 = _mm_packs_epi32(k1, k1);
                k2 = _mm_packs_epi32(k2, k2);

                for( ; i <= width - 16; i += 16, src += 16 )
                {
                    __m128i x0, x1, x2, y0, y1, t0, t1, z0, z1, z2, z3;
                    x0 = _mm_loadu_si128((__m128i*)(src + cn));
                    x2 = _mm_loadu_si128((__m128i*)(src - cn));
                    y0 = _mm_sub_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x2, z));
                    x0 = _mm_sub_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x2, z));

                    x2 = _mm_mulhi_epi16(x0, k1);
                    x0 = _mm_mullo_epi16(x0, k1);
                    z0 = _mm_unpacklo_epi16(x0, x2);
                    z1 = _mm_unpackhi_epi16(x0, x2);
                    y1 = _mm_mulhi_epi16(y0, k1);
                    y0 = _mm_mullo_epi16(y0, k1);
                    z2 = _mm_unpacklo_epi16(y0, y1);
                    z3 = _mm_unpackhi_epi16(y0, y1);

                    x0 = _mm_loadu_si128((__m128i*)(src + cn*2));
                    x1 = _mm_loadu_si128((__m128i*)(src - cn*2));
                    y1 = _mm_sub_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x1, z));
                    y0 = _mm_sub_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x1, z));

                    t1 = _mm_mulhi_epi16(y0, k2);
                    t0 = _mm_mullo_epi16(y0, k2);
                    y0 = _mm_mullo_epi16(y1, k2);
                    y1 = _mm_mulhi_epi16(y1, k2);
                    z0 = _mm_add_epi32(z0, _mm_unpacklo_epi16(t0, t1));
                    z1 = _mm_add_epi32(z1, _mm_unpackhi_epi16(t0, t1));
                    z2 = _mm_add_epi32(z2, _mm_unpacklo_epi16(y0, y1));
                    z3 = _mm_add_epi32(z3, _mm_unpackhi_epi16(y0, y1));

                    _mm_store_si128((__m128i*)(dst + i), z0);
                    _mm_store_si128((__m128i*)(dst + i + 4), z1);
                    _mm_store_si128((__m128i*)(dst + i + 8), z2);
                    _mm_store_si128((__m128i*)(dst + i + 12), z3);
                }
            }
        }

        src -= (_ksize/2)*cn;
        kx -= _ksize/2;
        for( ; i <= width - 4; i += 4, src += 4 )
        {
            __m128i f, s0 = z, x0, x1;

            for( k = j = 0; k < _ksize; k++, j += cn )
            {
                f = _mm_cvtsi32_si128(kx[k]);
                f = _mm_shuffle_epi32(f, 0);
                f = _mm_packs_epi32(f, f);

                x0 = _mm_cvtsi32_si128(*(const int*)(src + j));
                x0 = _mm_unpacklo_epi8(x0, z);
                x1 = _mm_mulhi_epi16(x0, f);
                x0 = _mm_mullo_epi16(x0, f);
                s0 = _mm_add_epi32(s0, _mm_unpacklo_epi16(x0, x1));
            }
            _mm_store_si128((__m128i*)(dst + i), s0);
        }

        return i;
    }

    Mat kernel;
    int symmetryType;
    bool smallValues;
};


struct SymmColumnVec_32s8u
{
    SymmColumnVec_32s8u() { symmetryType=0; }
    SymmColumnVec_32s8u(const Mat& _kernel, int _symmetryType, int _bits, double _delta)
    {
        symmetryType = _symmetryType;
        _kernel.convertTo(kernel, CV_32F, 1./(1 << _bits), 0);
        delta = (float)(_delta/(1 << _bits));
        CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
    }

    int operator()(const uchar** _src, uchar* dst, int width) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE2) )
            return 0;

        int ksize2 = (kernel.rows + kernel.cols - 1)/2;
        const float* ky = kernel.ptr<float>() + ksize2;
        int i = 0, k;
        bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
        const int** src = (const int**)_src;
        const __m128i *S, *S2;
        __m128 d4 = _mm_set1_ps(delta);

        if( symmetrical )
        {
            for( ; i <= width - 16; i += 16 )
            {
                __m128 f = _mm_load_ss(ky);
                f = _mm_shuffle_ps(f, f, 0);
                __m128 s0, s1, s2, s3;
                __m128i x0, x1;
                S = (const __m128i*)(src[0] + i);
                s0 = _mm_cvtepi32_ps(_mm_load_si128(S));
                s1 = _mm_cvtepi32_ps(_mm_load_si128(S+1));
                s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4);
                s1 = _mm_add_ps(_mm_mul_ps(s1, f), d4);
                s2 = _mm_cvtepi32_ps(_mm_load_si128(S+2));
                s3 = _mm_cvtepi32_ps(_mm_load_si128(S+3));
                s2 = _mm_add_ps(_mm_mul_ps(s2, f), d4);
                s3 = _mm_add_ps(_mm_mul_ps(s3, f), d4);

                for( k = 1; k <= ksize2; k++ )
                {
                    S = (const __m128i*)(src[k] + i);
                    S2 = (const __m128i*)(src[-k] + i);
                    f = _mm_load_ss(ky+k);
                    f = _mm_shuffle_ps(f, f, 0);
                    x0 = _mm_add_epi32(_mm_load_si128(S), _mm_load_si128(S2));
                    x1 = _mm_add_epi32(_mm_load_si128(S+1), _mm_load_si128(S2+1));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0), f));
                    s1 = _mm_add_ps(s1, _mm_mul_ps(_mm_cvtepi32_ps(x1), f));
                    x0 = _mm_add_epi32(_mm_load_si128(S+2), _mm_load_si128(S2+2));
                    x1 = _mm_add_epi32(_mm_load_si128(S+3), _mm_load_si128(S2+3));
                    s2 = _mm_add_ps(s2, _mm_mul_ps(_mm_cvtepi32_ps(x0), f));
                    s3 = _mm_add_ps(s3, _mm_mul_ps(_mm_cvtepi32_ps(x1), f));
                }

                x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1));
                x1 = _mm_packs_epi32(_mm_cvtps_epi32(s2), _mm_cvtps_epi32(s3));
                x0 = _mm_packus_epi16(x0, x1);
                _mm_storeu_si128((__m128i*)(dst + i), x0);
            }

            for( ; i <= width - 4; i += 4 )
            {
                __m128 f = _mm_load_ss(ky);
                f = _mm_shuffle_ps(f, f, 0);
                __m128i x0;
                __m128 s0 = _mm_cvtepi32_ps(_mm_load_si128((const __m128i*)(src[0] + i)));
                s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4);

                for( k = 1; k <= ksize2; k++ )
                {
                    S = (const __m128i*)(src[k] + i);
                    S2 = (const __m128i*)(src[-k] + i);
                    f = _mm_load_ss(ky+k);
                    f = _mm_shuffle_ps(f, f, 0);
                    x0 = _mm_add_epi32(_mm_load_si128(S), _mm_load_si128(S2));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0), f));
                }

                x0 = _mm_cvtps_epi32(s0);
                x0 = _mm_packs_epi32(x0, x0);
                x0 = _mm_packus_epi16(x0, x0);
                *(int*)(dst + i) = _mm_cvtsi128_si32(x0);
            }
        }
        else
        {
            for( ; i <= width - 16; i += 16 )
            {
                __m128 f, s0 = d4, s1 = d4, s2 = d4, s3 = d4;
                __m128i x0, x1;

                for( k = 1; k <= ksize2; k++ )
                {
                    S = (const __m128i*)(src[k] + i);
                    S2 = (const __m128i*)(src[-k] + i);
                    f = _mm_load_ss(ky+k);
                    f = _mm_shuffle_ps(f, f, 0);
                    x0 = _mm_sub_epi32(_mm_load_si128(S), _mm_load_si128(S2));
                    x1 = _mm_sub_epi32(_mm_load_si128(S+1), _mm_load_si128(S2+1));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0), f));
                    s1 = _mm_add_ps(s1, _mm_mul_ps(_mm_cvtepi32_ps(x1), f));
                    x0 = _mm_sub_epi32(_mm_load_si128(S+2), _mm_load_si128(S2+2));
                    x1 = _mm_sub_epi32(_mm_load_si128(S+3), _mm_load_si128(S2+3));
                    s2 = _mm_add_ps(s2, _mm_mul_ps(_mm_cvtepi32_ps(x0), f));
                    s3 = _mm_add_ps(s3, _mm_mul_ps(_mm_cvtepi32_ps(x1), f));
                }

                x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1));
                x1 = _mm_packs_epi32(_mm_cvtps_epi32(s2), _mm_cvtps_epi32(s3));
                x0 = _mm_packus_epi16(x0, x1);
                _mm_storeu_si128((__m128i*)(dst + i), x0);
            }

            for( ; i <= width - 4; i += 4 )
            {
                __m128 f, s0 = d4;
                __m128i x0;

                for( k = 1; k <= ksize2; k++ )
                {
                    S = (const __m128i*)(src[k] + i);
                    S2 = (const __m128i*)(src[-k] + i);
                    f = _mm_load_ss(ky+k);
                    f = _mm_shuffle_ps(f, f, 0);
                    x0 = _mm_sub_epi32(_mm_load_si128(S), _mm_load_si128(S2));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0), f));
                }

                x0 = _mm_cvtps_epi32(s0);
                x0 = _mm_packs_epi32(x0, x0);
                x0 = _mm_packus_epi16(x0, x0);
                *(int*)(dst + i) = _mm_cvtsi128_si32(x0);
            }
        }

        return i;
    }

    int symmetryType;
    float delta;
    Mat kernel;
};
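
// SymmColumnVec_32s8u above and SymmColumnSmallVec_32s16s below consume the
// 32-bit fixed-point sums produced by the integer row filters: the constructor
// rescales the kernel (and delta) by 1/(1 << bits) into float, so a single
// float multiply-accumulate both applies the column kernel and removes the
// fixed-point scaling before the result is packed back down to 8u or 16s.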


struct SymmColumnSmallVec_32s16s
{
    SymmColumnSmallVec_32s16s() { symmetryType=0; }
    SymmColumnSmallVec_32s16s(const Mat& _kernel, int _symmetryType, int _bits, double _delta)
    {
        symmetryType = _symmetryType;
        _kernel.convertTo(kernel, CV_32F, 1./(1 << _bits), 0);
        delta = (float)(_delta/(1 << _bits));
        CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
    }

    int operator()(const uchar** _src, uchar* _dst, int width) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE2) )
            return 0;

        int ksize2 = (kernel.rows + kernel.cols - 1)/2;
        const float* ky = kernel.ptr<float>() + ksize2;
        int i = 0;
        bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
        const int** src = (const int**)_src;
        const int *S0 = src[-1], *S1 = src[0], *S2 = src[1];
        short* dst = (short*)_dst;
        __m128 df4 = _mm_set1_ps(delta);
        __m128i d4 = _mm_cvtps_epi32(df4);

        if( symmetrical )
        {
            if( ky[0] == 2 && ky[1] == 1 )
            {
                for( ; i <= width - 8; i += 8 )
                {
                    __m128i s0, s1, s2, s3, s4, s5;
                    s0 = _mm_load_si128((__m128i*)(S0 + i));
                    s1 = _mm_load_si128((__m128i*)(S0 + i + 4));
                    s2 = _mm_load_si128((__m128i*)(S1 + i));
                    s3 = _mm_load_si128((__m128i*)(S1 + i + 4));
                    s4 = _mm_load_si128((__m128i*)(S2 + i));
                    s5 = _mm_load_si128((__m128i*)(S2 + i + 4));
                    s0 = _mm_add_epi32(s0, _mm_add_epi32(s4, _mm_add_epi32(s2, s2)));
                    s1 = _mm_add_epi32(s1, _mm_add_epi32(s5, _mm_add_epi32(s3, s3)));
                    s0 = _mm_add_epi32(s0, d4);
                    s1 = _mm_add_epi32(s1, d4);
                    _mm_storeu_si128((__m128i*)(dst + i), _mm_packs_epi32(s0, s1));
                }
            }
            else if( ky[0] == -2 && ky[1] == 1 )
            {
                for( ; i <= width - 8; i += 8 )
                {
                    __m128i s0, s1, s2, s3, s4, s5;
                    s0 = _mm_load_si128((__m128i*)(S0 + i));
                    s1 = _mm_load_si128((__m128i*)(S0 + i + 4));
                    s2 = _mm_load_si128((__m128i*)(S1 + i));
                    s3 = _mm_load_si128((__m128i*)(S1 + i + 4));
                    s4 = _mm_load_si128((__m128i*)(S2 + i));
                    s5 = _mm_load_si128((__m128i*)(S2 + i + 4));
                    s0 = _mm_add_epi32(s0, _mm_sub_epi32(s4, _mm_add_epi32(s2, s2)));
                    s1 = _mm_add_epi32(s1, _mm_sub_epi32(s5, _mm_add_epi32(s3, s3)));
                    s0 = _mm_add_epi32(s0, d4);
                    s1 = _mm_add_epi32(s1, d4);
                    _mm_storeu_si128((__m128i*)(dst + i), _mm_packs_epi32(s0, s1));
                }
            }
            else
            {
                __m128 k0 = _mm_set1_ps(ky[0]), k1 = _mm_set1_ps(ky[1]);
                for( ; i <= width - 8; i += 8 )
                {
                    __m128 s0, s1;
                    s0 = _mm_cvtepi32_ps(_mm_load_si128((__m128i*)(S1 + i)));
                    s1 = _mm_cvtepi32_ps(_mm_load_si128((__m128i*)(S1 + i + 4)));
                    s0 = _mm_add_ps(_mm_mul_ps(s0, k0), df4);
                    s1 = _mm_add_ps(_mm_mul_ps(s1, k0), df4);
                    __m128i x0, x1;
                    x0 = _mm_add_epi32(_mm_load_si128((__m128i*)(S0 + i)),
                                       _mm_load_si128((__m128i*)(S2 + i)));
                    x1 = _mm_add_epi32(_mm_load_si128((__m128i*)(S0 + i + 4)),
                                       _mm_load_si128((__m128i*)(S2 + i + 4)));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0),k1));
                    s1 = _mm_add_ps(s1, _mm_mul_ps(_mm_cvtepi32_ps(x1),k1));
                    x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1));
                    _mm_storeu_si128((__m128i*)(dst + i), x0);
                }
            }
        }
        else
        {
            if( fabs(ky[1]) == 1 && ky[1] == -ky[-1] )
            {
                if( ky[1] < 0 )
                    std::swap(S0, S2);
                for( ; i <= width - 8; i += 8 )
                {
                    __m128i s0, s1, s2, s3;
                    s0 = _mm_load_si128((__m128i*)(S2 + i));
                    s1 = _mm_load_si128((__m128i*)(S2 + i + 4));
                    s2 = _mm_load_si128((__m128i*)(S0 + i));
                    s3 = _mm_load_si128((__m128i*)(S0 + i + 4));
                    s0 = _mm_add_epi32(_mm_sub_epi32(s0, s2), d4);
                    s1 = _mm_add_epi32(_mm_sub_epi32(s1, s3), d4);
                    _mm_storeu_si128((__m128i*)(dst + i), _mm_packs_epi32(s0, s1));
                }
            }
            else
            {
                __m128 k1 = _mm_set1_ps(ky[1]);
                for( ; i <= width - 8; i += 8 )
                {
                    __m128 s0 = df4, s1 = df4;
                    __m128i x0, x1;
                    x0 = _mm_sub_epi32(_mm_load_si128((__m128i*)(S2 + i)),
                                       _mm_load_si128((__m128i*)(S0 + i)));
                    x1 = _mm_sub_epi32(_mm_load_si128((__m128i*)(S2 + i + 4)),
                                       _mm_load_si128((__m128i*)(S0 + i + 4)));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0),k1));
                    s1 = _mm_add_ps(s1, _mm_mul_ps(_mm_cvtepi32_ps(x1),k1));
                    x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1));
                    _mm_storeu_si128((__m128i*)(dst + i), x0);
                }
            }
        }

        return i;
    }

    int symmetryType;
    float delta;
    Mat kernel;
};
   1204 
   1205 
   1206 /////////////////////////////////////// 16s //////////////////////////////////
   1207 
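         // Horizontal (row) filter, 16s -> 32f: convolves each row of 16-bit signed
         // samples with the float kernel, producing 8 outputs per SSE2 iteration,
         // and returns how many outputs were written so the caller can finish the
         // remaining tail in scalar code.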
   1208 struct RowVec_16s32f
   1209 {
    1210     RowVec_16s32f() { sse2_supported = false; }
   1211     RowVec_16s32f( const Mat& _kernel )
   1212     {
   1213         kernel = _kernel;
   1214         sse2_supported = checkHardwareSupport(CV_CPU_SSE2);
   1215     }
   1216 
   1217     int operator()(const uchar* _src, uchar* _dst, int width, int cn) const
   1218     {
   1219         if( !sse2_supported )
   1220             return 0;
   1221 
   1222         int i = 0, k, _ksize = kernel.rows + kernel.cols - 1;
   1223         float* dst = (float*)_dst;
   1224         const float* _kx = kernel.ptr<float>();
   1225         width *= cn;
   1226 
   1227         for( ; i <= width - 8; i += 8 )
   1228         {
   1229             const short* src = (const short*)_src + i;
   1230             __m128 f, s0 = _mm_setzero_ps(), s1 = s0, x0, x1;
   1231             for( k = 0; k < _ksize; k++, src += cn )
   1232             {
   1233                 f = _mm_load_ss(_kx+k);
   1234                 f = _mm_shuffle_ps(f, f, 0);
   1235 
   1236                 __m128i x0i = _mm_loadu_si128((const __m128i*)src);
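                         // sign-extend the eight 16-bit samples to 32 bit: unpacking a
                         // lane with itself duplicates it, and the arithmetic shift right
                         // by 16 restores the value together with its sign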
   1237                 __m128i x1i = _mm_srai_epi32(_mm_unpackhi_epi16(x0i, x0i), 16);
   1238                 x0i = _mm_srai_epi32(_mm_unpacklo_epi16(x0i, x0i), 16);
   1239                 x0 = _mm_cvtepi32_ps(x0i);
   1240                 x1 = _mm_cvtepi32_ps(x1i);
   1241                 s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
   1242                 s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f));
   1243             }
   1244             _mm_store_ps(dst + i, s0);
   1245             _mm_store_ps(dst + i + 4, s1);
   1246         }
   1247         return i;
   1248     }
   1249 
   1250     Mat kernel;
   1251     bool sse2_supported;
   1252 };
   1253 
   1254 
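         // Vertical (column) filter, 32f -> 16s, for symmetric/asymmetric kernels:
         // src points at the middle source row, so src[k] and src[-k] form the
         // mirrored pair of rows for tap k. Sums are accumulated in floats with
         // delta added up front, then packed to 16-bit with saturation; 16 and
         // then 4 outputs are produced per SSE2 iteration.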
   1255 struct SymmColumnVec_32f16s
   1256 {
    1257     SymmColumnVec_32f16s() { symmetryType = 0; delta = 0; sse2_supported = false; }
   1258     SymmColumnVec_32f16s(const Mat& _kernel, int _symmetryType, int, double _delta)
   1259     {
   1260         symmetryType = _symmetryType;
   1261         kernel = _kernel;
   1262         delta = (float)_delta;
   1263         CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
   1264         sse2_supported = checkHardwareSupport(CV_CPU_SSE2);
   1265     }
   1266 
   1267     int operator()(const uchar** _src, uchar* _dst, int width) const
   1268     {
   1269         if( !sse2_supported )
   1270             return 0;
   1271 
   1272         int ksize2 = (kernel.rows + kernel.cols - 1)/2;
   1273         const float* ky = kernel.ptr<float>() + ksize2;
   1274         int i = 0, k;
   1275         bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
   1276         const float** src = (const float**)_src;
   1277         const float *S, *S2;
   1278         short* dst = (short*)_dst;
   1279         __m128 d4 = _mm_set1_ps(delta);
   1280 
   1281         if( symmetrical )
   1282         {
   1283             for( ; i <= width - 16; i += 16 )
   1284             {
   1285                 __m128 f = _mm_load_ss(ky);
   1286                 f = _mm_shuffle_ps(f, f, 0);
   1287                 __m128 s0, s1, s2, s3;
   1288                 __m128 x0, x1;
   1289                 S = src[0] + i;
   1290                 s0 = _mm_load_ps(S);
   1291                 s1 = _mm_load_ps(S+4);
   1292                 s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4);
   1293                 s1 = _mm_add_ps(_mm_mul_ps(s1, f), d4);
   1294                 s2 = _mm_load_ps(S+8);
   1295                 s3 = _mm_load_ps(S+12);
   1296                 s2 = _mm_add_ps(_mm_mul_ps(s2, f), d4);
   1297                 s3 = _mm_add_ps(_mm_mul_ps(s3, f), d4);
   1298 
   1299                 for( k = 1; k <= ksize2; k++ )
   1300                 {
   1301                     S = src[k] + i;
   1302                     S2 = src[-k] + i;
   1303                     f = _mm_load_ss(ky+k);
   1304                     f = _mm_shuffle_ps(f, f, 0);
   1305                     x0 = _mm_add_ps(_mm_load_ps(S), _mm_load_ps(S2));
   1306                     x1 = _mm_add_ps(_mm_load_ps(S+4), _mm_load_ps(S2+4));
   1307                     s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
   1308                     s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f));
   1309                     x0 = _mm_add_ps(_mm_load_ps(S+8), _mm_load_ps(S2+8));
   1310                     x1 = _mm_add_ps(_mm_load_ps(S+12), _mm_load_ps(S2+12));
   1311                     s2 = _mm_add_ps(s2, _mm_mul_ps(x0, f));
   1312                     s3 = _mm_add_ps(s3, _mm_mul_ps(x1, f));
   1313                 }
   1314 
   1315                 __m128i s0i = _mm_cvtps_epi32(s0);
   1316                 __m128i s1i = _mm_cvtps_epi32(s1);
   1317                 __m128i s2i = _mm_cvtps_epi32(s2);
   1318                 __m128i s3i = _mm_cvtps_epi32(s3);
   1319 
   1320                 _mm_storeu_si128((__m128i*)(dst + i), _mm_packs_epi32(s0i, s1i));
   1321                 _mm_storeu_si128((__m128i*)(dst + i + 8), _mm_packs_epi32(s2i, s3i));
   1322             }
   1323 
   1324             for( ; i <= width - 4; i += 4 )
   1325             {
   1326                 __m128 f = _mm_load_ss(ky);
   1327                 f = _mm_shuffle_ps(f, f, 0);
   1328                 __m128 x0, s0 = _mm_load_ps(src[0] + i);
   1329                 s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4);
   1330 
   1331                 for( k = 1; k <= ksize2; k++ )
   1332                 {
   1333                     f = _mm_load_ss(ky+k);
   1334                     f = _mm_shuffle_ps(f, f, 0);
   1335                     S = src[k] + i;
   1336                     S2 = src[-k] + i;
    1337                     x0 = _mm_add_ps(_mm_load_ps(S), _mm_load_ps(S2));
   1338                     s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
   1339                 }
   1340 
   1341                 __m128i s0i = _mm_cvtps_epi32(s0);
   1342                 _mm_storel_epi64((__m128i*)(dst + i), _mm_packs_epi32(s0i, s0i));
   1343             }
   1344         }
   1345         else
   1346         {
   1347             for( ; i <= width - 16; i += 16 )
   1348             {
   1349                 __m128 f, s0 = d4, s1 = d4, s2 = d4, s3 = d4;
   1350                 __m128 x0, x1;
   1352 
   1353                 for( k = 1; k <= ksize2; k++ )
   1354                 {
   1355                     S = src[k] + i;
   1356                     S2 = src[-k] + i;
   1357                     f = _mm_load_ss(ky+k);
   1358                     f = _mm_shuffle_ps(f, f, 0);
   1359                     x0 = _mm_sub_ps(_mm_load_ps(S), _mm_load_ps(S2));
   1360                     x1 = _mm_sub_ps(_mm_load_ps(S+4), _mm_load_ps(S2+4));
   1361                     s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
   1362                     s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f));
   1363                     x0 = _mm_sub_ps(_mm_load_ps(S+8), _mm_load_ps(S2+8));
   1364                     x1 = _mm_sub_ps(_mm_load_ps(S+12), _mm_load_ps(S2+12));
   1365                     s2 = _mm_add_ps(s2, _mm_mul_ps(x0, f));
   1366                     s3 = _mm_add_ps(s3, _mm_mul_ps(x1, f));
   1367                 }
   1368 
   1369                 __m128i s0i = _mm_cvtps_epi32(s0);
   1370                 __m128i s1i = _mm_cvtps_epi32(s1);
   1371                 __m128i s2i = _mm_cvtps_epi32(s2);
   1372                 __m128i s3i = _mm_cvtps_epi32(s3);
   1373 
   1374                 _mm_storeu_si128((__m128i*)(dst + i), _mm_packs_epi32(s0i, s1i));
   1375                 _mm_storeu_si128((__m128i*)(dst + i + 8), _mm_packs_epi32(s2i, s3i));
   1376             }
   1377 
   1378             for( ; i <= width - 4; i += 4 )
   1379             {
   1380                 __m128 f, x0, s0 = d4;
   1381 
   1382                 for( k = 1; k <= ksize2; k++ )
   1383                 {
   1384                     f = _mm_load_ss(ky+k);
   1385                     f = _mm_shuffle_ps(f, f, 0);
   1386                     x0 = _mm_sub_ps(_mm_load_ps(src[k]+i), _mm_load_ps(src[-k] + i));
   1387                     s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
   1388                 }
   1389 
   1390                 __m128i s0i = _mm_cvtps_epi32(s0);
   1391                 _mm_storel_epi64((__m128i*)(dst + i), _mm_packs_epi32(s0i, s0i));
   1392             }
   1393         }
   1394 
   1395         return i;
   1396     }
   1397 
   1398     int symmetryType;
   1399     float delta;
   1400     Mat kernel;
   1401     bool sse2_supported;
   1402 };
   1403 
   1404 
   1405 /////////////////////////////////////// 32f //////////////////////////////////
   1406 
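         // General 32f row filter: a plain multiply-accumulate over all kernel taps,
         // 8 outputs per SSE iteration. Note that the IPP branch below is disabled at
         // compile time (the "&& 0" in its guard).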
   1407 struct RowVec_32f
   1408 {
   1409     RowVec_32f()
   1410     {
   1411         haveSSE = checkHardwareSupport(CV_CPU_SSE);
   1412     }
   1413 
   1414     RowVec_32f( const Mat& _kernel )
   1415     {
   1416         kernel = _kernel;
   1417         haveSSE = checkHardwareSupport(CV_CPU_SSE);
   1418 #if defined USE_IPP_SEP_FILTERS && 0
   1419         bufsz = -1;
   1420 #endif
   1421     }
   1422 
   1423     int operator()(const uchar* _src, uchar* _dst, int width, int cn) const
   1424     {
   1425 #if defined USE_IPP_SEP_FILTERS && 0
   1426         CV_IPP_CHECK()
   1427         {
   1428             int ret = ippiOperator(_src, _dst, width, cn);
   1429             if (ret > 0)
   1430                 return ret;
   1431         }
   1432 #endif
   1433         int _ksize = kernel.rows + kernel.cols - 1;
   1434         const float* src0 = (const float*)_src;
   1435         float* dst = (float*)_dst;
   1436         const float* _kx = kernel.ptr<float>();
   1437 
   1438         if( !haveSSE )
   1439             return 0;
   1440 
   1441         int i = 0, k;
   1442         width *= cn;
   1443 
   1444         for( ; i <= width - 8; i += 8 )
   1445         {
   1446             const float* src = src0 + i;
   1447             __m128 f, s0 = _mm_setzero_ps(), s1 = s0, x0, x1;
   1448             for( k = 0; k < _ksize; k++, src += cn )
   1449             {
   1450                 f = _mm_load_ss(_kx+k);
   1451                 f = _mm_shuffle_ps(f, f, 0);
   1452 
   1453                 x0 = _mm_loadu_ps(src);
   1454                 x1 = _mm_loadu_ps(src + 4);
   1455                 s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
   1456                 s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f));
   1457             }
   1458             _mm_store_ps(dst + i, s0);
   1459             _mm_store_ps(dst + i + 4, s1);
   1460         }
   1461         return i;
   1462     }
   1463 
   1464     Mat kernel;
   1465     bool haveSSE;
   1466 #if defined USE_IPP_SEP_FILTERS && 0
   1467 private:
   1468     mutable int bufsz;
   1469     int ippiOperator(const uchar* _src, uchar* _dst, int width, int cn) const
   1470     {
   1471         int _ksize = kernel.rows + kernel.cols - 1;
   1472         if ((1 != cn && 3 != cn) || width < _ksize*8)
   1473             return 0;
   1474 
   1475         const float* src = (const float*)_src;
   1476         float* dst = (float*)_dst;
   1477         const float* _kx = (const float*)kernel.data;
   1478 
   1479         IppiSize roisz = { width, 1 };
   1480         if( bufsz < 0 )
   1481         {
   1482             if( (cn == 1 && ippiFilterRowBorderPipelineGetBufferSize_32f_C1R(roisz, _ksize, &bufsz) < 0) ||
   1483                 (cn == 3 && ippiFilterRowBorderPipelineGetBufferSize_32f_C3R(roisz, _ksize, &bufsz) < 0))
   1484                 return 0;
   1485         }
   1486         AutoBuffer<uchar> buf(bufsz + 64);
   1487         uchar* bufptr = alignPtr((uchar*)buf, 32);
   1488         int step = (int)(width*sizeof(dst[0])*cn);
   1489         float borderValue[] = {0.f, 0.f, 0.f};
    1490         // Here is the trick: IPP needs a border type and extrapolates the row itself, but that was done already.
    1491         // So we pass anchor=0 and ignore the right tail of the results, which is incorrect there.
   1492         if( (cn == 1 && ippiFilterRowBorderPipeline_32f_C1R(src, step, &dst, roisz, _kx, _ksize, 0,
   1493                                                             ippBorderRepl, borderValue[0], bufptr) < 0) ||
   1494             (cn == 3 && ippiFilterRowBorderPipeline_32f_C3R(src, step, &dst, roisz, _kx, _ksize, 0,
   1495                                                             ippBorderRepl, borderValue, bufptr) < 0))
   1496         {
   1497             setIppErrorStatus();
   1498             return 0;
   1499         }
   1500         CV_IMPL_ADD(CV_IMPL_IPP);
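                 // report only the leading width - _ksize + 1 valid results;
                 // the scalar code recomputes the tail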
   1501         return width - _ksize + 1;
   1502     }
   1503 #endif
   1504 };
   1505 
   1506 
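         // Row filter specialized for small symmetric/asymmetric float kernels
         // (ksize 3 or 5). kx points at the center tap; the common fixed coefficient
         // patterns get dedicated multiply-free paths below.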
   1507 struct SymmRowSmallVec_32f
   1508 {
    1509     SymmRowSmallVec_32f() { symmetryType = 0; }
   1510     SymmRowSmallVec_32f( const Mat& _kernel, int _symmetryType )
   1511     {
   1512         kernel = _kernel;
   1513         symmetryType = _symmetryType;
   1514     }
   1515 
   1516     int operator()(const uchar* _src, uchar* _dst, int width, int cn) const
   1517     {
   1518         if( !checkHardwareSupport(CV_CPU_SSE) )
   1519             return 0;
   1520 
   1521         int i = 0, _ksize = kernel.rows + kernel.cols - 1;
   1522         float* dst = (float*)_dst;
   1523         const float* src = (const float*)_src + (_ksize/2)*cn;
   1524         bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
   1525         const float* kx = kernel.ptr<float>() + _ksize/2;
   1526         width *= cn;
   1527 
   1528         if( symmetrical )
   1529         {
   1530             if( _ksize == 1 )
   1531                 return 0;
   1532             if( _ksize == 3 )
   1533             {
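                         // fixed kernels [1 2 1] and [1 -2 1] need no multiplies at all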
   1534                 if( kx[0] == 2 && kx[1] == 1 )
   1535                     for( ; i <= width - 8; i += 8, src += 8 )
   1536                     {
   1537                         __m128 x0, x1, x2, y0, y1, y2;
   1538                         x0 = _mm_loadu_ps(src - cn);
   1539                         x1 = _mm_loadu_ps(src);
   1540                         x2 = _mm_loadu_ps(src + cn);
   1541                         y0 = _mm_loadu_ps(src - cn + 4);
   1542                         y1 = _mm_loadu_ps(src + 4);
   1543                         y2 = _mm_loadu_ps(src + cn + 4);
   1544                         x0 = _mm_add_ps(x0, _mm_add_ps(_mm_add_ps(x1, x1), x2));
   1545                         y0 = _mm_add_ps(y0, _mm_add_ps(_mm_add_ps(y1, y1), y2));
   1546                         _mm_store_ps(dst + i, x0);
   1547                         _mm_store_ps(dst + i + 4, y0);
   1548                     }
   1549                 else if( kx[0] == -2 && kx[1] == 1 )
   1550                     for( ; i <= width - 8; i += 8, src += 8 )
   1551                     {
   1552                         __m128 x0, x1, x2, y0, y1, y2;
   1553                         x0 = _mm_loadu_ps(src - cn);
   1554                         x1 = _mm_loadu_ps(src);
   1555                         x2 = _mm_loadu_ps(src + cn);
   1556                         y0 = _mm_loadu_ps(src - cn + 4);
   1557                         y1 = _mm_loadu_ps(src + 4);
   1558                         y2 = _mm_loadu_ps(src + cn + 4);
   1559                         x0 = _mm_add_ps(x0, _mm_sub_ps(x2, _mm_add_ps(x1, x1)));
   1560                         y0 = _mm_add_ps(y0, _mm_sub_ps(y2, _mm_add_ps(y1, y1)));
   1561                         _mm_store_ps(dst + i, x0);
   1562                         _mm_store_ps(dst + i + 4, y0);
   1563                     }
   1564                 else
   1565                 {
   1566                     __m128 k0 = _mm_set1_ps(kx[0]), k1 = _mm_set1_ps(kx[1]);
   1567                     for( ; i <= width - 8; i += 8, src += 8 )
   1568                     {
   1569                         __m128 x0, x1, x2, y0, y1, y2;
   1570                         x0 = _mm_loadu_ps(src - cn);
   1571                         x1 = _mm_loadu_ps(src);
   1572                         x2 = _mm_loadu_ps(src + cn);
   1573                         y0 = _mm_loadu_ps(src - cn + 4);
   1574                         y1 = _mm_loadu_ps(src + 4);
   1575                         y2 = _mm_loadu_ps(src + cn + 4);
   1576 
   1577                         x0 = _mm_mul_ps(_mm_add_ps(x0, x2), k1);
   1578                         y0 = _mm_mul_ps(_mm_add_ps(y0, y2), k1);
   1579                         x0 = _mm_add_ps(x0, _mm_mul_ps(x1, k0));
   1580                         y0 = _mm_add_ps(y0, _mm_mul_ps(y1, k0));
   1581                         _mm_store_ps(dst + i, x0);
   1582                         _mm_store_ps(dst + i + 4, y0);
   1583                     }
   1584                 }
   1585             }
   1586             else if( _ksize == 5 )
   1587             {
   1588                 if( kx[0] == -2 && kx[1] == 0 && kx[2] == 1 )
   1589                     for( ; i <= width - 8; i += 8, src += 8 )
   1590                     {
   1591                         __m128 x0, x1, x2, y0, y1, y2;
   1592                         x0 = _mm_loadu_ps(src - cn*2);
   1593                         x1 = _mm_loadu_ps(src);
   1594                         x2 = _mm_loadu_ps(src + cn*2);
   1595                         y0 = _mm_loadu_ps(src - cn*2 + 4);
   1596                         y1 = _mm_loadu_ps(src + 4);
   1597                         y2 = _mm_loadu_ps(src + cn*2 + 4);
   1598                         x0 = _mm_add_ps(x0, _mm_sub_ps(x2, _mm_add_ps(x1, x1)));
   1599                         y0 = _mm_add_ps(y0, _mm_sub_ps(y2, _mm_add_ps(y1, y1)));
   1600                         _mm_store_ps(dst + i, x0);
   1601                         _mm_store_ps(dst + i + 4, y0);
   1602                     }
   1603                 else
   1604                 {
   1605                     __m128 k0 = _mm_set1_ps(kx[0]), k1 = _mm_set1_ps(kx[1]), k2 = _mm_set1_ps(kx[2]);
   1606                     for( ; i <= width - 8; i += 8, src += 8 )
   1607                     {
   1608                         __m128 x0, x1, x2, y0, y1, y2;
   1609                         x0 = _mm_loadu_ps(src - cn);
   1610                         x1 = _mm_loadu_ps(src);
   1611                         x2 = _mm_loadu_ps(src + cn);
   1612                         y0 = _mm_loadu_ps(src - cn + 4);
   1613                         y1 = _mm_loadu_ps(src + 4);
   1614                         y2 = _mm_loadu_ps(src + cn + 4);
   1615 
   1616                         x0 = _mm_mul_ps(_mm_add_ps(x0, x2), k1);
   1617                         y0 = _mm_mul_ps(_mm_add_ps(y0, y2), k1);
   1618                         x0 = _mm_add_ps(x0, _mm_mul_ps(x1, k0));
   1619                         y0 = _mm_add_ps(y0, _mm_mul_ps(y1, k0));
   1620 
   1621                         x2 = _mm_add_ps(_mm_loadu_ps(src + cn*2), _mm_loadu_ps(src - cn*2));
   1622                         y2 = _mm_add_ps(_mm_loadu_ps(src + cn*2 + 4), _mm_loadu_ps(src - cn*2 + 4));
   1623                         x0 = _mm_add_ps(x0, _mm_mul_ps(x2, k2));
   1624                         y0 = _mm_add_ps(y0, _mm_mul_ps(y2, k2));
   1625 
   1626                         _mm_store_ps(dst + i, x0);
   1627                         _mm_store_ps(dst + i + 4, y0);
   1628                     }
   1629                 }
   1630             }
   1631         }
   1632         else
   1633         {
   1634             if( _ksize == 3 )
   1635             {
   1636                 if( kx[0] == 0 && kx[1] == 1 )
   1637                     for( ; i <= width - 8; i += 8, src += 8 )
   1638                     {
   1639                         __m128 x0, x2, y0, y2;
   1640                         x0 = _mm_loadu_ps(src + cn);
   1641                         x2 = _mm_loadu_ps(src - cn);
   1642                         y0 = _mm_loadu_ps(src + cn + 4);
   1643                         y2 = _mm_loadu_ps(src - cn + 4);
   1644                         x0 = _mm_sub_ps(x0, x2);
   1645                         y0 = _mm_sub_ps(y0, y2);
   1646                         _mm_store_ps(dst + i, x0);
   1647                         _mm_store_ps(dst + i + 4, y0);
   1648                     }
   1649                 else
   1650                 {
   1651                     __m128 k1 = _mm_set1_ps(kx[1]);
   1652                     for( ; i <= width - 8; i += 8, src += 8 )
   1653                     {
   1654                         __m128 x0, x2, y0, y2;
   1655                         x0 = _mm_loadu_ps(src + cn);
   1656                         x2 = _mm_loadu_ps(src - cn);
   1657                         y0 = _mm_loadu_ps(src + cn + 4);
   1658                         y2 = _mm_loadu_ps(src - cn + 4);
   1659 
   1660                         x0 = _mm_mul_ps(_mm_sub_ps(x0, x2), k1);
   1661                         y0 = _mm_mul_ps(_mm_sub_ps(y0, y2), k1);
   1662                         _mm_store_ps(dst + i, x0);
   1663                         _mm_store_ps(dst + i + 4, y0);
   1664                     }
   1665                 }
   1666             }
   1667             else if( _ksize == 5 )
   1668             {
   1669                 __m128 k1 = _mm_set1_ps(kx[1]), k2 = _mm_set1_ps(kx[2]);
   1670                 for( ; i <= width - 8; i += 8, src += 8 )
   1671                 {
   1672                     __m128 x0, x2, y0, y2;
   1673                     x0 = _mm_loadu_ps(src + cn);
   1674                     x2 = _mm_loadu_ps(src - cn);
   1675                     y0 = _mm_loadu_ps(src + cn + 4);
   1676                     y2 = _mm_loadu_ps(src - cn + 4);
   1677 
   1678                     x0 = _mm_mul_ps(_mm_sub_ps(x0, x2), k1);
   1679                     y0 = _mm_mul_ps(_mm_sub_ps(y0, y2), k1);
   1680 
   1681                     x2 = _mm_sub_ps(_mm_loadu_ps(src + cn*2), _mm_loadu_ps(src - cn*2));
   1682                     y2 = _mm_sub_ps(_mm_loadu_ps(src + cn*2 + 4), _mm_loadu_ps(src - cn*2 + 4));
   1683                     x0 = _mm_add_ps(x0, _mm_mul_ps(x2, k2));
   1684                     y0 = _mm_add_ps(y0, _mm_mul_ps(y2, k2));
   1685 
   1686                     _mm_store_ps(dst + i, x0);
   1687                     _mm_store_ps(dst + i + 4, y0);
   1688                 }
   1689             }
   1690         }
   1691 
   1692         return i;
   1693     }
   1694 
   1695     Mat kernel;
   1696     int symmetryType;
   1697 };
   1698 
   1699 
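         // Column filter, 32f -> 32f: same structure as SymmColumnVec_32f16s above,
         // except the accumulated sums are stored directly as floats instead of
         // being packed down to 16-bit.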
   1700 struct SymmColumnVec_32f
   1701 {
    1702     SymmColumnVec_32f() { symmetryType = 0; delta = 0; }
   1703     SymmColumnVec_32f(const Mat& _kernel, int _symmetryType, int, double _delta)
   1704     {
   1705         symmetryType = _symmetryType;
   1706         kernel = _kernel;
   1707         delta = (float)_delta;
   1708         CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
   1709     }
   1710 
   1711     int operator()(const uchar** _src, uchar* _dst, int width) const
   1712     {
   1713         if( !checkHardwareSupport(CV_CPU_SSE) )
   1714             return 0;
   1715 
   1716         int ksize2 = (kernel.rows + kernel.cols - 1)/2;
   1717         const float* ky = kernel.ptr<float>() + ksize2;
   1718         int i = 0, k;
   1719         bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
   1720         const float** src = (const float**)_src;
   1721         const float *S, *S2;
   1722         float* dst = (float*)_dst;
   1723         __m128 d4 = _mm_set1_ps(delta);
   1724 
   1725         if( symmetrical )
   1726         {
   1727             for( ; i <= width - 16; i += 16 )
   1728             {
   1729                 __m128 f = _mm_load_ss(ky);
   1730                 f = _mm_shuffle_ps(f, f, 0);
   1731                 __m128 s0, s1, s2, s3;
   1732                 __m128 x0, x1;
   1733                 S = src[0] + i;
   1734                 s0 = _mm_load_ps(S);
   1735                 s1 = _mm_load_ps(S+4);
   1736                 s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4);
   1737                 s1 = _mm_add_ps(_mm_mul_ps(s1, f), d4);
   1738                 s2 = _mm_load_ps(S+8);
   1739                 s3 = _mm_load_ps(S+12);
   1740                 s2 = _mm_add_ps(_mm_mul_ps(s2, f), d4);
   1741                 s3 = _mm_add_ps(_mm_mul_ps(s3, f), d4);
   1742 
   1743                 for( k = 1; k <= ksize2; k++ )
   1744                 {
   1745                     S = src[k] + i;
   1746                     S2 = src[-k] + i;
   1747                     f = _mm_load_ss(ky+k);
   1748                     f = _mm_shuffle_ps(f, f, 0);
   1749                     x0 = _mm_add_ps(_mm_load_ps(S), _mm_load_ps(S2));
   1750                     x1 = _mm_add_ps(_mm_load_ps(S+4), _mm_load_ps(S2+4));
   1751                     s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
   1752                     s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f));
   1753                     x0 = _mm_add_ps(_mm_load_ps(S+8), _mm_load_ps(S2+8));
   1754                     x1 = _mm_add_ps(_mm_load_ps(S+12), _mm_load_ps(S2+12));
   1755                     s2 = _mm_add_ps(s2, _mm_mul_ps(x0, f));
   1756                     s3 = _mm_add_ps(s3, _mm_mul_ps(x1, f));
   1757                 }
   1758 
   1759                 _mm_storeu_ps(dst + i, s0);
   1760                 _mm_storeu_ps(dst + i + 4, s1);
   1761                 _mm_storeu_ps(dst + i + 8, s2);
   1762                 _mm_storeu_ps(dst + i + 12, s3);
   1763             }
   1764 
   1765             for( ; i <= width - 4; i += 4 )
   1766             {
   1767                 __m128 f = _mm_load_ss(ky);
   1768                 f = _mm_shuffle_ps(f, f, 0);
   1769                 __m128 x0, s0 = _mm_load_ps(src[0] + i);
   1770                 s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4);
   1771 
   1772                 for( k = 1; k <= ksize2; k++ )
   1773                 {
   1774                     f = _mm_load_ss(ky+k);
   1775                     f = _mm_shuffle_ps(f, f, 0);
   1776                     S = src[k] + i;
   1777                     S2 = src[-k] + i;
    1778                     x0 = _mm_add_ps(_mm_load_ps(S), _mm_load_ps(S2));
   1779                     s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
   1780                 }
   1781 
   1782                 _mm_storeu_ps(dst + i, s0);
   1783             }
   1784         }
   1785         else
   1786         {
   1787             for( ; i <= width - 16; i += 16 )
   1788             {
   1789                 __m128 f, s0 = d4, s1 = d4, s2 = d4, s3 = d4;
   1790                 __m128 x0, x1;
   1792 
   1793                 for( k = 1; k <= ksize2; k++ )
   1794                 {
   1795                     S = src[k] + i;
   1796                     S2 = src[-k] + i;
   1797                     f = _mm_load_ss(ky+k);
   1798                     f = _mm_shuffle_ps(f, f, 0);
   1799                     x0 = _mm_sub_ps(_mm_load_ps(S), _mm_load_ps(S2));
   1800                     x1 = _mm_sub_ps(_mm_load_ps(S+4), _mm_load_ps(S2+4));
   1801                     s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
   1802                     s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f));
   1803                     x0 = _mm_sub_ps(_mm_load_ps(S+8), _mm_load_ps(S2+8));
   1804                     x1 = _mm_sub_ps(_mm_load_ps(S+12), _mm_load_ps(S2+12));
   1805                     s2 = _mm_add_ps(s2, _mm_mul_ps(x0, f));
   1806                     s3 = _mm_add_ps(s3, _mm_mul_ps(x1, f));
   1807                 }
   1808 
   1809                 _mm_storeu_ps(dst + i, s0);
   1810                 _mm_storeu_ps(dst + i + 4, s1);
   1811                 _mm_storeu_ps(dst + i + 8, s2);
   1812                 _mm_storeu_ps(dst + i + 12, s3);
   1813             }
   1814 
   1815             for( ; i <= width - 4; i += 4 )
   1816             {
   1817                 __m128 f, x0, s0 = d4;
   1818 
   1819                 for( k = 1; k <= ksize2; k++ )
   1820                 {
   1821                     f = _mm_load_ss(ky+k);
   1822                     f = _mm_shuffle_ps(f, f, 0);
   1823                     x0 = _mm_sub_ps(_mm_load_ps(src[k]+i), _mm_load_ps(src[-k] + i));
   1824                     s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
   1825                 }
   1826 
   1827                 _mm_storeu_ps(dst + i, s0);
   1828             }
   1829         }
   1830 
   1831         return i;
   1832     }
   1833 
   1834     int symmetryType;
   1835     float delta;
   1836     Mat kernel;
   1837 };
   1838 
   1839 
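         // Column filter specialized for 3-tap kernels over 32f data: S0, S1 and S2
         // are the rows above, at and below the destination row, and the common
         // fixed coefficient patterns again avoid multiplies entirely.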
   1840 struct SymmColumnSmallVec_32f
   1841 {
    1842     SymmColumnSmallVec_32f() { symmetryType = 0; delta = 0; }
   1843     SymmColumnSmallVec_32f(const Mat& _kernel, int _symmetryType, int, double _delta)
   1844     {
   1845         symmetryType = _symmetryType;
   1846         kernel = _kernel;
   1847         delta = (float)_delta;
   1848         CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
   1849     }
   1850 
   1851     int operator()(const uchar** _src, uchar* _dst, int width) const
   1852     {
   1853         if( !checkHardwareSupport(CV_CPU_SSE) )
   1854             return 0;
   1855 
   1856         int ksize2 = (kernel.rows + kernel.cols - 1)/2;
   1857         const float* ky = kernel.ptr<float>() + ksize2;
   1858         int i = 0;
   1859         bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
   1860         const float** src = (const float**)_src;
   1861         const float *S0 = src[-1], *S1 = src[0], *S2 = src[1];
   1862         float* dst = (float*)_dst;
   1863         __m128 d4 = _mm_set1_ps(delta);
   1864 
   1865         if( symmetrical )
   1866         {
   1867             if( ky[0] == 2 && ky[1] == 1 )
   1868             {
   1869                 for( ; i <= width - 8; i += 8 )
   1870                 {
   1871                     __m128 s0, s1, s2, s3, s4, s5;
   1872                     s0 = _mm_load_ps(S0 + i);
   1873                     s1 = _mm_load_ps(S0 + i + 4);
   1874                     s2 = _mm_load_ps(S1 + i);
   1875                     s3 = _mm_load_ps(S1 + i + 4);
   1876                     s4 = _mm_load_ps(S2 + i);
   1877                     s5 = _mm_load_ps(S2 + i + 4);
   1878                     s0 = _mm_add_ps(s0, _mm_add_ps(s4, _mm_add_ps(s2, s2)));
   1879                     s1 = _mm_add_ps(s1, _mm_add_ps(s5, _mm_add_ps(s3, s3)));
   1880                     s0 = _mm_add_ps(s0, d4);
   1881                     s1 = _mm_add_ps(s1, d4);
   1882                     _mm_storeu_ps(dst + i, s0);
   1883                     _mm_storeu_ps(dst + i + 4, s1);
   1884                 }
   1885             }
   1886             else if( ky[0] == -2 && ky[1] == 1 )
   1887             {
   1888                 for( ; i <= width - 8; i += 8 )
   1889                 {
   1890                     __m128 s0, s1, s2, s3, s4, s5;
   1891                     s0 = _mm_load_ps(S0 + i);
   1892                     s1 = _mm_load_ps(S0 + i + 4);
   1893                     s2 = _mm_load_ps(S1 + i);
   1894                     s3 = _mm_load_ps(S1 + i + 4);
   1895                     s4 = _mm_load_ps(S2 + i);
   1896                     s5 = _mm_load_ps(S2 + i + 4);
   1897                     s0 = _mm_add_ps(s0, _mm_sub_ps(s4, _mm_add_ps(s2, s2)));
   1898                     s1 = _mm_add_ps(s1, _mm_sub_ps(s5, _mm_add_ps(s3, s3)));
   1899                     s0 = _mm_add_ps(s0, d4);
   1900                     s1 = _mm_add_ps(s1, d4);
   1901                     _mm_storeu_ps(dst + i, s0);
   1902                     _mm_storeu_ps(dst + i + 4, s1);
   1903                 }
   1904             }
   1905             else
   1906             {
   1907                 __m128 k0 = _mm_set1_ps(ky[0]), k1 = _mm_set1_ps(ky[1]);
   1908                 for( ; i <= width - 8; i += 8 )
   1909                 {
   1910                     __m128 s0, s1, x0, x1;
   1911                     s0 = _mm_load_ps(S1 + i);
   1912                     s1 = _mm_load_ps(S1 + i + 4);
   1913                     s0 = _mm_add_ps(_mm_mul_ps(s0, k0), d4);
   1914                     s1 = _mm_add_ps(_mm_mul_ps(s1, k0), d4);
   1915                     x0 = _mm_add_ps(_mm_load_ps(S0 + i), _mm_load_ps(S2 + i));
   1916                     x1 = _mm_add_ps(_mm_load_ps(S0 + i + 4), _mm_load_ps(S2 + i + 4));
   1917                     s0 = _mm_add_ps(s0, _mm_mul_ps(x0,k1));
   1918                     s1 = _mm_add_ps(s1, _mm_mul_ps(x1,k1));
   1919                     _mm_storeu_ps(dst + i, s0);
   1920                     _mm_storeu_ps(dst + i + 4, s1);
   1921                 }
   1922             }
   1923         }
   1924         else
   1925         {
   1926             if( fabs(ky[1]) == 1 && ky[1] == -ky[-1] )
   1927             {
   1928                 if( ky[1] < 0 )
   1929                     std::swap(S0, S2);
   1930                 for( ; i <= width - 8; i += 8 )
   1931                 {
   1932                     __m128 s0, s1, s2, s3;
   1933                     s0 = _mm_load_ps(S2 + i);
   1934                     s1 = _mm_load_ps(S2 + i + 4);
   1935                     s2 = _mm_load_ps(S0 + i);
   1936                     s3 = _mm_load_ps(S0 + i + 4);
   1937                     s0 = _mm_add_ps(_mm_sub_ps(s0, s2), d4);
   1938                     s1 = _mm_add_ps(_mm_sub_ps(s1, s3), d4);
   1939                     _mm_storeu_ps(dst + i, s0);
   1940                     _mm_storeu_ps(dst + i + 4, s1);
   1941                 }
   1942             }
   1943             else
   1944             {
   1945                 __m128 k1 = _mm_set1_ps(ky[1]);
   1946                 for( ; i <= width - 8; i += 8 )
   1947                 {
   1948                     __m128 s0 = d4, s1 = d4, x0, x1;
   1949                     x0 = _mm_sub_ps(_mm_load_ps(S2 + i), _mm_load_ps(S0 + i));
   1950                     x1 = _mm_sub_ps(_mm_load_ps(S2 + i + 4), _mm_load_ps(S0 + i + 4));
   1951                     s0 = _mm_add_ps(s0, _mm_mul_ps(x0,k1));
   1952                     s1 = _mm_add_ps(s1, _mm_mul_ps(x1,k1));
   1953                     _mm_storeu_ps(dst + i, s0);
   1954                     _mm_storeu_ps(dst + i + 4, s1);
   1955                 }
   1956             }
   1957         }
   1958 
   1959         return i;
   1960     }
   1961 
   1962     int symmetryType;
   1963     float delta;
   1964     Mat kernel;
   1965 };
   1966 
   1967 
   1968 /////////////////////////////// non-separable filters ///////////////////////////////
   1969 
   1970 ///////////////////////////////// 8u<->8u, 8u<->16s /////////////////////////////////
   1971 
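         // 2D non-separable filter, 8u -> 8u. The fixed-point kernel is converted
         // back to floats (scaled by 2^-bits) and preprocess2DKernel() gathers the
         // nonzero taps, so the inner loop visits only the _nz coefficients; src[k]
         // points at the input pixels for the k-th nonzero tap.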
   1972 struct FilterVec_8u
   1973 {
    1974     FilterVec_8u() { _nz = 0; delta = 0; }
   1975     FilterVec_8u(const Mat& _kernel, int _bits, double _delta)
   1976     {
   1977         Mat kernel;
   1978         _kernel.convertTo(kernel, CV_32F, 1./(1 << _bits), 0);
   1979         delta = (float)(_delta/(1 << _bits));
   1980         std::vector<Point> coords;
   1981         preprocess2DKernel(kernel, coords, coeffs);
   1982         _nz = (int)coords.size();
   1983     }
   1984 
   1985     int operator()(const uchar** src, uchar* dst, int width) const
   1986     {
   1987         if( !checkHardwareSupport(CV_CPU_SSE2) )
   1988             return 0;
   1989 
   1990         const float* kf = (const float*)&coeffs[0];
   1991         int i = 0, k, nz = _nz;
   1992         __m128 d4 = _mm_set1_ps(delta);
   1993 
   1994         for( ; i <= width - 16; i += 16 )
   1995         {
   1996             __m128 s0 = d4, s1 = d4, s2 = d4, s3 = d4;
   1997             __m128i x0, x1, z = _mm_setzero_si128();
   1998 
   1999             for( k = 0; k < nz; k++ )
   2000             {
   2001                 __m128 f = _mm_load_ss(kf+k), t0, t1;
   2002                 f = _mm_shuffle_ps(f, f, 0);
   2003 
   2004                 x0 = _mm_loadu_si128((const __m128i*)(src[k] + i));
   2005                 x1 = _mm_unpackhi_epi8(x0, z);
   2006                 x0 = _mm_unpacklo_epi8(x0, z);
   2007 
   2008                 t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x0, z));
   2009                 t1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(x0, z));
   2010                 s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
   2011                 s1 = _mm_add_ps(s1, _mm_mul_ps(t1, f));
   2012 
   2013                 t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x1, z));
   2014                 t1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(x1, z));
   2015                 s2 = _mm_add_ps(s2, _mm_mul_ps(t0, f));
   2016                 s3 = _mm_add_ps(s3, _mm_mul_ps(t1, f));
   2017             }
   2018 
   2019             x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1));
   2020             x1 = _mm_packs_epi32(_mm_cvtps_epi32(s2), _mm_cvtps_epi32(s3));
   2021             x0 = _mm_packus_epi16(x0, x1);
   2022             _mm_storeu_si128((__m128i*)(dst + i), x0);
   2023         }
   2024 
   2025         for( ; i <= width - 4; i += 4 )
   2026         {
   2027             __m128 s0 = d4;
   2028             __m128i x0, z = _mm_setzero_si128();
   2029 
   2030             for( k = 0; k < nz; k++ )
   2031             {
   2032                 __m128 f = _mm_load_ss(kf+k), t0;
   2033                 f = _mm_shuffle_ps(f, f, 0);
   2034 
   2035                 x0 = _mm_cvtsi32_si128(*(const int*)(src[k] + i));
   2036                 x0 = _mm_unpacklo_epi8(x0, z);
   2037                 t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x0, z));
   2038                 s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
   2039             }
   2040 
   2041             x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), z);
   2042             x0 = _mm_packus_epi16(x0, x0);
   2043             *(int*)(dst + i) = _mm_cvtsi128_si32(x0);
   2044         }
   2045 
   2046         return i;
   2047     }
   2048 
   2049     int _nz;
   2050     std::vector<uchar> coeffs;
   2051     float delta;
   2052 };
   2053 
   2054 
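         // Same scheme as FilterVec_8u, but the saturated results are stored as
         // signed 16-bit values instead of being packed down to unsigned 8-bit.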
   2055 struct FilterVec_8u16s
   2056 {
    2057     FilterVec_8u16s() { _nz = 0; delta = 0; }
   2058     FilterVec_8u16s(const Mat& _kernel, int _bits, double _delta)
   2059     {
   2060         Mat kernel;
   2061         _kernel.convertTo(kernel, CV_32F, 1./(1 << _bits), 0);
   2062         delta = (float)(_delta/(1 << _bits));
   2063         std::vector<Point> coords;
   2064         preprocess2DKernel(kernel, coords, coeffs);
   2065         _nz = (int)coords.size();
   2066     }
   2067 
   2068     int operator()(const uchar** src, uchar* _dst, int width) const
   2069     {
   2070         if( !checkHardwareSupport(CV_CPU_SSE2) )
   2071             return 0;
   2072 
   2073         const float* kf = (const float*)&coeffs[0];
   2074         short* dst = (short*)_dst;
   2075         int i = 0, k, nz = _nz;
   2076         __m128 d4 = _mm_set1_ps(delta);
   2077 
   2078         for( ; i <= width - 16; i += 16 )
   2079         {
   2080             __m128 s0 = d4, s1 = d4, s2 = d4, s3 = d4;
   2081             __m128i x0, x1, z = _mm_setzero_si128();
   2082 
   2083             for( k = 0; k < nz; k++ )
   2084             {
   2085                 __m128 f = _mm_load_ss(kf+k), t0, t1;
   2086                 f = _mm_shuffle_ps(f, f, 0);
   2087 
   2088                 x0 = _mm_loadu_si128((const __m128i*)(src[k] + i));
   2089                 x1 = _mm_unpackhi_epi8(x0, z);
   2090                 x0 = _mm_unpacklo_epi8(x0, z);
   2091 
   2092                 t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x0, z));
   2093                 t1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(x0, z));
   2094                 s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
   2095                 s1 = _mm_add_ps(s1, _mm_mul_ps(t1, f));
   2096 
   2097                 t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x1, z));
   2098                 t1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(x1, z));
   2099                 s2 = _mm_add_ps(s2, _mm_mul_ps(t0, f));
   2100                 s3 = _mm_add_ps(s3, _mm_mul_ps(t1, f));
   2101             }
   2102 
   2103             x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1));
   2104             x1 = _mm_packs_epi32(_mm_cvtps_epi32(s2), _mm_cvtps_epi32(s3));
   2105             _mm_storeu_si128((__m128i*)(dst + i), x0);
   2106             _mm_storeu_si128((__m128i*)(dst + i + 8), x1);
   2107         }
   2108 
   2109         for( ; i <= width - 4; i += 4 )
   2110         {
   2111             __m128 s0 = d4;
   2112             __m128i x0, z = _mm_setzero_si128();
   2113 
   2114             for( k = 0; k < nz; k++ )
   2115             {
   2116                 __m128 f = _mm_load_ss(kf+k), t0;
   2117                 f = _mm_shuffle_ps(f, f, 0);
   2118 
   2119                 x0 = _mm_cvtsi32_si128(*(const int*)(src[k] + i));
   2120                 x0 = _mm_unpacklo_epi8(x0, z);
   2121                 t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x0, z));
   2122                 s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
   2123             }
   2124 
   2125             x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), z);
   2126             _mm_storel_epi64((__m128i*)(dst + i), x0);
   2127         }
   2128 
   2129         return i;
   2130     }
   2131 
   2132     int _nz;
   2133     std::vector<uchar> coeffs;
   2134     float delta;
   2135 };
   2136 
   2137 
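         // 2D non-separable filter over 32f data: a plain multiply-accumulate of
         // the nonzero taps using unaligned loads and stores.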
   2138 struct FilterVec_32f
   2139 {
    2140     FilterVec_32f() { _nz = 0; delta = 0; }
   2141     FilterVec_32f(const Mat& _kernel, int, double _delta)
   2142     {
   2143         delta = (float)_delta;
   2144         std::vector<Point> coords;
   2145         preprocess2DKernel(_kernel, coords, coeffs);
   2146         _nz = (int)coords.size();
   2147     }
   2148 
   2149     int operator()(const uchar** _src, uchar* _dst, int width) const
   2150     {
   2151         if( !checkHardwareSupport(CV_CPU_SSE) )
   2152             return 0;
   2153 
   2154         const float* kf = (const float*)&coeffs[0];
   2155         const float** src = (const float**)_src;
   2156         float* dst = (float*)_dst;
   2157         int i = 0, k, nz = _nz;
   2158         __m128 d4 = _mm_set1_ps(delta);
   2159 
   2160         for( ; i <= width - 16; i += 16 )
   2161         {
   2162             __m128 s0 = d4, s1 = d4, s2 = d4, s3 = d4;
   2163 
   2164             for( k = 0; k < nz; k++ )
   2165             {
   2166                 __m128 f = _mm_load_ss(kf+k), t0, t1;
   2167                 f = _mm_shuffle_ps(f, f, 0);
   2168                 const float* S = src[k] + i;
   2169 
   2170                 t0 = _mm_loadu_ps(S);
   2171                 t1 = _mm_loadu_ps(S + 4);
   2172                 s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
   2173                 s1 = _mm_add_ps(s1, _mm_mul_ps(t1, f));
   2174 
   2175                 t0 = _mm_loadu_ps(S + 8);
   2176                 t1 = _mm_loadu_ps(S + 12);
   2177                 s2 = _mm_add_ps(s2, _mm_mul_ps(t0, f));
   2178                 s3 = _mm_add_ps(s3, _mm_mul_ps(t1, f));
   2179             }
   2180 
   2181             _mm_storeu_ps(dst + i, s0);
   2182             _mm_storeu_ps(dst + i + 4, s1);
   2183             _mm_storeu_ps(dst + i + 8, s2);
   2184             _mm_storeu_ps(dst + i + 12, s3);
   2185         }
   2186 
   2187         for( ; i <= width - 4; i += 4 )
   2188         {
   2189             __m128 s0 = d4;
   2190 
   2191             for( k = 0; k < nz; k++ )
   2192             {
   2193                 __m128 f = _mm_load_ss(kf+k), t0;
   2194                 f = _mm_shuffle_ps(f, f, 0);
   2195                 t0 = _mm_loadu_ps(src[k] + i);
   2196                 s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
   2197             }
   2198             _mm_storeu_ps(dst + i, s0);
   2199         }
   2200 
   2201         return i;
   2202     }
   2203 
   2204     int _nz;
   2205     std::vector<uchar> coeffs;
   2206     float delta;
   2207 };
   2208 
   2209 
   2210 #elif CV_NEON
   2211 
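         // NEON counterparts of the vector filters above. This row filter handles
         // small symmetric/asymmetric integer kernels (ksize 3 or 5) for 8u input
         // and 32s output; smallValues is true only when every coefficient fits in
         // 16 bits, since the NEON paths multiply in 16-bit lanes.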
   2212 struct SymmRowSmallVec_8u32s
   2213 {
    2214     SymmRowSmallVec_8u32s() { symmetryType = 0; smallValues = false; }
   2215     SymmRowSmallVec_8u32s( const Mat& _kernel, int _symmetryType )
   2216     {
   2217         kernel = _kernel;
   2218         symmetryType = _symmetryType;
   2219         smallValues = true;
   2220         int k, ksize = kernel.rows + kernel.cols - 1;
   2221         for( k = 0; k < ksize; k++ )
   2222         {
   2223             int v = kernel.ptr<int>()[k];
   2224             if( v < SHRT_MIN || v > SHRT_MAX )
   2225             {
   2226                 smallValues = false;
   2227                 break;
   2228             }
   2229         }
   2230     }
   2231 
   2232     int operator()(const uchar* src, uchar* _dst, int width, int cn) const
   2233     {
    2234         if( !checkHardwareSupport(CV_CPU_NEON) )
    2235             return 0;
   2236 
   2237         int i = 0, _ksize = kernel.rows + kernel.cols - 1;
   2238         int* dst = (int*)_dst;
   2239         bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
   2240         const int* kx = kernel.ptr<int>() + _ksize/2;
   2241         if( !smallValues )
   2242             return 0;
   2243 
   2244         src += (_ksize/2)*cn;
   2245         width *= cn;
   2246 
   2247         if( symmetrical )
   2248         {
   2249             if( _ksize == 1 )
   2250                 return 0;
   2251             if( _ksize == 3 )
   2252             {
   2253                 if( kx[0] == 2 && kx[1] == 1 )
   2254                 {
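                             // kernel [1 2 1]: the unsigned 16-bit sums (at most 4*255) are
                             // widened to the 32-bit destination by interleaving them with a
                             // zero vector via vst2q_u16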
   2255                     uint16x8_t zq = vdupq_n_u16(0);
   2256 
   2257                     for( ; i <= width - 8; i += 8, src += 8 )
   2258                     {
   2259                         uint8x8_t x0, x1, x2;
   2260                         x0 = vld1_u8( (uint8_t *) (src - cn) );
   2261                         x1 = vld1_u8( (uint8_t *) (src) );
   2262                         x2 = vld1_u8( (uint8_t *) (src + cn) );
   2263 
   2264                         uint16x8_t y0, y1, y2;
   2265                         y0 = vaddl_u8(x0, x2);
   2266                         y1 = vshll_n_u8(x1, 1);
   2267                         y2 = vaddq_u16(y0, y1);
   2268 
   2269                         uint16x8x2_t str;
   2270                         str.val[0] = y2; str.val[1] = zq;
   2271                         vst2q_u16( (uint16_t *) (dst + i), str );
   2272                     }
   2273                 }
   2274                 else if( kx[0] == -2 && kx[1] == 1 )
   2275                     return 0;
   2276                 else
   2277                 {
   2278                     int32x4_t k32 = vdupq_n_s32(0);
   2279                     k32 = vld1q_lane_s32(kx, k32, 0);
   2280                     k32 = vld1q_lane_s32(kx + 1, k32, 1);
   2281 
   2282                     int16x4_t k = vqmovn_s32(k32);
   2283 
   2284                     uint8x8_t z = vdup_n_u8(0);
   2285 
   2286                     for( ; i <= width - 8; i += 8, src += 8 )
   2287                     {
   2288                         uint8x8_t x0, x1, x2;
   2289                         x0 = vld1_u8( (uint8_t *) (src - cn) );
   2290                         x1 = vld1_u8( (uint8_t *) (src) );
   2291                         x2 = vld1_u8( (uint8_t *) (src + cn) );
   2292 
   2293                         int16x8_t y0, y1;
   2294                         int32x4_t y2, y3;
   2295                         y0 = vreinterpretq_s16_u16(vaddl_u8(x1, z));
   2296                         y1 = vreinterpretq_s16_u16(vaddl_u8(x0, x2));
   2297                         y2 = vmull_lane_s16(vget_low_s16(y0), k, 0);
   2298                         y2 = vmlal_lane_s16(y2, vget_low_s16(y1), k, 1);
   2299                         y3 = vmull_lane_s16(vget_high_s16(y0), k, 0);
   2300                         y3 = vmlal_lane_s16(y3, vget_high_s16(y1), k, 1);
   2301 
   2302                         vst1q_s32((int32_t *)(dst + i), y2);
   2303                         vst1q_s32((int32_t *)(dst + i + 4), y3);
   2304                     }
   2305                 }
   2306             }
   2307             else if( _ksize == 5 )
   2308             {
   2309                 if( kx[0] == -2 && kx[1] == 0 && kx[2] == 1 )
   2310                     return 0;
   2311                 else
   2312                 {
   2313                     int32x4_t k32 = vdupq_n_s32(0);
   2314                     k32 = vld1q_lane_s32(kx, k32, 0);
   2315                     k32 = vld1q_lane_s32(kx + 1, k32, 1);
   2316                     k32 = vld1q_lane_s32(kx + 2, k32, 2);
   2317 
   2318                     int16x4_t k = vqmovn_s32(k32);
   2319 
   2320                     uint8x8_t z = vdup_n_u8(0);
   2321 
   2322                     for( ; i <= width - 8; i += 8, src += 8 )
   2323                     {
   2324                         uint8x8_t x0, x1, x2, x3, x4;
   2325                         x0 = vld1_u8( (uint8_t *) (src - cn) );
   2326                         x1 = vld1_u8( (uint8_t *) (src) );
   2327                         x2 = vld1_u8( (uint8_t *) (src + cn) );
   2328 
   2329                         int16x8_t y0, y1;
   2330                         int32x4_t accl, acch;
   2331                         y0 = vreinterpretq_s16_u16(vaddl_u8(x1, z));
   2332                         y1 = vreinterpretq_s16_u16(vaddl_u8(x0, x2));
   2333                         accl = vmull_lane_s16(vget_low_s16(y0), k, 0);
   2334                         accl = vmlal_lane_s16(accl, vget_low_s16(y1), k, 1);
   2335                         acch = vmull_lane_s16(vget_high_s16(y0), k, 0);
   2336                         acch = vmlal_lane_s16(acch, vget_high_s16(y1), k, 1);
   2337 
   2338                         int16x8_t y2;
   2339                         x3 = vld1_u8( (uint8_t *) (src - cn*2) );
   2340                         x4 = vld1_u8( (uint8_t *) (src + cn*2) );
   2341                         y2 = vreinterpretq_s16_u16(vaddl_u8(x3, x4));
   2342                         accl = vmlal_lane_s16(accl, vget_low_s16(y2), k, 2);
   2343                         acch = vmlal_lane_s16(acch, vget_high_s16(y2), k, 2);
   2344 
   2345                         vst1q_s32((int32_t *)(dst + i), accl);
   2346                         vst1q_s32((int32_t *)(dst + i + 4), acch);
   2347                     }
   2348                 }
   2349             }
   2350         }
   2351         else
   2352         {
   2353             if( _ksize == 3 )
   2354             {
   2355                 if( kx[0] == 0 && kx[1] == 1 )
   2356                 {
   2357                     uint8x8_t z = vdup_n_u8(0);
   2358 
   2359                     for( ; i <= width - 8; i += 8, src += 8 )
   2360                     {
   2361                         uint8x8_t x0, x1;
   2362                         x0 = vld1_u8( (uint8_t *) (src - cn) );
   2363                         x1 = vld1_u8( (uint8_t *) (src + cn) );
   2364 
   2365                         int16x8_t y0;
   2366                         y0 = vsubq_s16(vreinterpretq_s16_u16(vaddl_u8(x1, z)),
   2367                                 vreinterpretq_s16_u16(vaddl_u8(x0, z)));
   2368 
   2369                         vst1q_s32((int32_t *)(dst + i), vmovl_s16(vget_low_s16(y0)));
   2370                         vst1q_s32((int32_t *)(dst + i + 4), vmovl_s16(vget_high_s16(y0)));
   2371                     }
   2372                 }
   2373                 else
   2374                 {
   2375                     int32x4_t k32 = vdupq_n_s32(0);
   2376                     k32 = vld1q_lane_s32(kx + 1, k32, 1);
   2377 
   2378                     int16x4_t k = vqmovn_s32(k32);
   2379 
   2380                     uint8x8_t z = vdup_n_u8(0);
   2381 
   2382                     for( ; i <= width - 8; i += 8, src += 8 )
   2383                     {
   2384                         uint8x8_t x0, x1;
   2385                         x0 = vld1_u8( (uint8_t *) (src - cn) );
   2386                         x1 = vld1_u8( (uint8_t *) (src + cn) );
   2387 
   2388                         int16x8_t y0;
   2389                         int32x4_t y1, y2;
   2390                         y0 = vsubq_s16(vreinterpretq_s16_u16(vaddl_u8(x1, z)),
   2391                             vreinterpretq_s16_u16(vaddl_u8(x0, z)));
   2392                         y1 = vmull_lane_s16(vget_low_s16(y0), k, 1);
   2393                         y2 = vmull_lane_s16(vget_high_s16(y0), k, 1);
   2394 
   2395                         vst1q_s32((int32_t *)(dst + i), y1);
   2396                         vst1q_s32((int32_t *)(dst + i + 4), y2);
   2397                     }
   2398                 }
   2399             }
   2400             else if( _ksize == 5 )
   2401             {
   2402                 int32x4_t k32 = vdupq_n_s32(0);
   2403                 k32 = vld1q_lane_s32(kx + 1, k32, 1);
   2404                 k32 = vld1q_lane_s32(kx + 2, k32, 2);
   2405 
   2406                 int16x4_t k = vqmovn_s32(k32);
   2407 
   2408                 uint8x8_t z = vdup_n_u8(0);
   2409 
   2410                 for( ; i <= width - 8; i += 8, src += 8 )
   2411                 {
   2412                     uint8x8_t x0, x1;
   2413                     x0 = vld1_u8( (uint8_t *) (src - cn) );
   2414                     x1 = vld1_u8( (uint8_t *) (src + cn) );
   2415 
   2416                     int32x4_t accl, acch;
   2417                     int16x8_t y0;
   2418                     y0 = vsubq_s16(vreinterpretq_s16_u16(vaddl_u8(x1, z)),
   2419                         vreinterpretq_s16_u16(vaddl_u8(x0, z)));
   2420                     accl = vmull_lane_s16(vget_low_s16(y0), k, 1);
   2421                     acch = vmull_lane_s16(vget_high_s16(y0), k, 1);
   2422 
   2423                     uint8x8_t x2, x3;
   2424                     x2 = vld1_u8( (uint8_t *) (src - cn*2) );
   2425                     x3 = vld1_u8( (uint8_t *) (src + cn*2) );
   2426 
   2427                     int16x8_t y1;
   2428                     y1 = vsubq_s16(vreinterpretq_s16_u16(vaddl_u8(x3, z)),
   2429                         vreinterpretq_s16_u16(vaddl_u8(x2, z)));
   2430                     accl = vmlal_lane_s16(accl, vget_low_s16(y1), k, 2);
   2431                     acch = vmlal_lane_s16(acch, vget_high_s16(y1), k, 2);
   2432 
   2433                     vst1q_s32((int32_t *)(dst + i), accl);
   2434                     vst1q_s32((int32_t *)(dst + i + 4), acch);
   2435                 }
   2436             }
   2437         }
   2438 
   2439         return i;
   2440     }
   2441 
   2442     Mat kernel;
   2443     int symmetryType;
   2444     bool smallValues;
   2445 };
   2446 
   2447 
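         // NEON column filter, 32s intermediate -> 8u output: the fixed-point kernel
         // is converted to floats, sums are accumulated in float32x4 lanes, and the
         // results are narrowed back with saturation (vqmovn_s32 / vqmovun_s16).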
   2448 struct SymmColumnVec_32s8u
   2449 {
    2450     SymmColumnVec_32s8u() { symmetryType = 0; delta = 0; }
   2451     SymmColumnVec_32s8u(const Mat& _kernel, int _symmetryType, int _bits, double _delta)
   2452     {
   2453         symmetryType = _symmetryType;
   2454         _kernel.convertTo(kernel, CV_32F, 1./(1 << _bits), 0);
   2455         delta = (float)(_delta/(1 << _bits));
   2456         CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
   2457     }
   2458 
   2459     int operator()(const uchar** _src, uchar* dst, int width) const
   2460     {
    2461         if( !checkHardwareSupport(CV_CPU_NEON) )
    2462             return 0;
   2463 
   2464         int _ksize = kernel.rows + kernel.cols - 1;
   2465         int ksize2 = _ksize / 2;
   2466         const float* ky = kernel.ptr<float>() + ksize2;
   2467         int i = 0, k;
   2468         bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
   2469         const int** src = (const int**)_src;
   2470         const int *S, *S2;
   2471 
   2472         float32x4_t d4 = vdupq_n_f32(delta);
   2473 
   2474         if( symmetrical )
   2475         {
   2476             if( _ksize == 1 )
   2477                 return 0;
    2478 
   2480             float32x2_t k32;
   2481             k32 = vdup_n_f32(0);
   2482             k32 = vld1_lane_f32(ky, k32, 0);
   2483             k32 = vld1_lane_f32(ky + 1, k32, 1);
   2484 
   2485             for( ; i <= width - 8; i += 8 )
   2486             {
   2487                 float32x4_t accl, acch;
   2488                 float32x4_t f0l, f0h, f1l, f1h, f2l, f2h;
   2489 
   2490                 S = src[0] + i;
   2491 
   2492                 f0l = vcvtq_f32_s32( vld1q_s32(S) );
   2493                 f0h = vcvtq_f32_s32( vld1q_s32(S + 4) );
   2494 
   2495                 S = src[1] + i;
   2496                 S2 = src[-1] + i;
   2497 
   2498                 f1l = vcvtq_f32_s32( vld1q_s32(S) );
   2499                 f1h = vcvtq_f32_s32( vld1q_s32(S + 4) );
   2500                 f2l = vcvtq_f32_s32( vld1q_s32(S2) );
   2501                 f2h = vcvtq_f32_s32( vld1q_s32(S2 + 4) );
   2502 
   2503                 accl = acch = d4;
   2504                 accl = vmlaq_lane_f32(accl, f0l, k32, 0);
   2505                 acch = vmlaq_lane_f32(acch, f0h, k32, 0);
   2506                 accl = vmlaq_lane_f32(accl, vaddq_f32(f1l, f2l), k32, 1);
   2507                 acch = vmlaq_lane_f32(acch, vaddq_f32(f1h, f2h), k32, 1);
   2508 
   2509                 for( k = 2; k <= ksize2; k++ )
   2510                 {
   2511                     S = src[k] + i;
   2512                     S2 = src[-k] + i;
   2513 
   2514                     float32x4_t f3l, f3h, f4l, f4h;
   2515                     f3l = vcvtq_f32_s32( vld1q_s32(S) );
   2516                     f3h = vcvtq_f32_s32( vld1q_s32(S + 4) );
   2517                     f4l = vcvtq_f32_s32( vld1q_s32(S2) );
   2518                     f4h = vcvtq_f32_s32( vld1q_s32(S2 + 4) );
   2519 
   2520                     accl = vmlaq_n_f32(accl, vaddq_f32(f3l, f4l), ky[k]);
   2521                     acch = vmlaq_n_f32(acch, vaddq_f32(f3h, f4h), ky[k]);
   2522                 }
   2523 
   2524                 int32x4_t s32l, s32h;
   2525                 s32l = vcvtq_s32_f32(accl);
   2526                 s32h = vcvtq_s32_f32(acch);
   2527 
   2528                 int16x4_t s16l, s16h;
   2529                 s16l = vqmovn_s32(s32l);
   2530                 s16h = vqmovn_s32(s32h);
   2531 
   2532                 uint8x8_t u8;
    2533                 u8 = vqmovun_s16(vcombine_s16(s16l, s16h));
   2534 
   2535                 vst1_u8((uint8_t *)(dst + i), u8);
   2536             }
   2537         }
   2538         else
   2539         {
   2540             float32x2_t k32;
   2541             k32 = vdup_n_f32(0);
   2542             k32 = vld1_lane_f32(ky + 1, k32, 1);
   2543 
   2544             for( ; i <= width - 8; i += 8 )
   2545             {
   2546                 float32x4_t accl, acch;
   2547                 float32x4_t f1l, f1h, f2l, f2h;
   2548 
   2549                 S = src[1] + i;
   2550                 S2 = src[-1] + i;
   2551 
   2552                 f1l = vcvtq_f32_s32( vld1q_s32(S) );
   2553                 f1h = vcvtq_f32_s32( vld1q_s32(S + 4) );
   2554                 f2l = vcvtq_f32_s32( vld1q_s32(S2) );
   2555                 f2h = vcvtq_f32_s32( vld1q_s32(S2 + 4) );
   2556 
   2557                 accl = acch = d4;
   2558                 accl = vmlaq_lane_f32(accl, vsubq_f32(f1l, f2l), k32, 1);
   2559                 acch = vmlaq_lane_f32(acch, vsubq_f32(f1h, f2h), k32, 1);
   2560 
   2561                 for( k = 2; k <= ksize2; k++ )
   2562                 {
   2563                     S = src[k] + i;
   2564                     S2 = src[-k] + i;
   2565 
   2566                     float32x4_t f3l, f3h, f4l, f4h;
   2567                     f3l = vcvtq_f32_s32( vld1q_s32(S) );
   2568                     f3h = vcvtq_f32_s32( vld1q_s32(S + 4) );
   2569                     f4l = vcvtq_f32_s32( vld1q_s32(S2) );
   2570                     f4h = vcvtq_f32_s32( vld1q_s32(S2 + 4) );
   2571 
   2572                     accl = vmlaq_n_f32(accl, vsubq_f32(f3l, f4l), ky[k]);
   2573                     acch = vmlaq_n_f32(acch, vsubq_f32(f3h, f4h), ky[k]);
   2574                 }
   2575 
   2576                 int32x4_t s32l, s32h;
   2577                 s32l = vcvtq_s32_f32(accl);
   2578                 s32h = vcvtq_s32_f32(acch);
   2579 
   2580                 int16x4_t s16l, s16h;
   2581                 s16l = vqmovn_s32(s32l);
   2582                 s16h = vqmovn_s32(s32h);
   2583 
   2584                 uint8x8_t u8;
    2585                 u8 = vqmovun_s16(vcombine_s16(s16l, s16h));
   2586 
   2587                 vst1_u8((uint8_t *)(dst + i), u8);
   2588             }
   2589         }
   2590 
   2591         return i;
   2592     }
   2593 
   2594     int symmetryType;
   2595     float delta;
   2596     Mat kernel;
   2597 };
   2598 
   2599 
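         // 3-tap column filter from 32-bit row sums to 16-bit output, with
         // dedicated integer paths for the (1,2,1), (1,-2,1) and (3,10,3)
         // kernels produced by the Sobel/Scharr tables, and a plain central
         // difference path for the antisymmetric case; any other kernel
         // falls through to the generic float path.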
   2600 struct SymmColumnSmallVec_32s16s
   2601 {
   2602     SymmColumnSmallVec_32s16s() { symmetryType=0; }
   2603     SymmColumnSmallVec_32s16s(const Mat& _kernel, int _symmetryType, int _bits, double _delta)
   2604     {
   2605         symmetryType = _symmetryType;
   2606         _kernel.convertTo(kernel, CV_32F, 1./(1 << _bits), 0);
   2607         delta = (float)(_delta/(1 << _bits));
   2608         CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
   2609     }
   2610 
   2611     int operator()(const uchar** _src, uchar* _dst, int width) const
   2612     {
    2613         if( !checkHardwareSupport(CV_CPU_NEON) )
    2614             return 0;
   2615 
   2616         int ksize2 = (kernel.rows + kernel.cols - 1)/2;
   2617         const float* ky = kernel.ptr<float>() + ksize2;
   2618         int i = 0;
   2619         bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
   2620         const int** src = (const int**)_src;
   2621         const int *S0 = src[-1], *S1 = src[0], *S2 = src[1];
   2622         short* dst = (short*)_dst;
   2623         float32x4_t df4 = vdupq_n_f32(delta);
   2624         int32x4_t d4 = vcvtq_s32_f32(df4);
   2625 
   2626         if( symmetrical )
   2627         {
   2628             if( ky[0] == 2 && ky[1] == 1 )
   2629             {
   2630                 for( ; i <= width - 4; i += 4 )
   2631                 {
   2632                     int32x4_t x0, x1, x2;
   2633                     x0 = vld1q_s32((int32_t const *)(S0 + i));
   2634                     x1 = vld1q_s32((int32_t const *)(S1 + i));
   2635                     x2 = vld1q_s32((int32_t const *)(S2 + i));
   2636 
   2637                     int32x4_t y0, y1, y2, y3;
   2638                     y0 = vaddq_s32(x0, x2);
   2639                     y1 = vqshlq_n_s32(x1, 1);
   2640                     y2 = vaddq_s32(y0, y1);
   2641                     y3 = vaddq_s32(y2, d4);
   2642 
   2643                     int16x4_t t;
   2644                     t = vqmovn_s32(y3);
   2645 
   2646                     vst1_s16((int16_t *)(dst + i), t);
   2647                 }
   2648             }
   2649             else if( ky[0] == -2 && ky[1] == 1 )
   2650             {
   2651                 for( ; i <= width - 4; i += 4 )
   2652                 {
   2653                     int32x4_t x0, x1, x2;
   2654                     x0 = vld1q_s32((int32_t const *)(S0 + i));
   2655                     x1 = vld1q_s32((int32_t const *)(S1 + i));
   2656                     x2 = vld1q_s32((int32_t const *)(S2 + i));
   2657 
   2658                     int32x4_t y0, y1, y2, y3;
   2659                     y0 = vaddq_s32(x0, x2);
   2660                     y1 = vqshlq_n_s32(x1, 1);
   2661                     y2 = vsubq_s32(y0, y1);
   2662                     y3 = vaddq_s32(y2, d4);
   2663 
   2664                     int16x4_t t;
   2665                     t = vqmovn_s32(y3);
   2666 
   2667                     vst1_s16((int16_t *)(dst + i), t);
   2668                 }
   2669             }
   2670             else if( ky[0] == 10 && ky[1] == 3 )
   2671             {
   2672                 for( ; i <= width - 4; i += 4 )
   2673                 {
   2674                     int32x4_t x0, x1, x2, x3;
   2675                     x0 = vld1q_s32((int32_t const *)(S0 + i));
   2676                     x1 = vld1q_s32((int32_t const *)(S1 + i));
   2677                     x2 = vld1q_s32((int32_t const *)(S2 + i));
   2678 
   2679                     x3 = vaddq_s32(x0, x2);
   2680 
   2681                     int32x4_t y0;
   2682                     y0 = vmlaq_n_s32(d4, x1, 10);
   2683                     y0 = vmlaq_n_s32(y0, x3, 3);
   2684 
   2685                     int16x4_t t;
   2686                     t = vqmovn_s32(y0);
   2687 
   2688                     vst1_s16((int16_t *)(dst + i), t);
   2689                 }
   2690             }
   2691             else
   2692             {
   2693                 float32x2_t k32 = vdup_n_f32(0);
   2694                 k32 = vld1_lane_f32(ky, k32, 0);
   2695                 k32 = vld1_lane_f32(ky + 1, k32, 1);
   2696 
   2697                 for( ; i <= width - 4; i += 4 )
   2698                 {
   2699                     int32x4_t x0, x1, x2, x3, x4;
   2700                     x0 = vld1q_s32((int32_t const *)(S0 + i));
   2701                     x1 = vld1q_s32((int32_t const *)(S1 + i));
   2702                     x2 = vld1q_s32((int32_t const *)(S2 + i));
   2703 
   2704                     x3 = vaddq_s32(x0, x2);
   2705 
   2706                     float32x4_t s0, s1, s2;
   2707                     s0 = vcvtq_f32_s32(x1);
   2708                     s1 = vcvtq_f32_s32(x3);
   2709                     s2 = vmlaq_lane_f32(df4, s0, k32, 0);
   2710                     s2 = vmlaq_lane_f32(s2, s1, k32, 1);
   2711 
   2712                     x4 = vcvtq_s32_f32(s2);
   2713 
   2714                     int16x4_t x5;
   2715                     x5 = vqmovn_s32(x4);
   2716 
   2717                     vst1_s16((int16_t *)(dst + i), x5);
   2718                 }
   2719             }
   2720         }
   2721         else
   2722         {
   2723             if( fabs(ky[1]) == 1 && ky[1] == -ky[-1] )
   2724             {
   2725                 if( ky[1] < 0 )
   2726                     std::swap(S0, S2);
   2727                 for( ; i <= width - 4; i += 4 )
   2728                 {
   2729                     int32x4_t x0, x1;
   2730                     x0 = vld1q_s32((int32_t const *)(S0 + i));
   2731                     x1 = vld1q_s32((int32_t const *)(S2 + i));
   2732 
   2733                     int32x4_t y0, y1;
   2734                     y0 = vsubq_s32(x1, x0);
   2735                     y1 = vqaddq_s32(y0, d4);
   2736 
   2737                     int16x4_t t;
   2738                     t = vqmovn_s32(y1);
   2739 
   2740                     vst1_s16((int16_t *)(dst + i), t);
   2741                 }
   2742             }
   2743             else
   2744             {
   2745                 float32x2_t k32 = vdup_n_f32(0);
   2746                 k32 = vld1_lane_f32(ky + 1, k32, 1);
   2747 
   2748                 for( ; i <= width - 4; i += 4 )
   2749                 {
   2750                     int32x4_t x0, x1, x2, x3;
   2751                     x0 = vld1q_s32((int32_t const *)(S0 + i));
   2752                     x1 = vld1q_s32((int32_t const *)(S2 + i));
   2753 
   2754                     x2 = vsubq_s32(x1, x0);
   2755 
   2756                     float32x4_t s0, s1;
   2757                     s0 = vcvtq_f32_s32(x2);
   2758                     s1 = vmlaq_lane_f32(df4, s0, k32, 1);
   2759 
   2760                     x3 = vcvtq_s32_f32(s1);
   2761 
   2762                     int16x4_t x4;
   2763                     x4 = vqmovn_s32(x3);
   2764 
   2765                     vst1_s16((int16_t *)(dst + i), x4);
   2766                 }
   2767             }
   2768         }
   2769 
   2770         return i;
   2771     }
   2772 
   2773     int symmetryType;
   2774     float delta;
   2775     Mat kernel;
   2776 };
   2777 
   2778 
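         // Column filter for a CV_32F intermediate buffer and CV_16S output
         // (see getLinearColumnFilter below). Same tap-pairing scheme as
         // SymmColumnVec_32s8u, but the rows are already float, so only the
         // final narrowing to int16 needs saturation.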
   2779 struct SymmColumnVec_32f16s
   2780 {
   2781     SymmColumnVec_32f16s() { symmetryType=0; }
   2782     SymmColumnVec_32f16s(const Mat& _kernel, int _symmetryType, int, double _delta)
   2783     {
   2784         symmetryType = _symmetryType;
   2785         kernel = _kernel;
   2786         delta = (float)_delta;
   2787         CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
    2788         neon_supported = checkHardwareSupport(CV_CPU_NEON);
   2789     }
   2790 
   2791     int operator()(const uchar** _src, uchar* _dst, int width) const
   2792     {
    2793         if( !neon_supported )
    2794             return 0;
   2795 
   2796         int _ksize = kernel.rows + kernel.cols - 1;
   2797         int ksize2 = _ksize / 2;
   2798         const float* ky = kernel.ptr<float>() + ksize2;
   2799         int i = 0, k;
   2800         bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
   2801         const float** src = (const float**)_src;
   2802         const float *S, *S2;
   2803         short* dst = (short*)_dst;
   2804 
   2805         float32x4_t d4 = vdupq_n_f32(delta);
   2806 
   2807         if( symmetrical )
   2808         {
   2809             if( _ksize == 1 )
   2810                 return 0;
   2811 
   2812 
   2813             float32x2_t k32;
   2814             k32 = vdup_n_f32(0);
   2815             k32 = vld1_lane_f32(ky, k32, 0);
   2816             k32 = vld1_lane_f32(ky + 1, k32, 1);
   2817 
   2818             for( ; i <= width - 8; i += 8 )
   2819             {
   2820                 float32x4_t x0l, x0h, x1l, x1h, x2l, x2h;
   2821                 float32x4_t accl, acch;
   2822 
   2823                 S = src[0] + i;
   2824 
   2825                 x0l = vld1q_f32(S);
   2826                 x0h = vld1q_f32(S + 4);
   2827 
   2828                 S = src[1] + i;
   2829                 S2 = src[-1] + i;
   2830 
   2831                 x1l = vld1q_f32(S);
   2832                 x1h = vld1q_f32(S + 4);
   2833                 x2l = vld1q_f32(S2);
   2834                 x2h = vld1q_f32(S2 + 4);
   2835 
   2836                 accl = acch = d4;
   2837                 accl = vmlaq_lane_f32(accl, x0l, k32, 0);
   2838                 acch = vmlaq_lane_f32(acch, x0h, k32, 0);
   2839                 accl = vmlaq_lane_f32(accl, vaddq_f32(x1l, x2l), k32, 1);
   2840                 acch = vmlaq_lane_f32(acch, vaddq_f32(x1h, x2h), k32, 1);
   2841 
   2842                 for( k = 2; k <= ksize2; k++ )
   2843                 {
   2844                     S = src[k] + i;
   2845                     S2 = src[-k] + i;
   2846 
   2847                     float32x4_t x3l, x3h, x4l, x4h;
   2848                     x3l = vld1q_f32(S);
   2849                     x3h = vld1q_f32(S + 4);
   2850                     x4l = vld1q_f32(S2);
   2851                     x4h = vld1q_f32(S2 + 4);
   2852 
   2853                     accl = vmlaq_n_f32(accl, vaddq_f32(x3l, x4l), ky[k]);
   2854                     acch = vmlaq_n_f32(acch, vaddq_f32(x3h, x4h), ky[k]);
   2855                 }
   2856 
   2857                 int32x4_t s32l, s32h;
   2858                 s32l = vcvtq_s32_f32(accl);
   2859                 s32h = vcvtq_s32_f32(acch);
   2860 
   2861                 int16x4_t s16l, s16h;
   2862                 s16l = vqmovn_s32(s32l);
   2863                 s16h = vqmovn_s32(s32h);
   2864 
   2865                 vst1_s16((int16_t *)(dst + i), s16l);
   2866                 vst1_s16((int16_t *)(dst + i + 4), s16h);
   2867             }
   2868         }
   2869         else
   2870         {
   2871             float32x2_t k32;
   2872             k32 = vdup_n_f32(0);
   2873             k32 = vld1_lane_f32(ky + 1, k32, 1);
   2874 
   2875             for( ; i <= width - 8; i += 8 )
   2876             {
   2877                 float32x4_t x1l, x1h, x2l, x2h;
   2878                 float32x4_t accl, acch;
   2879 
   2880                 S = src[1] + i;
   2881                 S2 = src[-1] + i;
   2882 
   2883                 x1l = vld1q_f32(S);
   2884                 x1h = vld1q_f32(S + 4);
   2885                 x2l = vld1q_f32(S2);
   2886                 x2h = vld1q_f32(S2 + 4);
   2887 
   2888                 accl = acch = d4;
   2889                 accl = vmlaq_lane_f32(accl, vsubq_f32(x1l, x2l), k32, 1);
   2890                 acch = vmlaq_lane_f32(acch, vsubq_f32(x1h, x2h), k32, 1);
   2891 
   2892                 for( k = 2; k <= ksize2; k++ )
   2893                 {
   2894                     S = src[k] + i;
   2895                     S2 = src[-k] + i;
   2896 
   2897                     float32x4_t x3l, x3h, x4l, x4h;
   2898                     x3l = vld1q_f32(S);
   2899                     x3h = vld1q_f32(S + 4);
   2900                     x4l = vld1q_f32(S2);
   2901                     x4h = vld1q_f32(S2 + 4);
   2902 
   2903                     accl = vmlaq_n_f32(accl, vsubq_f32(x3l, x4l), ky[k]);
   2904                     acch = vmlaq_n_f32(acch, vsubq_f32(x3h, x4h), ky[k]);
   2905                 }
   2906 
   2907                 int32x4_t s32l, s32h;
   2908                 s32l = vcvtq_s32_f32(accl);
   2909                 s32h = vcvtq_s32_f32(acch);
   2910 
   2911                 int16x4_t s16l, s16h;
   2912                 s16l = vqmovn_s32(s32l);
   2913                 s16h = vqmovn_s32(s32h);
   2914 
   2915                 vst1_s16((int16_t *)(dst + i), s16l);
   2916                 vst1_s16((int16_t *)(dst + i + 4), s16h);
   2917             }
   2918         }
   2919 
   2920         return i;
   2921     }
   2922 
   2923     int symmetryType;
   2924     float delta;
   2925     Mat kernel;
   2926     bool neon_supported;
   2927 };
   2928 
   2929 
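         // Small-kernel (ksize <= 5) horizontal pass on float data. Only
         // the general symmetric 5-tap shape and the antisymmetric 5-tap
         // shape are vectorized; every other case returns 0 and is left to
         // the scalar SymmRowSmallFilter loop.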
   2930 struct SymmRowSmallVec_32f
   2931 {
   2932     SymmRowSmallVec_32f() {}
   2933     SymmRowSmallVec_32f( const Mat& _kernel, int _symmetryType )
   2934     {
   2935         kernel = _kernel;
   2936         symmetryType = _symmetryType;
   2937     }
   2938 
   2939     int operator()(const uchar* _src, uchar* _dst, int width, int cn) const
   2940     {
    2941         if( !checkHardwareSupport(CV_CPU_NEON) )
    2942             return 0;
   2943 
   2944         int i = 0, _ksize = kernel.rows + kernel.cols - 1;
   2945         float* dst = (float*)_dst;
   2946         const float* src = (const float*)_src + (_ksize/2)*cn;
   2947         bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
   2948         const float* kx = kernel.ptr<float>() + _ksize/2;
   2949         width *= cn;
   2950 
   2951         if( symmetrical )
   2952         {
   2953             if( _ksize == 1 )
   2954                 return 0;
    2955             if( _ksize == 3 )
    2956             {
    2957                 // None of the 3-tap symmetric cases is vectorized in
    2958                 // this NEON build: neither the (1,2,1) smoothing
    2959                 // kernel, nor the (1,-2,1) second-derivative kernel,
    2960                 // nor the general 3-tap kernel.
    2961                 // Returning 0 reports "no pixels done", so the
    2962                 // scalar SymmRowSmallFilter loop processes the whole
    2963                 // row instead.
    2964                 return 0;
    2965             }
   2966             else if( _ksize == 5 )
   2967             {
   2968                 if( kx[0] == -2 && kx[1] == 0 && kx[2] == 1 )
   2969                     return 0;
   2970                 else
   2971                 {
   2972                     float32x2_t k0, k1;
   2973                     k0 = k1 = vdup_n_f32(0);
   2974                     k0 = vld1_lane_f32(kx + 0, k0, 0);
   2975                     k0 = vld1_lane_f32(kx + 1, k0, 1);
   2976                     k1 = vld1_lane_f32(kx + 2, k1, 0);
   2977 
   2978                     for( ; i <= width - 4; i += 4, src += 4 )
   2979                     {
   2980                         float32x4_t x0, x1, x2, x3, x4;
   2981                         x0 = vld1q_f32(src);
   2982                         x1 = vld1q_f32(src - cn);
   2983                         x2 = vld1q_f32(src + cn);
   2984                         x3 = vld1q_f32(src - cn*2);
   2985                         x4 = vld1q_f32(src + cn*2);
   2986 
   2987                         float32x4_t y0;
   2988                         y0 = vmulq_lane_f32(x0, k0, 0);
   2989                         y0 = vmlaq_lane_f32(y0, vaddq_f32(x1, x2), k0, 1);
   2990                         y0 = vmlaq_lane_f32(y0, vaddq_f32(x3, x4), k1, 0);
   2991 
   2992                         vst1q_f32(dst + i, y0);
   2993                     }
   2994                 }
   2995             }
   2996         }
   2997         else
   2998         {
    2999             if( _ksize == 3 )
    3000             {
    3001                 // The 3-tap antisymmetric cases (the plain central
    3002                 // difference (-1,0,1) fast path and the general
    3003                 // scaled variant) are not vectorized here;
    3004                 // returning 0 hands the whole row to the scalar
    3005                 // SymmRowSmallFilter loop.
    3006                 return 0;
    3007             }
   3008             else if( _ksize == 5 )
   3009             {
   3010                 float32x2_t k;
   3011                 k = vdup_n_f32(0);
   3012                 k = vld1_lane_f32(kx + 1, k, 0);
   3013                 k = vld1_lane_f32(kx + 2, k, 1);
   3014 
   3015                 for( ; i <= width - 4; i += 4, src += 4 )
   3016                 {
   3017                     float32x4_t x0, x1, x2, x3;
   3018                     x0 = vld1q_f32(src - cn);
   3019                     x1 = vld1q_f32(src + cn);
   3020                     x2 = vld1q_f32(src - cn*2);
   3021                     x3 = vld1q_f32(src + cn*2);
   3022 
   3023                     float32x4_t y0;
   3024                     y0 = vmulq_lane_f32(vsubq_f32(x1, x0), k, 0);
   3025                     y0 = vmlaq_lane_f32(y0, vsubq_f32(x3, x2), k, 1);
   3026 
   3027                     vst1q_f32(dst + i, y0);
   3028                 }
   3029             }
   3030         }
   3031 
   3032         return i;
   3033     }
   3034 
   3035     Mat kernel;
   3036     int symmetryType;
   3037 };
   3038 
   3039 
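         // Combinations the NEON build does not vectorize fall back to the
         // NoVec stubs, whose operator() returns 0 ("no pixels processed")
         // so the scalar loops in the filter templates below do all of the
         // work.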
   3040 typedef RowNoVec RowVec_8u32s;
   3041 typedef RowNoVec RowVec_16s32f;
   3042 typedef RowNoVec RowVec_32f;
   3043 typedef ColumnNoVec SymmColumnVec_32f;
   3044 typedef SymmColumnSmallNoVec SymmColumnSmallVec_32f;
   3045 typedef FilterNoVec FilterVec_8u;
   3046 typedef FilterNoVec FilterVec_8u16s;
   3047 typedef FilterNoVec FilterVec_32f;
   3048 
   3049 
   3050 #else
   3051 
   3052 typedef RowNoVec RowVec_8u32s;
   3053 typedef RowNoVec RowVec_16s32f;
   3054 typedef RowNoVec RowVec_32f;
   3055 typedef SymmRowSmallNoVec SymmRowSmallVec_8u32s;
   3056 typedef SymmRowSmallNoVec SymmRowSmallVec_32f;
   3057 typedef ColumnNoVec SymmColumnVec_32s8u;
   3058 typedef ColumnNoVec SymmColumnVec_32f16s;
   3059 typedef ColumnNoVec SymmColumnVec_32f;
   3060 typedef SymmColumnSmallNoVec SymmColumnSmallVec_32s16s;
   3061 typedef SymmColumnSmallNoVec SymmColumnSmallVec_32f;
   3062 typedef FilterNoVec FilterVec_8u;
   3063 typedef FilterNoVec FilterVec_8u16s;
   3064 typedef FilterNoVec FilterVec_32f;
   3065 
   3066 #endif
   3067 
   3068 
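         // Generic horizontal correlation: for every output element,
         //     D[i] = sum_{k=0..ksize-1} kx[k] * S[i + k*cn]
         // vecOp processes a SIMD prefix of the row and returns how far it
         // got; the (optionally unrolled) scalar loops finish the rest.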
   3069 template<typename ST, typename DT, class VecOp> struct RowFilter : public BaseRowFilter
   3070 {
   3071     RowFilter( const Mat& _kernel, int _anchor, const VecOp& _vecOp=VecOp() )
   3072     {
   3073         if( _kernel.isContinuous() )
   3074             kernel = _kernel;
   3075         else
   3076             _kernel.copyTo(kernel);
   3077         anchor = _anchor;
   3078         ksize = kernel.rows + kernel.cols - 1;
   3079         CV_Assert( kernel.type() == DataType<DT>::type &&
   3080                    (kernel.rows == 1 || kernel.cols == 1));
   3081         vecOp = _vecOp;
   3082     }
   3083 
   3084     void operator()(const uchar* src, uchar* dst, int width, int cn)
   3085     {
   3086         int _ksize = ksize;
   3087         const DT* kx = kernel.ptr<DT>();
   3088         const ST* S;
   3089         DT* D = (DT*)dst;
   3090         int i, k;
   3091 
   3092         i = vecOp(src, dst, width, cn);
   3093         width *= cn;
   3094         #if CV_ENABLE_UNROLLED
   3095         for( ; i <= width - 4; i += 4 )
   3096         {
   3097             S = (const ST*)src + i;
   3098             DT f = kx[0];
   3099             DT s0 = f*S[0], s1 = f*S[1], s2 = f*S[2], s3 = f*S[3];
   3100 
   3101             for( k = 1; k < _ksize; k++ )
   3102             {
   3103                 S += cn;
   3104                 f = kx[k];
   3105                 s0 += f*S[0]; s1 += f*S[1];
   3106                 s2 += f*S[2]; s3 += f*S[3];
   3107             }
   3108 
   3109             D[i] = s0; D[i+1] = s1;
   3110             D[i+2] = s2; D[i+3] = s3;
   3111         }
   3112         #endif
   3113         for( ; i < width; i++ )
   3114         {
   3115             S = (const ST*)src + i;
   3116             DT s0 = kx[0]*S[0];
   3117             for( k = 1; k < _ksize; k++ )
   3118             {
   3119                 S += cn;
   3120                 s0 += kx[k]*S[0];
   3121             }
   3122             D[i] = s0;
   3123         }
   3124     }
   3125 
   3126     Mat kernel;
   3127     VecOp vecOp;
   3128 };
   3129 
   3130 
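         // Specialization for small (ksize <= 5) symmetric or antisymmetric
         // row kernels: taps are paired around the anchor so each pair
         // costs one add/subtract and one multiply, with hard-coded inner
         // loops for the ubiquitous (1,2,1), (1,-2,1) and (-1,0,1) cases.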
   3131 template<typename ST, typename DT, class VecOp> struct SymmRowSmallFilter :
   3132     public RowFilter<ST, DT, VecOp>
   3133 {
   3134     SymmRowSmallFilter( const Mat& _kernel, int _anchor, int _symmetryType,
   3135                         const VecOp& _vecOp = VecOp())
   3136         : RowFilter<ST, DT, VecOp>( _kernel, _anchor, _vecOp )
   3137     {
   3138         symmetryType = _symmetryType;
   3139         CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 && this->ksize <= 5 );
   3140     }
   3141 
   3142     void operator()(const uchar* src, uchar* dst, int width, int cn)
   3143     {
   3144         int ksize2 = this->ksize/2, ksize2n = ksize2*cn;
   3145         const DT* kx = this->kernel.template ptr<DT>() + ksize2;
   3146         bool symmetrical = (this->symmetryType & KERNEL_SYMMETRICAL) != 0;
   3147         DT* D = (DT*)dst;
   3148         int i = this->vecOp(src, dst, width, cn), j, k;
   3149         const ST* S = (const ST*)src + i + ksize2n;
   3150         width *= cn;
   3151 
   3152         if( symmetrical )
   3153         {
   3154             if( this->ksize == 1 && kx[0] == 1 )
   3155             {
   3156                 for( ; i <= width - 2; i += 2 )
   3157                 {
   3158                     DT s0 = S[i], s1 = S[i+1];
   3159                     D[i] = s0; D[i+1] = s1;
   3160                 }
   3161                 S += i;
   3162             }
   3163             else if( this->ksize == 3 )
   3164             {
   3165                 if( kx[0] == 2 && kx[1] == 1 )
   3166                     for( ; i <= width - 2; i += 2, S += 2 )
   3167                     {
   3168                         DT s0 = S[-cn] + S[0]*2 + S[cn], s1 = S[1-cn] + S[1]*2 + S[1+cn];
   3169                         D[i] = s0; D[i+1] = s1;
   3170                     }
   3171                 else if( kx[0] == -2 && kx[1] == 1 )
   3172                     for( ; i <= width - 2; i += 2, S += 2 )
   3173                     {
   3174                         DT s0 = S[-cn] - S[0]*2 + S[cn], s1 = S[1-cn] - S[1]*2 + S[1+cn];
   3175                         D[i] = s0; D[i+1] = s1;
   3176                     }
   3177                 else
   3178                 {
   3179                     DT k0 = kx[0], k1 = kx[1];
   3180                     for( ; i <= width - 2; i += 2, S += 2 )
   3181                     {
   3182                         DT s0 = S[0]*k0 + (S[-cn] + S[cn])*k1, s1 = S[1]*k0 + (S[1-cn] + S[1+cn])*k1;
   3183                         D[i] = s0; D[i+1] = s1;
   3184                     }
   3185                 }
   3186             }
   3187             else if( this->ksize == 5 )
   3188             {
   3189                 DT k0 = kx[0], k1 = kx[1], k2 = kx[2];
   3190                 if( k0 == -2 && k1 == 0 && k2 == 1 )
   3191                     for( ; i <= width - 2; i += 2, S += 2 )
   3192                     {
   3193                         DT s0 = -2*S[0] + S[-cn*2] + S[cn*2];
   3194                         DT s1 = -2*S[1] + S[1-cn*2] + S[1+cn*2];
   3195                         D[i] = s0; D[i+1] = s1;
   3196                     }
   3197                 else
   3198                     for( ; i <= width - 2; i += 2, S += 2 )
   3199                     {
   3200                         DT s0 = S[0]*k0 + (S[-cn] + S[cn])*k1 + (S[-cn*2] + S[cn*2])*k2;
   3201                         DT s1 = S[1]*k0 + (S[1-cn] + S[1+cn])*k1 + (S[1-cn*2] + S[1+cn*2])*k2;
   3202                         D[i] = s0; D[i+1] = s1;
   3203                     }
   3204             }
   3205 
   3206             for( ; i < width; i++, S++ )
   3207             {
   3208                 DT s0 = kx[0]*S[0];
   3209                 for( k = 1, j = cn; k <= ksize2; k++, j += cn )
   3210                     s0 += kx[k]*(S[j] + S[-j]);
   3211                 D[i] = s0;
   3212             }
   3213         }
   3214         else
   3215         {
   3216             if( this->ksize == 3 )
   3217             {
   3218                 if( kx[0] == 0 && kx[1] == 1 )
   3219                     for( ; i <= width - 2; i += 2, S += 2 )
   3220                     {
   3221                         DT s0 = S[cn] - S[-cn], s1 = S[1+cn] - S[1-cn];
   3222                         D[i] = s0; D[i+1] = s1;
   3223                     }
   3224                 else
   3225                 {
   3226                     DT k1 = kx[1];
   3227                     for( ; i <= width - 2; i += 2, S += 2 )
   3228                     {
   3229                         DT s0 = (S[cn] - S[-cn])*k1, s1 = (S[1+cn] - S[1-cn])*k1;
   3230                         D[i] = s0; D[i+1] = s1;
   3231                     }
   3232                 }
   3233             }
   3234             else if( this->ksize == 5 )
   3235             {
   3236                 DT k1 = kx[1], k2 = kx[2];
   3237                 for( ; i <= width - 2; i += 2, S += 2 )
   3238                 {
   3239                     DT s0 = (S[cn] - S[-cn])*k1 + (S[cn*2] - S[-cn*2])*k2;
   3240                     DT s1 = (S[1+cn] - S[1-cn])*k1 + (S[1+cn*2] - S[1-cn*2])*k2;
   3241                     D[i] = s0; D[i+1] = s1;
   3242                 }
   3243             }
   3244 
   3245             for( ; i < width; i++, S++ )
   3246             {
   3247                 DT s0 = kx[0]*S[0];
   3248                 for( k = 1, j = cn; k <= ksize2; k++, j += cn )
   3249                     s0 += kx[k]*(S[j] - S[-j]);
   3250                 D[i] = s0;
   3251             }
   3252         }
   3253     }
   3254 
   3255     int symmetryType;
   3256 };
   3257 
   3258 
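         // Generic vertical correlation over the window of buffered rows:
         // src[k] is the k-th row pointer, and each output row is
         //     D[i] = castOp( delta + sum_{k} ky[k] * src[k][i] )
         // where castOp saturates/rescales into the destination type.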
   3259 template<class CastOp, class VecOp> struct ColumnFilter : public BaseColumnFilter
   3260 {
   3261     typedef typename CastOp::type1 ST;
   3262     typedef typename CastOp::rtype DT;
   3263 
   3264     ColumnFilter( const Mat& _kernel, int _anchor,
   3265         double _delta, const CastOp& _castOp=CastOp(),
   3266         const VecOp& _vecOp=VecOp() )
   3267     {
   3268         if( _kernel.isContinuous() )
   3269             kernel = _kernel;
   3270         else
   3271             _kernel.copyTo(kernel);
   3272         anchor = _anchor;
   3273         ksize = kernel.rows + kernel.cols - 1;
   3274         delta = saturate_cast<ST>(_delta);
   3275         castOp0 = _castOp;
   3276         vecOp = _vecOp;
   3277         CV_Assert( kernel.type() == DataType<ST>::type &&
   3278                    (kernel.rows == 1 || kernel.cols == 1));
   3279     }
   3280 
   3281     void operator()(const uchar** src, uchar* dst, int dststep, int count, int width)
   3282     {
   3283         const ST* ky = kernel.template ptr<ST>();
   3284         ST _delta = delta;
   3285         int _ksize = ksize;
   3286         int i, k;
   3287         CastOp castOp = castOp0;
   3288 
   3289         for( ; count--; dst += dststep, src++ )
   3290         {
   3291             DT* D = (DT*)dst;
   3292             i = vecOp(src, dst, width);
   3293             #if CV_ENABLE_UNROLLED
   3294             for( ; i <= width - 4; i += 4 )
   3295             {
   3296                 ST f = ky[0];
   3297                 const ST* S = (const ST*)src[0] + i;
   3298                 ST s0 = f*S[0] + _delta, s1 = f*S[1] + _delta,
   3299                     s2 = f*S[2] + _delta, s3 = f*S[3] + _delta;
   3300 
   3301                 for( k = 1; k < _ksize; k++ )
   3302                 {
   3303                     S = (const ST*)src[k] + i; f = ky[k];
   3304                     s0 += f*S[0]; s1 += f*S[1];
   3305                     s2 += f*S[2]; s3 += f*S[3];
   3306                 }
   3307 
   3308                 D[i] = castOp(s0); D[i+1] = castOp(s1);
   3309                 D[i+2] = castOp(s2); D[i+3] = castOp(s3);
   3310             }
   3311             #endif
   3312             for( ; i < width; i++ )
   3313             {
   3314                 ST s0 = ky[0]*((const ST*)src[0])[i] + _delta;
   3315                 for( k = 1; k < _ksize; k++ )
   3316                     s0 += ky[k]*((const ST*)src[k])[i];
   3317                 D[i] = castOp(s0);
   3318             }
   3319         }
   3320     }
   3321 
   3322     Mat kernel;
   3323     CastOp castOp0;
   3324     VecOp vecOp;
   3325     ST delta;
   3326 };
   3327 
   3328 
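         // Symmetric/antisymmetric variant of ColumnFilter: src is advanced
         // to the central row, so the rows at +k and -k can be added
         // (symmetric) or subtracted (antisymmetric) before a single
         // multiply by ky[k].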
   3329 template<class CastOp, class VecOp> struct SymmColumnFilter : public ColumnFilter<CastOp, VecOp>
   3330 {
   3331     typedef typename CastOp::type1 ST;
   3332     typedef typename CastOp::rtype DT;
   3333 
   3334     SymmColumnFilter( const Mat& _kernel, int _anchor,
   3335         double _delta, int _symmetryType,
   3336         const CastOp& _castOp=CastOp(),
   3337         const VecOp& _vecOp=VecOp())
   3338         : ColumnFilter<CastOp, VecOp>( _kernel, _anchor, _delta, _castOp, _vecOp )
   3339     {
   3340         symmetryType = _symmetryType;
   3341         CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
   3342     }
   3343 
   3344     void operator()(const uchar** src, uchar* dst, int dststep, int count, int width)
   3345     {
   3346         int ksize2 = this->ksize/2;
   3347         const ST* ky = this->kernel.template ptr<ST>() + ksize2;
   3348         int i, k;
   3349         bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
   3350         ST _delta = this->delta;
   3351         CastOp castOp = this->castOp0;
   3352         src += ksize2;
   3353 
   3354         if( symmetrical )
   3355         {
   3356             for( ; count--; dst += dststep, src++ )
   3357             {
   3358                 DT* D = (DT*)dst;
   3359                 i = (this->vecOp)(src, dst, width);
   3360                 #if CV_ENABLE_UNROLLED
   3361                 for( ; i <= width - 4; i += 4 )
   3362                 {
   3363                     ST f = ky[0];
   3364                     const ST* S = (const ST*)src[0] + i, *S2;
   3365                     ST s0 = f*S[0] + _delta, s1 = f*S[1] + _delta,
   3366                         s2 = f*S[2] + _delta, s3 = f*S[3] + _delta;
   3367 
   3368                     for( k = 1; k <= ksize2; k++ )
   3369                     {
   3370                         S = (const ST*)src[k] + i;
   3371                         S2 = (const ST*)src[-k] + i;
   3372                         f = ky[k];
   3373                         s0 += f*(S[0] + S2[0]);
   3374                         s1 += f*(S[1] + S2[1]);
   3375                         s2 += f*(S[2] + S2[2]);
   3376                         s3 += f*(S[3] + S2[3]);
   3377                     }
   3378 
   3379                     D[i] = castOp(s0); D[i+1] = castOp(s1);
   3380                     D[i+2] = castOp(s2); D[i+3] = castOp(s3);
   3381                 }
   3382                 #endif
   3383                 for( ; i < width; i++ )
   3384                 {
   3385                     ST s0 = ky[0]*((const ST*)src[0])[i] + _delta;
   3386                     for( k = 1; k <= ksize2; k++ )
   3387                         s0 += ky[k]*(((const ST*)src[k])[i] + ((const ST*)src[-k])[i]);
   3388                     D[i] = castOp(s0);
   3389                 }
   3390             }
   3391         }
   3392         else
   3393         {
   3394             for( ; count--; dst += dststep, src++ )
   3395             {
   3396                 DT* D = (DT*)dst;
   3397                 i = this->vecOp(src, dst, width);
   3398                 #if CV_ENABLE_UNROLLED
   3399                 for( ; i <= width - 4; i += 4 )
   3400                 {
   3401                     ST f = ky[0];
   3402                     const ST *S, *S2;
   3403                     ST s0 = _delta, s1 = _delta, s2 = _delta, s3 = _delta;
   3404 
   3405                     for( k = 1; k <= ksize2; k++ )
   3406                     {
   3407                         S = (const ST*)src[k] + i;
   3408                         S2 = (const ST*)src[-k] + i;
   3409                         f = ky[k];
   3410                         s0 += f*(S[0] - S2[0]);
   3411                         s1 += f*(S[1] - S2[1]);
   3412                         s2 += f*(S[2] - S2[2]);
   3413                         s3 += f*(S[3] - S2[3]);
   3414                     }
   3415 
   3416                     D[i] = castOp(s0); D[i+1] = castOp(s1);
   3417                     D[i+2] = castOp(s2); D[i+3] = castOp(s3);
   3418                 }
   3419                 #endif
   3420                 for( ; i < width; i++ )
   3421                 {
   3422                     ST s0 = _delta;
   3423                     for( k = 1; k <= ksize2; k++ )
   3424                         s0 += ky[k]*(((const ST*)src[k])[i] - ((const ST*)src[-k])[i]);
   3425                     D[i] = castOp(s0);
   3426                 }
   3427             }
   3428         }
   3429     }
   3430 
   3431     int symmetryType;
   3432 };
   3433 
   3434 
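         // 3-tap-only column filter (asserted in the constructor): S0, S1,
         // S2 are the three buffered rows, and the common (1,2,1), (1,-2,1)
         // and (-1,0,1) kernels each get a dedicated inner loop of their
         // own.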
   3435 template<class CastOp, class VecOp>
   3436 struct SymmColumnSmallFilter : public SymmColumnFilter<CastOp, VecOp>
   3437 {
   3438     typedef typename CastOp::type1 ST;
   3439     typedef typename CastOp::rtype DT;
   3440 
   3441     SymmColumnSmallFilter( const Mat& _kernel, int _anchor,
   3442                            double _delta, int _symmetryType,
   3443                            const CastOp& _castOp=CastOp(),
   3444                            const VecOp& _vecOp=VecOp())
   3445         : SymmColumnFilter<CastOp, VecOp>( _kernel, _anchor, _delta, _symmetryType, _castOp, _vecOp )
   3446     {
   3447         CV_Assert( this->ksize == 3 );
   3448     }
   3449 
   3450     void operator()(const uchar** src, uchar* dst, int dststep, int count, int width)
   3451     {
   3452         int ksize2 = this->ksize/2;
   3453         const ST* ky = this->kernel.template ptr<ST>() + ksize2;
   3454         int i;
   3455         bool symmetrical = (this->symmetryType & KERNEL_SYMMETRICAL) != 0;
   3456         bool is_1_2_1 = ky[0] == 2 && ky[1] == 1;
   3457         bool is_1_m2_1 = ky[0] == -2 && ky[1] == 1;
   3458         bool is_m1_0_1 = ky[0] == 0 && (ky[1] == 1 || ky[1] == -1);
   3459         ST f0 = ky[0], f1 = ky[1];
   3460         ST _delta = this->delta;
   3461         CastOp castOp = this->castOp0;
   3462         src += ksize2;
   3463 
   3464         for( ; count--; dst += dststep, src++ )
   3465         {
   3466             DT* D = (DT*)dst;
   3467             i = (this->vecOp)(src, dst, width);
   3468             const ST* S0 = (const ST*)src[-1];
   3469             const ST* S1 = (const ST*)src[0];
   3470             const ST* S2 = (const ST*)src[1];
   3471 
   3472             if( symmetrical )
   3473             {
   3474                 if( is_1_2_1 )
   3475                 {
   3476                     #if CV_ENABLE_UNROLLED
   3477                     for( ; i <= width - 4; i += 4 )
   3478                     {
   3479                         ST s0 = S0[i] + S1[i]*2 + S2[i] + _delta;
   3480                         ST s1 = S0[i+1] + S1[i+1]*2 + S2[i+1] + _delta;
   3481                         D[i] = castOp(s0);
   3482                         D[i+1] = castOp(s1);
   3483 
   3484                         s0 = S0[i+2] + S1[i+2]*2 + S2[i+2] + _delta;
   3485                         s1 = S0[i+3] + S1[i+3]*2 + S2[i+3] + _delta;
   3486                         D[i+2] = castOp(s0);
   3487                         D[i+3] = castOp(s1);
   3488                     }
   3489                     #endif
   3490                     for( ; i < width; i ++ )
   3491                     {
   3492                         ST s0 = S0[i] + S1[i]*2 + S2[i] + _delta;
   3493                         D[i] = castOp(s0);
   3494                     }
   3495                 }
   3496                 else if( is_1_m2_1 )
   3497                 {
   3498                     #if CV_ENABLE_UNROLLED
   3499                     for( ; i <= width - 4; i += 4 )
   3500                     {
   3501                         ST s0 = S0[i] - S1[i]*2 + S2[i] + _delta;
   3502                         ST s1 = S0[i+1] - S1[i+1]*2 + S2[i+1] + _delta;
   3503                         D[i] = castOp(s0);
   3504                         D[i+1] = castOp(s1);
   3505 
   3506                         s0 = S0[i+2] - S1[i+2]*2 + S2[i+2] + _delta;
   3507                         s1 = S0[i+3] - S1[i+3]*2 + S2[i+3] + _delta;
   3508                         D[i+2] = castOp(s0);
   3509                         D[i+3] = castOp(s1);
   3510                     }
   3511                     #endif
   3512                     for( ; i < width; i ++ )
   3513                     {
   3514                         ST s0 = S0[i] - S1[i]*2 + S2[i] + _delta;
   3515                         D[i] = castOp(s0);
   3516                     }
   3517                 }
   3518                 else
   3519                 {
   3520                     #if CV_ENABLE_UNROLLED
   3521                     for( ; i <= width - 4; i += 4 )
   3522                     {
   3523                         ST s0 = (S0[i] + S2[i])*f1 + S1[i]*f0 + _delta;
   3524                         ST s1 = (S0[i+1] + S2[i+1])*f1 + S1[i+1]*f0 + _delta;
   3525                         D[i] = castOp(s0);
   3526                         D[i+1] = castOp(s1);
   3527 
   3528                         s0 = (S0[i+2] + S2[i+2])*f1 + S1[i+2]*f0 + _delta;
   3529                         s1 = (S0[i+3] + S2[i+3])*f1 + S1[i+3]*f0 + _delta;
   3530                         D[i+2] = castOp(s0);
   3531                         D[i+3] = castOp(s1);
   3532                     }
   3533                     #endif
   3534                     for( ; i < width; i ++ )
   3535                     {
   3536                         ST s0 = (S0[i] + S2[i])*f1 + S1[i]*f0 + _delta;
   3537                         D[i] = castOp(s0);
   3538                     }
   3539                 }
   3540             }
   3541             else
   3542             {
   3543                 if( is_m1_0_1 )
   3544                 {
   3545                     if( f1 < 0 )
   3546                         std::swap(S0, S2);
   3547                     #if CV_ENABLE_UNROLLED
   3548                     for( ; i <= width - 4; i += 4 )
   3549                     {
   3550                         ST s0 = S2[i] - S0[i] + _delta;
   3551                         ST s1 = S2[i+1] - S0[i+1] + _delta;
   3552                         D[i] = castOp(s0);
   3553                         D[i+1] = castOp(s1);
   3554 
   3555                         s0 = S2[i+2] - S0[i+2] + _delta;
   3556                         s1 = S2[i+3] - S0[i+3] + _delta;
   3557                         D[i+2] = castOp(s0);
   3558                         D[i+3] = castOp(s1);
   3559                     }
   3560                     #endif
   3561                     for( ; i < width; i ++ )
   3562                     {
   3563                         ST s0 = S2[i] - S0[i] + _delta;
   3564                         D[i] = castOp(s0);
   3565                     }
   3566                     if( f1 < 0 )
   3567                         std::swap(S0, S2);
   3568                 }
   3569                 else
   3570                 {
   3571                     #if CV_ENABLE_UNROLLED
   3572                     for( ; i <= width - 4; i += 4 )
   3573                     {
   3574                         ST s0 = (S2[i] - S0[i])*f1 + _delta;
   3575                         ST s1 = (S2[i+1] - S0[i+1])*f1 + _delta;
   3576                         D[i] = castOp(s0);
   3577                         D[i+1] = castOp(s1);
   3578 
   3579                         s0 = (S2[i+2] - S0[i+2])*f1 + _delta;
   3580                         s1 = (S2[i+3] - S0[i+3])*f1 + _delta;
   3581                         D[i+2] = castOp(s0);
   3582                         D[i+3] = castOp(s1);
   3583                     }
   3584                     #endif
   3585                     for( ; i < width; i++ )
   3586                         D[i] = castOp((S2[i] - S0[i])*f1 + _delta);
   3587                 }
   3588             }
   3589         }
   3590     }
   3591 };
   3592 
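         // Cast only narrows with saturation. FixedPtCast additionally
         // undoes the fixed-point scaling applied to integer kernels:
         // with bits = 8, DELTA = 1<<7 = 128, so (val + 128) >> 8 divides
         // by 256 with round-to-nearest, e.g. 65317 -> 255.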
   3593 template<typename ST, typename DT> struct Cast
   3594 {
   3595     typedef ST type1;
   3596     typedef DT rtype;
   3597 
   3598     DT operator()(ST val) const { return saturate_cast<DT>(val); }
   3599 };
   3600 
   3601 template<typename ST, typename DT, int bits> struct FixedPtCast
   3602 {
   3603     typedef ST type1;
   3604     typedef DT rtype;
   3605     enum { SHIFT = bits, DELTA = 1 << (bits-1) };
   3606 
   3607     DT operator()(ST val) const { return saturate_cast<DT>((val + DELTA)>>SHIFT); }
   3608 };
   3609 
   3610 template<typename ST, typename DT> struct FixedPtCastEx
   3611 {
   3612     typedef ST type1;
   3613     typedef DT rtype;
   3614 
   3615     FixedPtCastEx() : SHIFT(0), DELTA(0) {}
   3616     FixedPtCastEx(int bits) : SHIFT(bits), DELTA(bits ? 1 << (bits-1) : 0) {}
   3617     DT operator()(ST val) const { return saturate_cast<DT>((val + DELTA)>>SHIFT); }
   3618     int SHIFT, DELTA;
   3619 };
   3620 
   3621 }
   3622 
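         // Factory for the horizontal pass: short (anti)symmetric kernels
         // get a SymmRowSmallFilter, everything else a plain RowFilter,
         // with the vector functor picked by the (source depth, buffer
         // depth) pair.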
   3623 cv::Ptr<cv::BaseRowFilter> cv::getLinearRowFilter( int srcType, int bufType,
   3624                                                    InputArray _kernel, int anchor,
   3625                                                    int symmetryType )
   3626 {
   3627     Mat kernel = _kernel.getMat();
   3628     int sdepth = CV_MAT_DEPTH(srcType), ddepth = CV_MAT_DEPTH(bufType);
   3629     int cn = CV_MAT_CN(srcType);
   3630     CV_Assert( cn == CV_MAT_CN(bufType) &&
   3631         ddepth >= std::max(sdepth, CV_32S) &&
   3632         kernel.type() == ddepth );
   3633     int ksize = kernel.rows + kernel.cols - 1;
   3634 
   3635     if( (symmetryType & (KERNEL_SYMMETRICAL|KERNEL_ASYMMETRICAL)) != 0 && ksize <= 5 )
   3636     {
   3637         if( sdepth == CV_8U && ddepth == CV_32S )
   3638             return makePtr<SymmRowSmallFilter<uchar, int, SymmRowSmallVec_8u32s> >
   3639                 (kernel, anchor, symmetryType, SymmRowSmallVec_8u32s(kernel, symmetryType));
   3640         if( sdepth == CV_32F && ddepth == CV_32F )
   3641             return makePtr<SymmRowSmallFilter<float, float, SymmRowSmallVec_32f> >
   3642                 (kernel, anchor, symmetryType, SymmRowSmallVec_32f(kernel, symmetryType));
   3643     }
   3644 
   3645     if( sdepth == CV_8U && ddepth == CV_32S )
   3646         return makePtr<RowFilter<uchar, int, RowVec_8u32s> >
   3647             (kernel, anchor, RowVec_8u32s(kernel));
   3648     if( sdepth == CV_8U && ddepth == CV_32F )
   3649         return makePtr<RowFilter<uchar, float, RowNoVec> >(kernel, anchor);
   3650     if( sdepth == CV_8U && ddepth == CV_64F )
   3651         return makePtr<RowFilter<uchar, double, RowNoVec> >(kernel, anchor);
   3652     if( sdepth == CV_16U && ddepth == CV_32F )
   3653         return makePtr<RowFilter<ushort, float, RowNoVec> >(kernel, anchor);
   3654     if( sdepth == CV_16U && ddepth == CV_64F )
   3655         return makePtr<RowFilter<ushort, double, RowNoVec> >(kernel, anchor);
   3656     if( sdepth == CV_16S && ddepth == CV_32F )
   3657         return makePtr<RowFilter<short, float, RowVec_16s32f> >
   3658                                   (kernel, anchor, RowVec_16s32f(kernel));
   3659     if( sdepth == CV_16S && ddepth == CV_64F )
   3660         return makePtr<RowFilter<short, double, RowNoVec> >(kernel, anchor);
   3661     if( sdepth == CV_32F && ddepth == CV_32F )
   3662         return makePtr<RowFilter<float, float, RowVec_32f> >
   3663             (kernel, anchor, RowVec_32f(kernel));
   3664     if( sdepth == CV_32F && ddepth == CV_64F )
   3665         return makePtr<RowFilter<float, double, RowNoVec> >(kernel, anchor);
   3666     if( sdepth == CV_64F && ddepth == CV_64F )
   3667         return makePtr<RowFilter<double, double, RowNoVec> >(kernel, anchor);
   3668 
   3669     CV_Error_( CV_StsNotImplemented,
    3670         ("Unsupported combination of source format (=%d) and buffer format (=%d)",
   3671         srcType, bufType));
   3672 
   3673     return Ptr<BaseRowFilter>();
   3674 }
   3675 
   3676 
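         // Factory for the vertical pass: (anti)symmetric kernels get the
         // Symm variants (the 3-tap SymmColumnSmallFilter where possible);
         // `bits` selects the fixed-point cast used with CV_32S buffers.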
   3677 cv::Ptr<cv::BaseColumnFilter> cv::getLinearColumnFilter( int bufType, int dstType,
   3678                                              InputArray _kernel, int anchor,
   3679                                              int symmetryType, double delta,
   3680                                              int bits )
   3681 {
   3682     Mat kernel = _kernel.getMat();
   3683     int sdepth = CV_MAT_DEPTH(bufType), ddepth = CV_MAT_DEPTH(dstType);
   3684     int cn = CV_MAT_CN(dstType);
   3685     CV_Assert( cn == CV_MAT_CN(bufType) &&
   3686         sdepth >= std::max(ddepth, CV_32S) &&
   3687         kernel.type() == sdepth );
   3688 
   3689     if( !(symmetryType & (KERNEL_SYMMETRICAL|KERNEL_ASYMMETRICAL)) )
   3690     {
   3691         if( ddepth == CV_8U && sdepth == CV_32S )
   3692             return makePtr<ColumnFilter<FixedPtCastEx<int, uchar>, ColumnNoVec> >
   3693             (kernel, anchor, delta, FixedPtCastEx<int, uchar>(bits));
   3694         if( ddepth == CV_8U && sdepth == CV_32F )
   3695             return makePtr<ColumnFilter<Cast<float, uchar>, ColumnNoVec> >(kernel, anchor, delta);
   3696         if( ddepth == CV_8U && sdepth == CV_64F )
   3697             return makePtr<ColumnFilter<Cast<double, uchar>, ColumnNoVec> >(kernel, anchor, delta);
   3698         if( ddepth == CV_16U && sdepth == CV_32F )
   3699             return makePtr<ColumnFilter<Cast<float, ushort>, ColumnNoVec> >(kernel, anchor, delta);
   3700         if( ddepth == CV_16U && sdepth == CV_64F )
   3701             return makePtr<ColumnFilter<Cast<double, ushort>, ColumnNoVec> >(kernel, anchor, delta);
   3702         if( ddepth == CV_16S && sdepth == CV_32F )
   3703             return makePtr<ColumnFilter<Cast<float, short>, ColumnNoVec> >(kernel, anchor, delta);
   3704         if( ddepth == CV_16S && sdepth == CV_64F )
   3705             return makePtr<ColumnFilter<Cast<double, short>, ColumnNoVec> >(kernel, anchor, delta);
   3706         if( ddepth == CV_32F && sdepth == CV_32F )
   3707             return makePtr<ColumnFilter<Cast<float, float>, ColumnNoVec> >(kernel, anchor, delta);
   3708         if( ddepth == CV_64F && sdepth == CV_64F )
   3709             return makePtr<ColumnFilter<Cast<double, double>, ColumnNoVec> >(kernel, anchor, delta);
   3710     }
   3711     else
   3712     {
   3713         int ksize = kernel.rows + kernel.cols - 1;
   3714         if( ksize == 3 )
   3715         {
   3716             if( ddepth == CV_8U && sdepth == CV_32S )
   3717                 return makePtr<SymmColumnSmallFilter<
   3718                     FixedPtCastEx<int, uchar>, SymmColumnVec_32s8u> >
   3719                     (kernel, anchor, delta, symmetryType, FixedPtCastEx<int, uchar>(bits),
   3720                     SymmColumnVec_32s8u(kernel, symmetryType, bits, delta));
   3721             if( ddepth == CV_16S && sdepth == CV_32S && bits == 0 )
   3722                 return makePtr<SymmColumnSmallFilter<Cast<int, short>,
   3723                     SymmColumnSmallVec_32s16s> >(kernel, anchor, delta, symmetryType,
   3724                         Cast<int, short>(), SymmColumnSmallVec_32s16s(kernel, symmetryType, bits, delta));
   3725             if( ddepth == CV_32F && sdepth == CV_32F )
   3726                 return makePtr<SymmColumnSmallFilter<
   3727                     Cast<float, float>,SymmColumnSmallVec_32f> >
   3728                     (kernel, anchor, delta, symmetryType, Cast<float, float>(),
   3729                     SymmColumnSmallVec_32f(kernel, symmetryType, 0, delta));
   3730         }
   3731         if( ddepth == CV_8U && sdepth == CV_32S )
   3732             return makePtr<SymmColumnFilter<FixedPtCastEx<int, uchar>, SymmColumnVec_32s8u> >
   3733                 (kernel, anchor, delta, symmetryType, FixedPtCastEx<int, uchar>(bits),
   3734                 SymmColumnVec_32s8u(kernel, symmetryType, bits, delta));
   3735         if( ddepth == CV_8U && sdepth == CV_32F )
   3736             return makePtr<SymmColumnFilter<Cast<float, uchar>, ColumnNoVec> >
   3737                 (kernel, anchor, delta, symmetryType);
   3738         if( ddepth == CV_8U && sdepth == CV_64F )
   3739             return makePtr<SymmColumnFilter<Cast<double, uchar>, ColumnNoVec> >
   3740                 (kernel, anchor, delta, symmetryType);
   3741         if( ddepth == CV_16U && sdepth == CV_32F )
   3742             return makePtr<SymmColumnFilter<Cast<float, ushort>, ColumnNoVec> >
   3743                 (kernel, anchor, delta, symmetryType);
   3744         if( ddepth == CV_16U && sdepth == CV_64F )
   3745             return makePtr<SymmColumnFilter<Cast<double, ushort>, ColumnNoVec> >
   3746                 (kernel, anchor, delta, symmetryType);
   3747         if( ddepth == CV_16S && sdepth == CV_32S )
   3748             return makePtr<SymmColumnFilter<Cast<int, short>, ColumnNoVec> >
   3749                 (kernel, anchor, delta, symmetryType);
   3750         if( ddepth == CV_16S && sdepth == CV_32F )
   3751             return makePtr<SymmColumnFilter<Cast<float, short>, SymmColumnVec_32f16s> >
   3752                  (kernel, anchor, delta, symmetryType, Cast<float, short>(),
   3753                   SymmColumnVec_32f16s(kernel, symmetryType, 0, delta));
   3754         if( ddepth == CV_16S && sdepth == CV_64F )
   3755             return makePtr<SymmColumnFilter<Cast<double, short>, ColumnNoVec> >
   3756                 (kernel, anchor, delta, symmetryType);
   3757         if( ddepth == CV_32F && sdepth == CV_32F )
   3758             return makePtr<SymmColumnFilter<Cast<float, float>, SymmColumnVec_32f> >
   3759                 (kernel, anchor, delta, symmetryType, Cast<float, float>(),
   3760                 SymmColumnVec_32f(kernel, symmetryType, 0, delta));
   3761         if( ddepth == CV_64F && sdepth == CV_64F )
   3762             return makePtr<SymmColumnFilter<Cast<double, double>, ColumnNoVec> >
   3763                 (kernel, anchor, delta, symmetryType);
   3764     }
   3765 
   3766     CV_Error_( CV_StsNotImplemented,
    3767         ("Unsupported combination of buffer format (=%d) and destination format (=%d)",
   3768         bufType, dstType));
   3769 
   3770     return Ptr<BaseColumnFilter>();
   3771 }
   3772 
   3773 
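         // Assembles the complete separable engine: classifies both
         // kernels, picks an integer (fixed-point) or floating-point
         // intermediate buffer depth, then wires the row and column
         // filters into a FilterEngine.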
   3774 cv::Ptr<cv::FilterEngine> cv::createSeparableLinearFilter(
   3775     int _srcType, int _dstType,
   3776     InputArray __rowKernel, InputArray __columnKernel,
   3777     Point _anchor, double _delta,
   3778     int _rowBorderType, int _columnBorderType,
   3779     const Scalar& _borderValue )
   3780 {
   3781     Mat _rowKernel = __rowKernel.getMat(), _columnKernel = __columnKernel.getMat();
   3782     _srcType = CV_MAT_TYPE(_srcType);
   3783     _dstType = CV_MAT_TYPE(_dstType);
   3784     int sdepth = CV_MAT_DEPTH(_srcType), ddepth = CV_MAT_DEPTH(_dstType);
   3785     int cn = CV_MAT_CN(_srcType);
   3786     CV_Assert( cn == CV_MAT_CN(_dstType) );
   3787     int rsize = _rowKernel.rows + _rowKernel.cols - 1;
   3788     int csize = _columnKernel.rows + _columnKernel.cols - 1;
   3789     if( _anchor.x < 0 )
   3790         _anchor.x = rsize/2;
   3791     if( _anchor.y < 0 )
   3792         _anchor.y = csize/2;
   3793     int rtype = getKernelType(_rowKernel,
   3794         _rowKernel.rows == 1 ? Point(_anchor.x, 0) : Point(0, _anchor.x));
   3795     int ctype = getKernelType(_columnKernel,
   3796         _columnKernel.rows == 1 ? Point(_anchor.y, 0) : Point(0, _anchor.y));
   3797     Mat rowKernel, columnKernel;
   3798 
   3799     int bdepth = std::max(CV_32F,std::max(sdepth, ddepth));
   3800     int bits = 0;
   3801 
   3802     if( sdepth == CV_8U &&
   3803         ((rtype == KERNEL_SMOOTH+KERNEL_SYMMETRICAL &&
   3804           ctype == KERNEL_SMOOTH+KERNEL_SYMMETRICAL &&
   3805           ddepth == CV_8U) ||
   3806          ((rtype & (KERNEL_SYMMETRICAL+KERNEL_ASYMMETRICAL)) &&
   3807           (ctype & (KERNEL_SYMMETRICAL+KERNEL_ASYMMETRICAL)) &&
   3808           (rtype & ctype & KERNEL_INTEGER) &&
   3809           ddepth == CV_16S)) )
   3810     {
   3811         bdepth = CV_32S;
   3812         bits = ddepth == CV_8U ? 8 : 0;
   3813         _rowKernel.convertTo( rowKernel, CV_32S, 1 << bits );
   3814         _columnKernel.convertTo( columnKernel, CV_32S, 1 << bits );
   3815         bits *= 2;
   3816         _delta *= (1 << bits);
   3817     }
   3818     else
   3819     {
   3820         if( _rowKernel.type() != bdepth )
   3821             _rowKernel.convertTo( rowKernel, bdepth );
   3822         else
   3823             rowKernel = _rowKernel;
   3824         if( _columnKernel.type() != bdepth )
   3825             _columnKernel.convertTo( columnKernel, bdepth );
   3826         else
   3827             columnKernel = _columnKernel;
   3828     }
   3829 
   3830     int _bufType = CV_MAKETYPE(bdepth, cn);
   3831     Ptr<BaseRowFilter> _rowFilter = getLinearRowFilter(
   3832         _srcType, _bufType, rowKernel, _anchor.x, rtype);
   3833     Ptr<BaseColumnFilter> _columnFilter = getLinearColumnFilter(
   3834         _bufType, _dstType, columnKernel, _anchor.y, ctype, _delta, bits );
   3835 
   3836     return Ptr<FilterEngine>( new FilterEngine(Ptr<BaseFilter>(), _rowFilter, _columnFilter,
   3837         _srcType, _dstType, _bufType, _rowBorderType, _columnBorderType, _borderValue ));
   3838 }
   3839 
   3840 
   3841 /****************************************************************************************\
   3842 *                               Non-separable linear filter                              *
   3843 \****************************************************************************************/
   3844 
   3845 namespace cv
   3846 {
   3847 
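// Collects the coordinates and values of the kernel's non-zero taps.
// 'coords' receives the (x, y) position of each tap and 'coeffs' its raw
// bytes, so callers can reinterpret 'coeffs' as the kernel's element type
// (see Filter2D below). An all-zero kernel is treated as a single zero tap.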
   3848 void preprocess2DKernel( const Mat& kernel, std::vector<Point>& coords, std::vector<uchar>& coeffs )
   3849 {
   3850     int i, j, k, nz = countNonZero(kernel), ktype = kernel.type();
   3851     if(nz == 0)
   3852         nz = 1;
   3853     CV_Assert( ktype == CV_8U || ktype == CV_32S || ktype == CV_32F || ktype == CV_64F );
   3854     coords.resize(nz);
   3855     coeffs.resize(nz*getElemSize(ktype));
   3856     uchar* _coeffs = &coeffs[0];
   3857 
   3858     for( i = k = 0; i < kernel.rows; i++ )
   3859     {
   3860         const uchar* krow = kernel.ptr(i);
   3861         for( j = 0; j < kernel.cols; j++ )
   3862         {
   3863             if( ktype == CV_8U )
   3864             {
   3865                 uchar val = krow[j];
   3866                 if( val == 0 )
   3867                     continue;
   3868                 coords[k] = Point(j,i);
   3869                 _coeffs[k++] = val;
   3870             }
   3871             else if( ktype == CV_32S )
   3872             {
   3873                 int val = ((const int*)krow)[j];
   3874                 if( val == 0 )
   3875                     continue;
   3876                 coords[k] = Point(j,i);
   3877                 ((int*)_coeffs)[k++] = val;
   3878             }
   3879             else if( ktype == CV_32F )
   3880             {
   3881                 float val = ((const float*)krow)[j];
   3882                 if( val == 0 )
   3883                     continue;
   3884                 coords[k] = Point(j,i);
   3885                 ((float*)_coeffs)[k++] = val;
   3886             }
   3887             else
   3888             {
   3889                 double val = ((const double*)krow)[j];
   3890                 if( val == 0 )
   3891                     continue;
   3892                 coords[k] = Point(j,i);
   3893                 ((double*)_coeffs)[k++] = val;
   3894             }
   3895         }
   3896     }
   3897 }
   3898 
   3899 
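// Generic non-separable 2D filter. For every output row it gathers one
// source pointer per non-zero kernel tap, lets 'vecOp' handle a vectorized
// prefix of the row, and computes the remainder in scalar code (with an
// optional 4x unrolled loop), adding 'delta' to each accumulated sum.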
   3900 template<typename ST, class CastOp, class VecOp> struct Filter2D : public BaseFilter
   3901 {
   3902     typedef typename CastOp::type1 KT;
   3903     typedef typename CastOp::rtype DT;
   3904 
   3905     Filter2D( const Mat& _kernel, Point _anchor,
   3906         double _delta, const CastOp& _castOp=CastOp(),
   3907         const VecOp& _vecOp=VecOp() )
   3908     {
   3909         anchor = _anchor;
   3910         ksize = _kernel.size();
   3911         delta = saturate_cast<KT>(_delta);
   3912         castOp0 = _castOp;
   3913         vecOp = _vecOp;
   3914         CV_Assert( _kernel.type() == DataType<KT>::type );
   3915         preprocess2DKernel( _kernel, coords, coeffs );
   3916         ptrs.resize( coords.size() );
   3917     }
   3918 
   3919     void operator()(const uchar** src, uchar* dst, int dststep, int count, int width, int cn)
   3920     {
   3921         KT _delta = delta;
   3922         const Point* pt = &coords[0];
   3923         const KT* kf = (const KT*)&coeffs[0];
   3924         const ST** kp = (const ST**)&ptrs[0];
   3925         int i, k, nz = (int)coords.size();
   3926         CastOp castOp = castOp0;
   3927 
   3928         width *= cn;
   3929         for( ; count > 0; count--, dst += dststep, src++ )
   3930         {
   3931             DT* D = (DT*)dst;
   3932 
   3933             for( k = 0; k < nz; k++ )
   3934                 kp[k] = (const ST*)src[pt[k].y] + pt[k].x*cn;
   3935 
   3936             i = vecOp((const uchar**)kp, dst, width);
   3937             #if CV_ENABLE_UNROLLED
   3938             for( ; i <= width - 4; i += 4 )
   3939             {
   3940                 KT s0 = _delta, s1 = _delta, s2 = _delta, s3 = _delta;
   3941 
   3942                 for( k = 0; k < nz; k++ )
   3943                 {
   3944                     const ST* sptr = kp[k] + i;
   3945                     KT f = kf[k];
   3946                     s0 += f*sptr[0];
   3947                     s1 += f*sptr[1];
   3948                     s2 += f*sptr[2];
   3949                     s3 += f*sptr[3];
   3950                 }
   3951 
   3952                 D[i] = castOp(s0); D[i+1] = castOp(s1);
   3953                 D[i+2] = castOp(s2); D[i+3] = castOp(s3);
   3954             }
   3955             #endif
   3956             for( ; i < width; i++ )
   3957             {
   3958                 KT s0 = _delta;
   3959                 for( k = 0; k < nz; k++ )
   3960                     s0 += kf[k]*kp[k][i];
   3961                 D[i] = castOp(s0);
   3962             }
   3963         }
   3964     }
   3965 
   3966     std::vector<Point> coords;
   3967     std::vector<uchar> coeffs;
   3968     std::vector<uchar*> ptrs;
   3969     KT delta;
   3970     CastOp castOp0;
   3971     VecOp vecOp;
   3972 };
   3973 
   3974 #ifdef HAVE_OPENCL
   3975 
   3976 #define DIVUP(total, grain) (((total) + (grain) - 1) / (grain))
   3977 #define ROUNDUP(sz, n)      ((sz) + (n) - 1 - (((sz) + (n) - 1) % (n)))
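// For example, DIVUP(10, 4) == 3 and ROUNDUP(10, 4) == 12.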
   3978 
// Prepares the kernel: transposes it and doubles each row (+ alignment).
// Returns the size of an aligned row.
// Sample:
//        a b c
// Input: d e f
//        g h i
// Output, where the last two zeros are alignment padding:
// a d g a d g 0 0
// b e h b e h 0 0
// c f i c f i 0 0
   3988 template <typename T>
   3989 static int _prepareKernelFilter2D(std::vector<T> & data, const Mat & kernel)
   3990 {
   3991     Mat _kernel; kernel.convertTo(_kernel, DataDepth<T>::value);
   3992     int size_y_aligned = ROUNDUP(kernel.rows * 2, 4);
   3993     data.clear(); data.resize(size_y_aligned * kernel.cols, 0);
   3994     for (int x = 0; x < kernel.cols; x++)
   3995     {
   3996         for (int y = 0; y < kernel.rows; y++)
   3997         {
   3998             data[x * size_y_aligned + y] = _kernel.at<T>(y, x);
   3999             data[x * size_y_aligned + y + kernel.rows] = _kernel.at<T>(y, x);
   4000         }
   4001     }
   4002     return size_y_aligned;
   4003 }
   4004 
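// OpenCL implementation of filter2D. For small kernels on Intel GPUs it
// compiles the specialized "filter2DSmall" program; otherwise it uses the
// general blocked "filter2D" program, shrinking BLOCK_SIZE until the
// requested local size fits the device's work-group limit.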
   4005 static bool ocl_filter2D( InputArray _src, OutputArray _dst, int ddepth,
   4006                    InputArray _kernel, Point anchor,
   4007                    double delta, int borderType )
   4008 {
   4009     int type = _src.type(), sdepth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
   4010     ddepth = ddepth < 0 ? sdepth : ddepth;
   4011     int dtype = CV_MAKE_TYPE(ddepth, cn), wdepth = std::max(std::max(sdepth, ddepth), CV_32F),
   4012             wtype = CV_MAKE_TYPE(wdepth, cn);
   4013     if (cn > 4)
   4014         return false;
   4015 
   4016     Size ksize = _kernel.size();
   4017     if (anchor.x < 0)
   4018         anchor.x = ksize.width / 2;
   4019     if (anchor.y < 0)
   4020         anchor.y = ksize.height / 2;
   4021 
   4022     bool isolated = (borderType & BORDER_ISOLATED) != 0;
   4023     borderType &= ~BORDER_ISOLATED;
   4024     const cv::ocl::Device &device = cv::ocl::Device::getDefault();
   4025     bool doubleSupport = device.doubleFPConfig() > 0;
   4026     if (wdepth == CV_64F && !doubleSupport)
   4027         return false;
   4028 
   4029     const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT",
   4030                                        "BORDER_WRAP", "BORDER_REFLECT_101" };
   4031 
   4032     cv::Mat kernelMat = _kernel.getMat();
   4033     cv::Size sz = _src.size(), wholeSize;
    size_t globalsize[2] = { (size_t)sz.width, (size_t)sz.height };
   4035     size_t localsize_general[2] = {0, 1};
   4036     size_t* localsize = NULL;
   4037 
   4038     ocl::Kernel k;
   4039     UMat src = _src.getUMat();
   4040     if (!isolated)
   4041     {
   4042         Point ofs;
   4043         src.locateROI(wholeSize, ofs);
   4044     }
   4045 
   4046     size_t tryWorkItems = device.maxWorkGroupSize();
   4047     if (device.isIntel() && 128 < tryWorkItems)
   4048         tryWorkItems = 128;
   4049     char cvt[2][40];
   4050 
    // For smaller filter kernels, there is a specialized OpenCL kernel that
    // is more efficient than the general one.
    UMat kernelDataUMat;
   4054     if (device.isIntel() && (device.type() & ocl::Device::TYPE_GPU) &&
   4055         ((ksize.width < 5 && ksize.height < 5) ||
   4056         (ksize.width == 5 && ksize.height == 5 && cn == 1)))
   4057     {
   4058         kernelMat = kernelMat.reshape(0, 1);
   4059         String kerStr = ocl::kernelToStr(kernelMat, CV_32F);
   4060         int h = isolated ? sz.height : wholeSize.height;
   4061         int w = isolated ? sz.width : wholeSize.width;
   4062 
   4063         if (w < ksize.width || h < ksize.height)
   4064             return false;
   4065 
   4066         // Figure out what vector size to use for loading the pixels.
   4067         int pxLoadNumPixels = cn != 1 || sz.width % 4 ? 1 : 4;
   4068         int pxLoadVecSize = cn * pxLoadNumPixels;
   4069 
   4070         // Figure out how many pixels per work item to compute in X and Y
   4071         // directions.  Too many and we run out of registers.
   4072         int pxPerWorkItemX = 1;
   4073         int pxPerWorkItemY = 1;
   4074         if (cn <= 2 && ksize.width <= 4 && ksize.height <= 4)
   4075         {
            // pick the largest of {8, 4, 2, 1} that divides the width
            // (and of {2, 1} for the height)
            pxPerWorkItemX = sz.width % 8 ? sz.width % 4 ? sz.width % 2 ? 1 : 2 : 4 : 8;
            pxPerWorkItemY = sz.height % 2 ? 1 : 2;
   4078         }
   4079         else if (cn < 4 || (ksize.width <= 4 && ksize.height <= 4))
   4080         {
   4081             pxPerWorkItemX = sz.width % 2 ? 1 : 2;
   4082             pxPerWorkItemY = sz.height % 2 ? 1 : 2;
   4083         }
   4084         globalsize[0] = sz.width / pxPerWorkItemX;
   4085         globalsize[1] = sz.height / pxPerWorkItemY;
   4086 
   4087         // Need some padding in the private array for pixels
   4088         int privDataWidth = ROUNDUP(pxPerWorkItemX + ksize.width - 1, pxLoadNumPixels);
   4089 
   4090         // Make the global size a nice round number so the runtime can pick
   4091         // from reasonable choices for the workgroup size
   4092         const int wgRound = 256;
   4093         globalsize[0] = ROUNDUP(globalsize[0], wgRound);
   4094 
   4095         char build_options[1024];
   4096         sprintf(build_options, "-D cn=%d "
   4097                 "-D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d "
   4098                 "-D PX_LOAD_VEC_SIZE=%d -D PX_LOAD_NUM_PX=%d "
   4099                 "-D PX_PER_WI_X=%d -D PX_PER_WI_Y=%d -D PRIV_DATA_WIDTH=%d -D %s -D %s "
   4100                 "-D PX_LOAD_X_ITERATIONS=%d -D PX_LOAD_Y_ITERATIONS=%d "
   4101                 "-D srcT=%s -D srcT1=%s -D dstT=%s -D dstT1=%s -D WT=%s -D WT1=%s "
   4102                 "-D convertToWT=%s -D convertToDstT=%s %s",
   4103                 cn, anchor.x, anchor.y, ksize.width, ksize.height,
   4104                 pxLoadVecSize, pxLoadNumPixels,
   4105                 pxPerWorkItemX, pxPerWorkItemY, privDataWidth, borderMap[borderType],
   4106                 isolated ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED",
   4107                 privDataWidth / pxLoadNumPixels, pxPerWorkItemY + ksize.height - 1,
   4108                 ocl::typeToStr(type), ocl::typeToStr(sdepth), ocl::typeToStr(dtype),
   4109                 ocl::typeToStr(ddepth), ocl::typeToStr(wtype), ocl::typeToStr(wdepth),
   4110                 ocl::convertTypeStr(sdepth, wdepth, cn, cvt[0]),
   4111                 ocl::convertTypeStr(wdepth, ddepth, cn, cvt[1]), kerStr.c_str());
   4112 
   4113         if (!k.create("filter2DSmall", cv::ocl::imgproc::filter2DSmall_oclsrc, build_options))
   4114             return false;
   4115     }
   4116     else
   4117     {
   4118         localsize = localsize_general;
   4119         std::vector<float> kernelMatDataFloat;
   4120         int kernel_size_y2_aligned = _prepareKernelFilter2D<float>(kernelMatDataFloat, kernelMat);
   4121         String kerStr = ocl::kernelToStr(kernelMatDataFloat, CV_32F);
   4122 
   4123         for ( ; ; )
   4124         {
   4125             size_t BLOCK_SIZE = tryWorkItems;
   4126             while (BLOCK_SIZE > 32 && BLOCK_SIZE >= (size_t)ksize.width * 2 && BLOCK_SIZE > (size_t)sz.width * 2)
   4127                 BLOCK_SIZE /= 2;
   4128 
   4129             if ((size_t)ksize.width > BLOCK_SIZE)
   4130                 return false;
   4131 
   4132             int requiredTop = anchor.y;
            int requiredLeft = (int)BLOCK_SIZE; // deliberately BLOCK_SIZE rather than anchor.x
            int requiredBottom = ksize.height - 1 - anchor.y;
            int requiredRight = (int)BLOCK_SIZE; // deliberately BLOCK_SIZE rather than ksize.width - 1 - anchor.x
   4136             int h = isolated ? sz.height : wholeSize.height;
   4137             int w = isolated ? sz.width : wholeSize.width;
   4138             bool extra_extrapolation = h < requiredTop || h < requiredBottom || w < requiredLeft || w < requiredRight;
   4139 
   4140             if ((w < ksize.width) || (h < ksize.height))
   4141                 return false;
   4142 
   4143             String opts = format("-D LOCAL_SIZE=%d -D cn=%d "
   4144                                  "-D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d "
   4145                                  "-D KERNEL_SIZE_Y2_ALIGNED=%d -D %s -D %s -D %s%s%s "
   4146                                  "-D srcT=%s -D srcT1=%s -D dstT=%s -D dstT1=%s -D WT=%s -D WT1=%s "
   4147                                  "-D convertToWT=%s -D convertToDstT=%s",
   4148                                  (int)BLOCK_SIZE, cn, anchor.x, anchor.y,
   4149                                  ksize.width, ksize.height, kernel_size_y2_aligned, borderMap[borderType],
   4150                                  extra_extrapolation ? "EXTRA_EXTRAPOLATION" : "NO_EXTRA_EXTRAPOLATION",
   4151                                  isolated ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED",
   4152                                  doubleSupport ? " -D DOUBLE_SUPPORT" : "", kerStr.c_str(),
   4153                                  ocl::typeToStr(type), ocl::typeToStr(sdepth), ocl::typeToStr(dtype),
   4154                                  ocl::typeToStr(ddepth), ocl::typeToStr(wtype), ocl::typeToStr(wdepth),
   4155                                  ocl::convertTypeStr(sdepth, wdepth, cn, cvt[0]),
   4156                                  ocl::convertTypeStr(wdepth, ddepth, cn, cvt[1]));
   4157 
   4158             localsize[0] = BLOCK_SIZE;
   4159             globalsize[0] = DIVUP(sz.width, BLOCK_SIZE - (ksize.width - 1)) * BLOCK_SIZE;
   4160             globalsize[1] = sz.height;
   4161 
   4162             if (!k.create("filter2D", cv::ocl::imgproc::filter2D_oclsrc, opts))
   4163                 return false;
   4164 
   4165             size_t kernelWorkGroupSize = k.workGroupSize();
   4166             if (localsize[0] <= kernelWorkGroupSize)
   4167                 break;
   4168             if (BLOCK_SIZE < kernelWorkGroupSize)
   4169                 return false;
   4170             tryWorkItems = kernelWorkGroupSize;
   4171         }
   4172     }
   4173 
   4174     _dst.create(sz, dtype);
   4175     UMat dst = _dst.getUMat();
   4176 
   4177     int srcOffsetX = (int)((src.offset % src.step) / src.elemSize());
   4178     int srcOffsetY = (int)(src.offset / src.step);
   4179     int srcEndX = (isolated ? (srcOffsetX + sz.width) : wholeSize.width);
   4180     int srcEndY = (isolated ? (srcOffsetY + sz.height) : wholeSize.height);
   4181 
   4182     k.args(ocl::KernelArg::PtrReadOnly(src), (int)src.step, srcOffsetX, srcOffsetY,
   4183            srcEndX, srcEndY, ocl::KernelArg::WriteOnly(dst), (float)delta);
   4184 
   4185     return k.run(2, globalsize, localsize, false);
   4186 }
   4187 
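// Fixed-point scale for the integer separable-filter path: kernels are
// pre-multiplied by (1 << shift_bits) per pass, and the OpenCL programs
// shift the accumulated result back by 2*shift_bits (passed as SHIFT_BITS).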
   4188 const int shift_bits = 8;
   4189 
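// First pass of the two-pass separable OpenCL filter: convolves the source
// rows with kernelX into 'buf', which is taller than the source by the
// vertical kernel radius on each side. 'fast8uc1' selects a vectorized
// CV_8UC1 variant that processes four pixels per work-item.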
   4190 static bool ocl_sepRowFilter2D(const UMat & src, UMat & buf, const Mat & kernelX, int anchor,
   4191                                int borderType, int ddepth, bool fast8uc1, bool int_arithm)
   4192 {
   4193     int type = src.type(), cn = CV_MAT_CN(type), sdepth = CV_MAT_DEPTH(type);
   4194     bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
   4195     Size bufSize = buf.size();
   4196     int buf_type = buf.type(), bdepth = CV_MAT_DEPTH(buf_type);
   4197 
   4198     if (!doubleSupport && (sdepth == CV_64F || ddepth == CV_64F))
   4199         return false;
   4200 
   4201 #ifdef ANDROID
   4202     size_t localsize[2] = {16, 10};
   4203 #else
   4204     size_t localsize[2] = {16, 16};
   4205 #endif
   4206 
   4207     size_t globalsize[2] = {DIVUP(bufSize.width, localsize[0]) * localsize[0], DIVUP(bufSize.height, localsize[1]) * localsize[1]};
   4208     if (fast8uc1)
   4209         globalsize[0] = DIVUP((bufSize.width + 3) >> 2, localsize[0]) * localsize[0];
   4210 
   4211     int radiusX = anchor, radiusY = (buf.rows - src.rows) >> 1;
   4212 
   4213     bool isolated = (borderType & BORDER_ISOLATED) != 0;
   4214     const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", "BORDER_WRAP", "BORDER_REFLECT_101" },
   4215         * const btype = borderMap[borderType & ~BORDER_ISOLATED];
   4216 
   4217     bool extra_extrapolation = src.rows < (int)((-radiusY + globalsize[1]) >> 1) + 1;
   4218     extra_extrapolation |= src.rows < radiusY;
   4219     extra_extrapolation |= src.cols < (int)((-radiusX + globalsize[0] + 8 * localsize[0] + 3) >> 1) + 1;
   4220     extra_extrapolation |= src.cols < radiusX;
   4221 
   4222     char cvt[40];
   4223     cv::String build_options = cv::format("-D RADIUSX=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D %s -D %s -D %s"
   4224                                           " -D srcT=%s -D dstT=%s -D convertToDstT=%s -D srcT1=%s -D dstT1=%s%s%s",
   4225                                           radiusX, (int)localsize[0], (int)localsize[1], cn, btype,
   4226                                           extra_extrapolation ? "EXTRA_EXTRAPOLATION" : "NO_EXTRA_EXTRAPOLATION",
   4227                                           isolated ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED",
   4228                                           ocl::typeToStr(type), ocl::typeToStr(buf_type),
   4229                                           ocl::convertTypeStr(sdepth, bdepth, cn, cvt),
   4230                                           ocl::typeToStr(sdepth), ocl::typeToStr(bdepth),
   4231                                           doubleSupport ? " -D DOUBLE_SUPPORT" : "",
   4232                                           int_arithm ? " -D INTEGER_ARITHMETIC" : "");
   4233     build_options += ocl::kernelToStr(kernelX, bdepth);
   4234 
   4235     Size srcWholeSize; Point srcOffset;
   4236     src.locateROI(srcWholeSize, srcOffset);
   4237 
   4238     String kernelName("row_filter");
   4239     if (fast8uc1)
   4240         kernelName += "_C1_D0";
   4241 
   4242     ocl::Kernel k(kernelName.c_str(), cv::ocl::imgproc::filterSepRow_oclsrc,
   4243                   build_options);
   4244     if (k.empty())
   4245         return false;
   4246 
   4247     if (fast8uc1)
   4248         k.args(ocl::KernelArg::PtrReadOnly(src), (int)(src.step / src.elemSize()), srcOffset.x,
   4249                srcOffset.y, src.cols, src.rows, srcWholeSize.width, srcWholeSize.height,
   4250                ocl::KernelArg::PtrWriteOnly(buf), (int)(buf.step / buf.elemSize()),
   4251                buf.cols, buf.rows, radiusY);
   4252     else
   4253         k.args(ocl::KernelArg::PtrReadOnly(src), (int)src.step, srcOffset.x,
   4254                srcOffset.y, src.cols, src.rows, srcWholeSize.width, srcWholeSize.height,
   4255                ocl::KernelArg::PtrWriteOnly(buf), (int)buf.step, buf.cols, buf.rows, radiusY);
   4256 
   4257     return k.run(2, globalsize, localsize, false);
   4258 }
   4259 
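// Second pass of the two-pass separable OpenCL filter: convolves the
// intermediate buffer along the columns with kernelY, applies 'delta' and
// writes the final destination.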
   4260 static bool ocl_sepColFilter2D(const UMat & buf, UMat & dst, const Mat & kernelY, double delta, int anchor, bool int_arithm)
   4261 {
   4262     bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
   4263     if (dst.depth() == CV_64F && !doubleSupport)
   4264         return false;
   4265 
   4266 #ifdef ANDROID
   4267     size_t localsize[2] = { 16, 10 };
   4268 #else
   4269     size_t localsize[2] = { 16, 16 };
   4270 #endif
   4271     size_t globalsize[2] = { 0, 0 };
   4272 
   4273     int dtype = dst.type(), cn = CV_MAT_CN(dtype), ddepth = CV_MAT_DEPTH(dtype);
   4274     Size sz = dst.size();
   4275     int buf_type = buf.type(), bdepth = CV_MAT_DEPTH(buf_type);
   4276 
   4277     globalsize[1] = DIVUP(sz.height, localsize[1]) * localsize[1];
   4278     globalsize[0] = DIVUP(sz.width, localsize[0]) * localsize[0];
   4279 
   4280     char cvt[40];
   4281     cv::String build_options = cv::format("-D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d"
   4282                                           " -D srcT=%s -D dstT=%s -D convertToDstT=%s"
   4283                                           " -D srcT1=%s -D dstT1=%s -D SHIFT_BITS=%d%s%s",
   4284                                           anchor, (int)localsize[0], (int)localsize[1], cn,
   4285                                           ocl::typeToStr(buf_type), ocl::typeToStr(dtype),
   4286                                           ocl::convertTypeStr(bdepth, ddepth, cn, cvt),
   4287                                           ocl::typeToStr(bdepth), ocl::typeToStr(ddepth),
   4288                                           2*shift_bits, doubleSupport ? " -D DOUBLE_SUPPORT" : "",
   4289                                           int_arithm ? " -D INTEGER_ARITHMETIC" : "");
   4290     build_options += ocl::kernelToStr(kernelY, bdepth);
   4291 
   4292     ocl::Kernel k("col_filter", cv::ocl::imgproc::filterSepCol_oclsrc,
   4293                   build_options);
   4294     if (k.empty())
   4295         return false;
   4296 
   4297     k.args(ocl::KernelArg::ReadOnly(buf), ocl::KernelArg::WriteOnly(dst),
   4298            static_cast<float>(delta));
   4299 
   4300     return k.run(2, globalsize, localsize, false);
   4301 }
   4302 
   4303 const int optimizedSepFilterLocalWidth  = 16;
   4304 const int optimizedSepFilterLocalHeight = 8;
   4305 
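// Fused separable filter: applies the row and column kernels in a single
// OpenCL program using optimizedSepFilterLocalWidth x
// optimizedSepFilterLocalHeight work-groups, avoiding the intermediate
// buffer of the two-pass path.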
   4306 static bool ocl_sepFilter2D_SinglePass(InputArray _src, OutputArray _dst,
   4307                                        Mat row_kernel, Mat col_kernel,
   4308                                        double delta, int borderType, int ddepth, int bdepth, bool int_arithm)
   4309 {
   4310     Size size = _src.size(), wholeSize;
   4311     Point origin;
   4312     int stype = _src.type(), sdepth = CV_MAT_DEPTH(stype), cn = CV_MAT_CN(stype),
   4313             esz = CV_ELEM_SIZE(stype), wdepth = std::max(std::max(sdepth, ddepth), bdepth),
   4314             dtype = CV_MAKE_TYPE(ddepth, cn);
   4315     size_t src_step = _src.step(), src_offset = _src.offset();
   4316     bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
   4317 
   4318     if ((src_offset % src_step) % esz != 0 || (!doubleSupport && (sdepth == CV_64F || ddepth == CV_64F)) ||
   4319             !(borderType == BORDER_CONSTANT || borderType == BORDER_REPLICATE ||
   4320               borderType == BORDER_REFLECT || borderType == BORDER_WRAP ||
   4321               borderType == BORDER_REFLECT_101))
   4322         return false;
   4323 
   4324     size_t lt2[2] = { optimizedSepFilterLocalWidth, optimizedSepFilterLocalHeight };
   4325     size_t gt2[2] = { lt2[0] * (1 + (size.width - 1) / lt2[0]), lt2[1]};
   4326 
   4327     char cvt[2][40];
   4328     const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", "BORDER_WRAP",
   4329                                        "BORDER_REFLECT_101" };
   4330 
   4331     String opts = cv::format("-D BLK_X=%d -D BLK_Y=%d -D RADIUSX=%d -D RADIUSY=%d%s%s"
   4332                              " -D srcT=%s -D convertToWT=%s -D WT=%s -D dstT=%s -D convertToDstT=%s"
   4333                              " -D %s -D srcT1=%s -D dstT1=%s -D WT1=%s -D CN=%d -D SHIFT_BITS=%d%s",
   4334                              (int)lt2[0], (int)lt2[1], row_kernel.cols / 2, col_kernel.cols / 2,
   4335                              ocl::kernelToStr(row_kernel, wdepth, "KERNEL_MATRIX_X").c_str(),
   4336                              ocl::kernelToStr(col_kernel, wdepth, "KERNEL_MATRIX_Y").c_str(),
   4337                              ocl::typeToStr(stype), ocl::convertTypeStr(sdepth, wdepth, cn, cvt[0]),
   4338                              ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)), ocl::typeToStr(dtype),
   4339                              ocl::convertTypeStr(wdepth, ddepth, cn, cvt[1]), borderMap[borderType],
   4340                              ocl::typeToStr(sdepth), ocl::typeToStr(ddepth), ocl::typeToStr(wdepth),
   4341                              cn, 2*shift_bits, int_arithm ? " -D INTEGER_ARITHMETIC" : "");
   4342 
   4343     ocl::Kernel k("sep_filter", ocl::imgproc::filterSep_singlePass_oclsrc, opts);
   4344     if (k.empty())
   4345         return false;
   4346 
   4347     UMat src = _src.getUMat();
   4348     _dst.create(size, dtype);
   4349     UMat dst = _dst.getUMat();
   4350 
   4351     int src_offset_x = static_cast<int>((src_offset % src_step) / esz);
   4352     int src_offset_y = static_cast<int>(src_offset / src_step);
   4353 
   4354     src.locateROI(wholeSize, origin);
   4355 
   4356     k.args(ocl::KernelArg::PtrReadOnly(src), (int)src_step, src_offset_x, src_offset_y,
   4357            wholeSize.height, wholeSize.width, ocl::KernelArg::WriteOnly(dst),
   4358            static_cast<float>(delta));
   4359 
   4360     return k.run(2, gt2, lt2, false);
   4361 }
   4362 
   4363 static bool ocl_sepFilter2D( InputArray _src, OutputArray _dst, int ddepth,
   4364                       InputArray _kernelX, InputArray _kernelY, Point anchor,
   4365                       double delta, int borderType )
   4366 {
   4367     const ocl::Device & d = ocl::Device::getDefault();
   4368     Size imgSize = _src.size();
   4369 
   4370     int type = _src.type(), sdepth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
   4371     if (cn > 4)
   4372         return false;
   4373 
   4374     Mat kernelX = _kernelX.getMat().reshape(1, 1);
   4375     if (kernelX.cols % 2 != 1)
   4376         return false;
   4377     Mat kernelY = _kernelY.getMat().reshape(1, 1);
   4378     if (kernelY.cols % 2 != 1)
   4379         return false;
   4380 
   4381     if (ddepth < 0)
   4382         ddepth = sdepth;
   4383 
   4384     if (anchor.x < 0)
   4385         anchor.x = kernelX.cols >> 1;
   4386     if (anchor.y < 0)
   4387         anchor.y = kernelY.cols >> 1;
   4388 
   4389     int rtype = getKernelType(kernelX,
   4390         kernelX.rows == 1 ? Point(anchor.x, 0) : Point(0, anchor.x));
   4391     int ctype = getKernelType(kernelY,
   4392         kernelY.rows == 1 ? Point(anchor.y, 0) : Point(0, anchor.y));
   4393 
   4394     int bdepth = CV_32F;
   4395     bool int_arithm = false;
   4396     if( sdepth == CV_8U && ddepth == CV_8U &&
   4397         rtype == KERNEL_SMOOTH+KERNEL_SYMMETRICAL &&
   4398         ctype == KERNEL_SMOOTH+KERNEL_SYMMETRICAL)
   4399     {
   4400         if (ocl::Device::getDefault().isIntel())
   4401         {
   4402             for (int i=0; i<kernelX.cols; i++)
   4403                 kernelX.at<float>(0, i) = (float) cvRound(kernelX.at<float>(0, i) * (1 << shift_bits));
   4404             if (kernelX.data != kernelY.data)
   4405                 for (int i=0; i<kernelX.cols; i++)
   4406                     kernelY.at<float>(0, i) = (float) cvRound(kernelY.at<float>(0, i) * (1 << shift_bits));
        }
        else
        {
   4409             bdepth = CV_32S;
   4410             kernelX.convertTo( kernelX, bdepth, 1 << shift_bits );
   4411             kernelY.convertTo( kernelY, bdepth, 1 << shift_bits );
   4412         }
   4413         int_arithm = true;
   4414     }
   4415 
   4416     CV_OCL_RUN_(kernelY.cols <= 21 && kernelX.cols <= 21 &&
   4417                 imgSize.width > optimizedSepFilterLocalWidth + anchor.x &&
   4418                 imgSize.height > optimizedSepFilterLocalHeight + anchor.y &&
   4419                 (!(borderType & BORDER_ISOLATED) || _src.offset() == 0) &&
   4420                 anchor == Point(kernelX.cols >> 1, kernelY.cols >> 1) &&
   4421                 (d.isIntel() || (d.isAMD() && !d.hostUnifiedMemory())),
   4422                 ocl_sepFilter2D_SinglePass(_src, _dst, kernelX, kernelY, delta,
   4423                                            borderType & ~BORDER_ISOLATED, ddepth, bdepth, int_arithm), true)
   4424 
   4425     UMat src = _src.getUMat();
   4426     Size srcWholeSize; Point srcOffset;
   4427     src.locateROI(srcWholeSize, srcOffset);
   4428 
   4429     bool fast8uc1 = type == CV_8UC1 && srcOffset.x % 4 == 0 &&
   4430             src.cols % 4 == 0 && src.step % 4 == 0;
   4431 
   4432     Size srcSize = src.size();
   4433     Size bufSize(srcSize.width, srcSize.height + kernelY.cols - 1);
   4434     UMat buf(bufSize, CV_MAKETYPE(bdepth, cn));
   4435     if (!ocl_sepRowFilter2D(src, buf, kernelX, anchor.x, borderType, ddepth, fast8uc1, int_arithm))
   4436         return false;
   4437 
   4438     _dst.create(srcSize, CV_MAKETYPE(ddepth, cn));
   4439     UMat dst = _dst.getUMat();
   4440 
   4441     return ocl_sepColFilter2D(buf, dst, kernelY, delta, anchor.y, int_arithm);
   4442 }
   4443 
   4444 #endif
   4445 
   4446 }
   4447 
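// Dispatches on (source depth, destination depth) to a Filter2D
// instantiation. The kernel is converted to CV_32F, or CV_64F when either
// side is 64-bit; the fixed-point 8-bit specializations are currently
// disabled (see the commented-out block below).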
   4448 cv::Ptr<cv::BaseFilter> cv::getLinearFilter(int srcType, int dstType,
   4449                                 InputArray filter_kernel, Point anchor,
   4450                                 double delta, int bits)
   4451 {
   4452     Mat _kernel = filter_kernel.getMat();
   4453     int sdepth = CV_MAT_DEPTH(srcType), ddepth = CV_MAT_DEPTH(dstType);
   4454     int cn = CV_MAT_CN(srcType), kdepth = _kernel.depth();
   4455     CV_Assert( cn == CV_MAT_CN(dstType) && ddepth >= sdepth );
   4456 
   4457     anchor = normalizeAnchor(anchor, _kernel.size());
   4458 
   4459     /*if( sdepth == CV_8U && ddepth == CV_8U && kdepth == CV_32S )
   4460         return makePtr<Filter2D<uchar, FixedPtCastEx<int, uchar>, FilterVec_8u> >
   4461             (_kernel, anchor, delta, FixedPtCastEx<int, uchar>(bits),
   4462             FilterVec_8u(_kernel, bits, delta));
   4463     if( sdepth == CV_8U && ddepth == CV_16S && kdepth == CV_32S )
   4464         return makePtr<Filter2D<uchar, FixedPtCastEx<int, short>, FilterVec_8u16s> >
   4465             (_kernel, anchor, delta, FixedPtCastEx<int, short>(bits),
   4466             FilterVec_8u16s(_kernel, bits, delta));*/
   4467 
   4468     kdepth = sdepth == CV_64F || ddepth == CV_64F ? CV_64F : CV_32F;
   4469     Mat kernel;
   4470     if( _kernel.type() == kdepth )
   4471         kernel = _kernel;
   4472     else
   4473         _kernel.convertTo(kernel, kdepth, _kernel.type() == CV_32S ? 1./(1 << bits) : 1.);
   4474 
   4475     if( sdepth == CV_8U && ddepth == CV_8U )
   4476         return makePtr<Filter2D<uchar, Cast<float, uchar>, FilterVec_8u> >
   4477             (kernel, anchor, delta, Cast<float, uchar>(), FilterVec_8u(kernel, 0, delta));
   4478     if( sdepth == CV_8U && ddepth == CV_16U )
   4479         return makePtr<Filter2D<uchar,
   4480             Cast<float, ushort>, FilterNoVec> >(kernel, anchor, delta);
   4481     if( sdepth == CV_8U && ddepth == CV_16S )
   4482         return makePtr<Filter2D<uchar, Cast<float, short>, FilterVec_8u16s> >
   4483             (kernel, anchor, delta, Cast<float, short>(), FilterVec_8u16s(kernel, 0, delta));
   4484     if( sdepth == CV_8U && ddepth == CV_32F )
   4485         return makePtr<Filter2D<uchar,
   4486             Cast<float, float>, FilterNoVec> >(kernel, anchor, delta);
   4487     if( sdepth == CV_8U && ddepth == CV_64F )
   4488         return makePtr<Filter2D<uchar,
   4489             Cast<double, double>, FilterNoVec> >(kernel, anchor, delta);
   4490 
   4491     if( sdepth == CV_16U && ddepth == CV_16U )
   4492         return makePtr<Filter2D<ushort,
   4493             Cast<float, ushort>, FilterNoVec> >(kernel, anchor, delta);
   4494     if( sdepth == CV_16U && ddepth == CV_32F )
   4495         return makePtr<Filter2D<ushort,
   4496             Cast<float, float>, FilterNoVec> >(kernel, anchor, delta);
   4497     if( sdepth == CV_16U && ddepth == CV_64F )
   4498         return makePtr<Filter2D<ushort,
   4499             Cast<double, double>, FilterNoVec> >(kernel, anchor, delta);
   4500 
   4501     if( sdepth == CV_16S && ddepth == CV_16S )
   4502         return makePtr<Filter2D<short,
   4503             Cast<float, short>, FilterNoVec> >(kernel, anchor, delta);
   4504     if( sdepth == CV_16S && ddepth == CV_32F )
   4505         return makePtr<Filter2D<short,
   4506             Cast<float, float>, FilterNoVec> >(kernel, anchor, delta);
   4507     if( sdepth == CV_16S && ddepth == CV_64F )
   4508         return makePtr<Filter2D<short,
   4509             Cast<double, double>, FilterNoVec> >(kernel, anchor, delta);
   4510 
   4511     if( sdepth == CV_32F && ddepth == CV_32F )
   4512         return makePtr<Filter2D<float, Cast<float, float>, FilterVec_32f> >
   4513             (kernel, anchor, delta, Cast<float, float>(), FilterVec_32f(kernel, 0, delta));
   4514     if( sdepth == CV_64F && ddepth == CV_64F )
   4515         return makePtr<Filter2D<double,
   4516             Cast<double, double>, FilterNoVec> >(kernel, anchor, delta);
   4517 
    CV_Error_( CV_StsNotImplemented,
        ("Unsupported combination of source format (=%d) and destination format (=%d)",
        srcType, dstType));
   4521 
   4522     return Ptr<BaseFilter>();
   4523 }
   4524 
   4525 
   4526 cv::Ptr<cv::FilterEngine> cv::createLinearFilter( int _srcType, int _dstType,
   4527                                               InputArray filter_kernel,
   4528                                               Point _anchor, double _delta,
   4529                                               int _rowBorderType, int _columnBorderType,
   4530                                               const Scalar& _borderValue )
   4531 {
   4532     Mat _kernel = filter_kernel.getMat();
   4533     _srcType = CV_MAT_TYPE(_srcType);
   4534     _dstType = CV_MAT_TYPE(_dstType);
   4535     int cn = CV_MAT_CN(_srcType);
   4536     CV_Assert( cn == CV_MAT_CN(_dstType) );
   4537 
   4538     Mat kernel = _kernel;
   4539     int bits = 0;
   4540 
   4541     /*int sdepth = CV_MAT_DEPTH(_srcType), ddepth = CV_MAT_DEPTH(_dstType);
   4542     int ktype = _kernel.depth() == CV_32S ? KERNEL_INTEGER : getKernelType(_kernel, _anchor);
   4543     if( sdepth == CV_8U && (ddepth == CV_8U || ddepth == CV_16S) &&
   4544         _kernel.rows*_kernel.cols <= (1 << 10) )
   4545     {
   4546         bits = (ktype & KERNEL_INTEGER) ? 0 : 11;
   4547         _kernel.convertTo(kernel, CV_32S, 1 << bits);
   4548     }*/
   4549 
   4550     Ptr<BaseFilter> _filter2D = getLinearFilter(_srcType, _dstType,
   4551         kernel, _anchor, _delta, bits);
   4552 
   4553     return makePtr<FilterEngine>(_filter2D, Ptr<BaseRowFilter>(),
   4554         Ptr<BaseColumnFilter>(), _srcType, _dstType, _srcType,
   4555         _rowBorderType, _columnBorderType, _borderValue );
   4556 }
   4557 
   4558 
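// Note that filter2D computes correlation rather than true convolution.
// A minimal usage sketch (illustrative only; 'img' is a hypothetical
// CV_8UC3 input):
//
//   Mat kern = (Mat_<float>(3, 3) << 0, -1,  0,
//                                   -1,  5, -1,
//                                    0, -1,  0);   // simple sharpening kernel
//   Mat out;
//   filter2D(img, out, /*ddepth=*/-1, kern);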
   4559 void cv::filter2D( InputArray _src, OutputArray _dst, int ddepth,
   4560                    InputArray _kernel, Point anchor0,
   4561                    double delta, int borderType )
   4562 {
   4563     CV_OCL_RUN(_dst.isUMat() && _src.dims() <= 2,
   4564                ocl_filter2D(_src, _dst, ddepth, _kernel, anchor0, delta, borderType))
   4565 
   4566     Mat src = _src.getMat(), kernel = _kernel.getMat();
   4567 
   4568     if( ddepth < 0 )
   4569         ddepth = src.depth();
   4570 
   4571 #if CV_SSE2
   4572     int dft_filter_size = ((src.depth() == CV_8U && (ddepth == CV_8U || ddepth == CV_16S)) ||
   4573         (src.depth() == CV_32F && ddepth == CV_32F)) && checkHardwareSupport(CV_CPU_SSE3)? 130 : 50;
   4574 #else
   4575     int dft_filter_size = 50;
   4576 #endif
   4577 
   4578     _dst.create( src.size(), CV_MAKETYPE(ddepth, src.channels()) );
   4579     Mat dst = _dst.getMat();
   4580     Point anchor = normalizeAnchor(anchor0, kernel.size());
   4581 
   4582 #if IPP_VERSION_X100 > 0 && !defined HAVE_IPP_ICV_ONLY
   4583     CV_IPP_CHECK()
   4584     {
   4585         typedef IppStatus (CV_STDCALL * ippiFilterBorder)(const void * pSrc, int srcStep, void * pDst, int dstStep, IppiSize dstRoiSize,
   4586                                                           IppiBorderType border, const void * borderValue,
   4587                                                           const IppiFilterBorderSpec* pSpec, Ipp8u* pBuffer);
   4588 
   4589         int stype = src.type(), sdepth = CV_MAT_DEPTH(stype), cn = CV_MAT_CN(stype),
   4590                 ktype = kernel.type(), kdepth = CV_MAT_DEPTH(ktype);
   4591         bool isolated = (borderType & BORDER_ISOLATED) != 0;
   4592         Point ippAnchor(kernel.cols >> 1, kernel.rows >> 1);
   4593         int borderTypeNI = borderType & ~BORDER_ISOLATED;
   4594         IppiBorderType ippBorderType = ippiGetBorderType(borderTypeNI);
   4595 
   4596         if (borderTypeNI == BORDER_CONSTANT || borderTypeNI == BORDER_REPLICATE)
   4597         {
   4598             ippiFilterBorder ippFunc =
   4599                 stype == CV_8UC1 ? (ippiFilterBorder)ippiFilterBorder_8u_C1R :
   4600                 stype == CV_8UC3 ? (ippiFilterBorder)ippiFilterBorder_8u_C3R :
   4601                 stype == CV_8UC4 ? (ippiFilterBorder)ippiFilterBorder_8u_C4R :
   4602                 stype == CV_16UC1 ? (ippiFilterBorder)ippiFilterBorder_16u_C1R :
   4603                 stype == CV_16UC3 ? (ippiFilterBorder)ippiFilterBorder_16u_C3R :
   4604                 stype == CV_16UC4 ? (ippiFilterBorder)ippiFilterBorder_16u_C4R :
   4605                 stype == CV_16SC1 ? (ippiFilterBorder)ippiFilterBorder_16s_C1R :
   4606                 stype == CV_16SC3 ? (ippiFilterBorder)ippiFilterBorder_16s_C3R :
   4607                 stype == CV_16SC4 ? (ippiFilterBorder)ippiFilterBorder_16s_C4R :
   4608                 stype == CV_32FC1 ? (ippiFilterBorder)ippiFilterBorder_32f_C1R :
   4609                 stype == CV_32FC3 ? (ippiFilterBorder)ippiFilterBorder_32f_C3R :
   4610                 stype == CV_32FC4 ? (ippiFilterBorder)ippiFilterBorder_32f_C4R : 0;
   4611 
   4612             if (sdepth == ddepth && (ktype == CV_16SC1 || ktype == CV_32FC1) &&
   4613                     ippFunc && (int)ippBorderType >= 0 && (!src.isSubmatrix() || isolated) &&
   4614                     std::fabs(delta - 0) < DBL_EPSILON && ippAnchor == anchor && dst.data != src.data)
   4615             {
   4616                 IppiSize kernelSize = { kernel.cols, kernel.rows }, dstRoiSize = { dst.cols, dst.rows };
   4617                 IppDataType dataType = ippiGetDataType(ddepth), kernelType = ippiGetDataType(kdepth);
   4618                 Ipp32s specSize = 0, bufsize = 0;
   4619                 IppStatus status = (IppStatus)-1;
   4620 
   4621                 if ((status = ippiFilterBorderGetSize(kernelSize, dstRoiSize, dataType, kernelType, cn, &specSize, &bufsize)) >= 0)
   4622                 {
   4623                     IppiFilterBorderSpec * spec = (IppiFilterBorderSpec *)ippMalloc(specSize);
   4624                     Ipp8u * buffer = ippsMalloc_8u(bufsize);
   4625                     Ipp32f borderValue[4] = { 0, 0, 0, 0 };
   4626 
   4627                     Mat reversedKernel;
   4628                     flip(kernel, reversedKernel, -1);
   4629 
   4630                     if ((kdepth == CV_32F && (status = ippiFilterBorderInit_32f((const Ipp32f *)reversedKernel.data, kernelSize,
   4631                             dataType, cn, ippRndFinancial, spec)) >= 0 ) ||
   4632                         (kdepth == CV_16S && (status = ippiFilterBorderInit_16s((const Ipp16s *)reversedKernel.data,
   4633                             kernelSize, 0, dataType, cn, ippRndFinancial, spec)) >= 0))
   4634                     {
   4635                         status = ippFunc(src.data, (int)src.step, dst.data, (int)dst.step, dstRoiSize,
   4636                                          ippBorderType, borderValue, spec, buffer);
   4637                     }
   4638 
   4639                     ippsFree(buffer);
   4640                     ippsFree(spec);
   4641                 }
   4642 
   4643                 if (status >= 0)
   4644                 {
   4645                     CV_IMPL_ADD(CV_IMPL_IPP);
   4646                     return;
   4647                 }
   4648                 setIppErrorStatus();
   4649             }
   4650         }
   4651     }
   4652 #endif
   4653 
   4654 #ifdef HAVE_TEGRA_OPTIMIZATION
   4655     if( tegra::useTegra() && tegra::filter2D(src, dst, kernel, anchor, delta, borderType) )
   4656         return;
   4657 #endif
   4658 
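    // Large kernels: go through the DFT-based crossCorr path instead of the
    // spatial-domain FilterEngine created below.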
   4659     if( kernel.cols*kernel.rows >= dft_filter_size )
   4660     {
   4661         Mat temp;
   4662         // crossCorr doesn't accept non-zero delta with multiple channels
   4663         if( src.channels() != 1 && delta != 0 )
   4664         {
            // The semantics of filter2D require that the delta be applied
            // as floating-point math.  So we need an intermediate Mat
            // with a floating-point datatype.  If the destination is
            // already floating point, we just use that.
   4669             int corrDepth = dst.depth();
   4670             if( (dst.depth() == CV_32F || dst.depth() == CV_64F) &&
   4671                 src.data != dst.data )
   4672             {
   4673                 temp = dst;
   4674             }
   4675             else
   4676             {
   4677                 corrDepth = dst.depth() == CV_64F ? CV_64F : CV_32F;
   4678                 temp.create( dst.size(), CV_MAKETYPE(corrDepth, dst.channels()) );
   4679             }
   4680             crossCorr( src, kernel, temp, src.size(),
   4681                        CV_MAKETYPE(corrDepth, src.channels()),
   4682                        anchor, 0, borderType );
   4683             add( temp, delta, temp );
   4684             if ( temp.data != dst.data )
   4685             {
   4686                 temp.convertTo( dst, dst.type() );
   4687             }
   4688         }
   4689         else
   4690         {
   4691             if( src.data != dst.data )
   4692                 temp = dst;
   4693             else
   4694                 temp.create(dst.size(), dst.type());
   4695             crossCorr( src, kernel, temp, src.size(),
   4696                        CV_MAKETYPE(ddepth, src.channels()),
   4697                        anchor, delta, borderType );
   4698             if( temp.data != dst.data )
   4699                 temp.copyTo(dst);
   4700         }
   4701         return;
   4702     }
   4703 
   4704     Ptr<FilterEngine> f = createLinearFilter(src.type(), dst.type(), kernel,
   4705                                              anchor, delta, borderType & ~BORDER_ISOLATED );
   4706     f->apply(src, dst, Rect(0,0,-1,-1), Point(), (borderType & BORDER_ISOLATED) != 0 );
   4707 }
   4708 
   4709 
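// A minimal usage sketch (illustrative only; 'img' and 'out' are
// hypothetical Mats):
//
//   Mat g = getGaussianKernel(7, 1.5, CV_32F);
//   sepFilter2D(img, out, /*ddepth=*/-1, g, g);   // Gaussian blur as two 1-D passes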
   4710 void cv::sepFilter2D( InputArray _src, OutputArray _dst, int ddepth,
   4711                       InputArray _kernelX, InputArray _kernelY, Point anchor,
   4712                       double delta, int borderType )
   4713 {
   4714     CV_OCL_RUN(_dst.isUMat() && _src.dims() <= 2,
   4715                ocl_sepFilter2D(_src, _dst, ddepth, _kernelX, _kernelY, anchor, delta, borderType))
   4716 
   4717     Mat src = _src.getMat(), kernelX = _kernelX.getMat(), kernelY = _kernelY.getMat();
   4718 
   4719     if( ddepth < 0 )
   4720         ddepth = src.depth();
   4721 
   4722     _dst.create( src.size(), CV_MAKETYPE(ddepth, src.channels()) );
   4723     Mat dst = _dst.getMat();
   4724 
   4725     Ptr<FilterEngine> f = createSeparableLinearFilter(src.type(),
   4726         dst.type(), kernelX, kernelY, anchor, delta, borderType & ~BORDER_ISOLATED );
   4727     f->apply(src, dst, Rect(0,0,-1,-1), Point(), (borderType & BORDER_ISOLATED) != 0 );
   4728 }
   4729 
   4730 
   4731 CV_IMPL void
   4732 cvFilter2D( const CvArr* srcarr, CvArr* dstarr, const CvMat* _kernel, CvPoint anchor )
   4733 {
   4734     cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr);
   4735     cv::Mat kernel = cv::cvarrToMat(_kernel);
   4736 
   4737     CV_Assert( src.size() == dst.size() && src.channels() == dst.channels() );
   4738 
   4739     cv::filter2D( src, dst, dst.depth(), kernel, anchor, 0, cv::BORDER_REPLICATE );
   4740 }
   4741 
   4742 /* End of file. */
   4743