Home | History | Annotate | Download | only in src
      1 /*M///////////////////////////////////////////////////////////////////////////////////////
      2 //
      3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
      4 //
      5 //  By downloading, copying, installing or using the software you agree to this license.
      6 //  If you do not agree to this license, do not download, install,
      7 //  copy or use the software.
      8 //
      9 //
     10 //                           License Agreement
     11 //                For Open Source Computer Vision Library
     12 //
     13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
     14 // Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
     15 // Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
     16 // Third party copyrights are property of their respective owners.
     17 //
     18 // Redistribution and use in source and binary forms, with or without modification,
     19 // are permitted provided that the following conditions are met:
     20 //
     21 //   * Redistribution's of source code must retain the above copyright notice,
     22 //     this list of conditions and the following disclaimer.
     23 //
     24 //   * Redistribution's in binary form must reproduce the above copyright notice,
     25 //     this list of conditions and the following disclaimer in the documentation
     26 //     and/or other materials provided with the distribution.
     27 //
     28 //   * The name of the copyright holders may not be used to endorse or promote products
     29 //     derived from this software without specific prior written permission.
     30 //
     31 // This software is provided by the copyright holders and contributors "as is" and
     32 // any express or implied warranties, including, but not limited to, the implied
     33 // warranties of merchantability and fitness for a particular purpose are disclaimed.
     34 // In no event shall the Intel Corporation or contributors be liable for any direct,
     35 // indirect, incidental, special, exemplary, or consequential damages
     36 // (including, but not limited to, procurement of substitute goods or services;
     37 // loss of use, data, or profits; or business interruption) however caused
     38 // and on any theory of liability, whether in contract, strict liability,
     39 // or tort (including negligence or otherwise) arising in any way out of
     40 // the use of this software, even if advised of the possibility of such damage.
     41 //
     42 //M*/
     43 
     44 #include "precomp.hpp"
     45 #include <climits>
     46 #include <limits>
     47 
     48 #include "opencl_kernels_core.hpp"
     49 
     50 namespace cv
     51 {
     52 
     53 template<typename T> static inline Scalar rawToScalar(const T& v)
     54 {
     55     Scalar s;
     56     typedef typename DataType<T>::channel_type T1;
     57     int i, n = DataType<T>::channels;
     58     for( i = 0; i < n; i++ )
     59         s.val[i] = ((T1*)&v)[i];
     60     return s;
     61 }
     62 
     63 /****************************************************************************************\
     64 *                                        sum                                             *
     65 \****************************************************************************************/
     66 
// Generic fallback for the vectorized summation kernel: processes nothing
// and returns 0 (pixels consumed), so sum_() below runs its scalar path for
// the whole row. Architecture-specific specializations follow.
template <typename T, typename ST>
struct Sum_SIMD
{
    int operator () (const T *, const uchar *, ST *, int, int) const
    {
        return 0;
    }
};
     75 
     76 #if CV_SSE2
     77 
template <>
struct Sum_SIMD<schar, int>
{
    // SSE2: sum of signed 8-bit elements into int accumulators.
    // Unmasked case with 1, 2 or 4 channels only; returns the number of
    // pixels processed, or 0 to fall back to the scalar path in sum_().
    int operator () (const schar * src0, const uchar * mask, int * dst, int len, int cn) const
    {
        if (mask || (cn != 1 && cn != 2 && cn != 4) || !USE_SSE2)
            return 0;

        int x = 0;
        __m128i v_zero = _mm_setzero_si128(), v_sum = v_zero;

        for ( ; x <= len - 16; x += 16)
        {
            __m128i v_src = _mm_loadu_si128((const __m128i *)(src0 + x));
            // sign-extend the low 8 bytes to 16-bit: unpack into the high
            // byte of each 16-bit lane, then arithmetic-shift right by 8
            __m128i v_half = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src), 8);

            // sign-extend each 16-bit half to 32-bit the same way and add
            v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_half), 16));
            v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_half), 16));

            // same for the high 8 bytes of the load
            v_half = _mm_srai_epi16(_mm_unpackhi_epi8(v_zero, v_src), 8);
            v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_half), 16));
            v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_half), 16));
        }

        // tail: 8 elements at a time
        for ( ; x <= len - 8; x += 8)
        {
            __m128i v_src = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src0 + x))), 8);

            v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16));
            v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16));
        }

        int CV_DECL_ALIGNED(16) ar[4];
        _mm_store_si128((__m128i*)ar, v_sum);

        // fold the 4 lanes into the per-channel destination sums;
        // valid because cn divides 4 (cn is 1, 2 or 4 here)
        for (int i = 0; i < 4; i += cn)
            for (int j = 0; j < cn; ++j)
                dst[j] += ar[j + i];

        return x / cn;
    }
};
    120 
template <>
struct Sum_SIMD<int, double>
{
    // SSE2: sum of 32-bit ints into double accumulators.
    // Unmasked case with 1, 2 or 4 channels only; returns the number of
    // pixels processed, or 0 to fall back to the scalar path in sum_().
    int operator () (const int * src0, const uchar * mask, double * dst, int len, int cn) const
    {
        if (mask || (cn != 1 && cn != 2 && cn != 4) || !USE_SSE2)
            return 0;

        int x = 0;
        __m128d v_zero = _mm_setzero_pd(), v_sum0 = v_zero, v_sum1 = v_zero;

        for ( ; x <= len - 4; x += 4)
        {
            __m128i v_src = _mm_loadu_si128((__m128i const *)(src0 + x));
            // convert the low int pair to doubles, then shift the high pair
            // down by 8 bytes and convert it as well
            v_sum0 = _mm_add_pd(v_sum0, _mm_cvtepi32_pd(v_src));
            v_sum1 = _mm_add_pd(v_sum1, _mm_cvtepi32_pd(_mm_srli_si128(v_src, 8)));
        }

        double CV_DECL_ALIGNED(16) ar[4];
        _mm_store_pd(ar, v_sum0);
        _mm_store_pd(ar + 2, v_sum1);

        // fold the 4 lanes into per-channel sums (cn divides 4)
        for (int i = 0; i < 4; i += cn)
            for (int j = 0; j < cn; ++j)
                dst[j] += ar[j + i];

        return x / cn;
    }
};
    150 
template <>
struct Sum_SIMD<float, double>
{
    // SSE2: sum of floats into double accumulators.
    // Unmasked case with 1, 2 or 4 channels only; returns the number of
    // pixels processed, or 0 to fall back to the scalar path in sum_().
    int operator () (const float * src0, const uchar * mask, double * dst, int len, int cn) const
    {
        if (mask || (cn != 1 && cn != 2 && cn != 4) || !USE_SSE2)
            return 0;

        int x = 0;
        __m128d v_zero = _mm_setzero_pd(), v_sum0 = v_zero, v_sum1 = v_zero;

        for ( ; x <= len - 4; x += 4)
        {
            __m128 v_src = _mm_loadu_ps(src0 + x);
            // _mm_cvtps_pd widens the low float pair; byte-shift the register
            // to bring the high pair down, then convert it too
            v_sum0 = _mm_add_pd(v_sum0, _mm_cvtps_pd(v_src));
            v_src = _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_src), 8));
            v_sum1 = _mm_add_pd(v_sum1, _mm_cvtps_pd(v_src));
        }

        double CV_DECL_ALIGNED(16) ar[4];
        _mm_store_pd(ar, v_sum0);
        _mm_store_pd(ar + 2, v_sum1);

        // fold the 4 lanes into per-channel sums (cn divides 4)
        for (int i = 0; i < 4; i += cn)
            for (int j = 0; j < cn; ++j)
                dst[j] += ar[j + i];

        return x / cn;
    }
};
    181 
    182 
    183 #elif CV_NEON
    184 
template <>
struct Sum_SIMD<uchar, int>
{
    // NEON: sum of unsigned 8-bit elements into int accumulators.
    // Unmasked case with 1, 2 or 4 channels only; returns the number of
    // pixels processed, or 0 to fall back to the scalar path in sum_().
    int operator () (const uchar * src0, const uchar * mask, int * dst, int len, int cn) const
    {
        if (mask || (cn != 1 && cn != 2 && cn != 4))
            return 0;

        int x = 0;
        uint32x4_t v_sum = vdupq_n_u32(0u);

        for ( ; x <= len - 16; x += 16)
        {
            uint8x16_t v_src = vld1q_u8(src0 + x);
            // widen u8 -> u16, then accumulate with widening u16 -> u32 adds
            uint16x8_t v_half = vmovl_u8(vget_low_u8(v_src));

            v_sum = vaddw_u16(v_sum, vget_low_u16(v_half));
            v_sum = vaddw_u16(v_sum, vget_high_u16(v_half));

            v_half = vmovl_u8(vget_high_u8(v_src));
            v_sum = vaddw_u16(v_sum, vget_low_u16(v_half));
            v_sum = vaddw_u16(v_sum, vget_high_u16(v_half));
        }

        // tail: 8 elements at a time
        for ( ; x <= len - 8; x += 8)
        {
            uint16x8_t v_src = vmovl_u8(vld1_u8(src0 + x));

            v_sum = vaddw_u16(v_sum, vget_low_u16(v_src));
            v_sum = vaddw_u16(v_sum, vget_high_u16(v_src));
        }

        unsigned int CV_DECL_ALIGNED(16) ar[4];
        vst1q_u32(ar, v_sum);

        // fold the 4 lanes into per-channel sums (cn divides 4)
        for (int i = 0; i < 4; i += cn)
            for (int j = 0; j < cn; ++j)
                dst[j] += ar[j + i];

        return x / cn;
    }
};
    227 
template <>
struct Sum_SIMD<schar, int>
{
    // NEON: sum of signed 8-bit elements into int accumulators.
    // Unmasked case with 1, 2 or 4 channels only; returns the number of
    // pixels processed, or 0 to fall back to the scalar path in sum_().
    int operator () (const schar * src0, const uchar * mask, int * dst, int len, int cn) const
    {
        if (mask || (cn != 1 && cn != 2 && cn != 4))
            return 0;

        int x = 0;
        int32x4_t v_sum = vdupq_n_s32(0);

        for ( ; x <= len - 16; x += 16)
        {
            int8x16_t v_src = vld1q_s8(src0 + x);
            // widen s8 -> s16, then accumulate with widening s16 -> s32 adds
            int16x8_t v_half = vmovl_s8(vget_low_s8(v_src));

            v_sum = vaddw_s16(v_sum, vget_low_s16(v_half));
            v_sum = vaddw_s16(v_sum, vget_high_s16(v_half));

            v_half = vmovl_s8(vget_high_s8(v_src));
            v_sum = vaddw_s16(v_sum, vget_low_s16(v_half));
            v_sum = vaddw_s16(v_sum, vget_high_s16(v_half));
        }

        // tail: 8 elements at a time
        for ( ; x <= len - 8; x += 8)
        {
            int16x8_t v_src = vmovl_s8(vld1_s8(src0 + x));

            v_sum = vaddw_s16(v_sum, vget_low_s16(v_src));
            v_sum = vaddw_s16(v_sum, vget_high_s16(v_src));
        }

        int CV_DECL_ALIGNED(16) ar[4];
        vst1q_s32(ar, v_sum);

        // fold the 4 lanes into per-channel sums (cn divides 4)
        for (int i = 0; i < 4; i += cn)
            for (int j = 0; j < cn; ++j)
                dst[j] += ar[j + i];

        return x / cn;
    }
};
    270 
template <>
struct Sum_SIMD<ushort, int>
{
    // NEON: sum of unsigned 16-bit elements into int accumulators.
    // Unmasked case with 1, 2 or 4 channels only; returns the number of
    // pixels processed, or 0 to fall back to the scalar path in sum_().
    int operator () (const ushort * src0, const uchar * mask, int * dst, int len, int cn) const
    {
        if (mask || (cn != 1 && cn != 2 && cn != 4))
            return 0;

        int x = 0;
        uint32x4_t v_sum = vdupq_n_u32(0u);

        for ( ; x <= len - 8; x += 8)
        {
            uint16x8_t v_src = vld1q_u16(src0 + x);

            // widening u16 -> u32 adds
            v_sum = vaddw_u16(v_sum, vget_low_u16(v_src));
            v_sum = vaddw_u16(v_sum, vget_high_u16(v_src));
        }

        // tail: 4 elements at a time
        for ( ; x <= len - 4; x += 4)
            v_sum = vaddw_u16(v_sum, vld1_u16(src0 + x));

        unsigned int CV_DECL_ALIGNED(16) ar[4];
        vst1q_u32(ar, v_sum);

        // fold the 4 lanes into per-channel sums (cn divides 4)
        for (int i = 0; i < 4; i += cn)
            for (int j = 0; j < cn; ++j)
                dst[j] += ar[j + i];

        return x / cn;
    }
};
    303 
    304 template <>
    305 struct Sum_SIMD<short, int>
    306 {
    307     int operator () (const short * src0, const uchar * mask, int * dst, int len, int cn) const
    308     {
    309         if (mask || (cn != 1 && cn != 2 && cn != 4))
    310             return 0;
    311 
    312         int x = 0;
    313         int32x4_t v_sum = vdupq_n_s32(0u);
    314 
    315         for ( ; x <= len - 8; x += 8)
    316         {
    317             int16x8_t v_src = vld1q_s16(src0 + x);
    318 
    319             v_sum = vaddw_s16(v_sum, vget_low_s16(v_src));
    320             v_sum = vaddw_s16(v_sum, vget_high_s16(v_src));
    321         }
    322 
    323         for ( ; x <= len - 4; x += 4)
    324             v_sum = vaddw_s16(v_sum, vld1_s16(src0 + x));
    325 
    326         int CV_DECL_ALIGNED(16) ar[4];
    327         vst1q_s32(ar, v_sum);
    328 
    329         for (int i = 0; i < 4; i += cn)
    330             for (int j = 0; j < cn; ++j)
    331                 dst[j] += ar[j + i];
    332 
    333         return x / cn;
    334     }
    335 };
    336 
    337 #endif
    338 
// Accumulates per-channel sums of src0 (len pixels, cn channels each) into
// dst[0..cn). dst holds running totals: values are added, not overwritten.
// Without a mask the return value is len; with a mask, only pixels whose
// mask byte is non-zero contribute, and the count of those pixels is
// returned.
template<typename T, typename ST>
static int sum_(const T* src0, const uchar* mask, ST* dst, int len, int cn )
{
    const T* src = src0;
    if( !mask )
    {
        // let the SIMD specialization (no-op by default) consume a prefix of
        // the row; i = number of pixels it already accumulated
        Sum_SIMD<T, ST> vop;
        int i = vop(src0, mask, dst, len, cn), k = cn % 4;
        src += i * cn;

        // the k = cn % 4 leading channels get dedicated loops ...
        if( k == 1 )
        {
            ST s0 = dst[0];

            #if CV_ENABLE_UNROLLED
            for(; i <= len - 4; i += 4, src += cn*4 )
                s0 += src[0] + src[cn] + src[cn*2] + src[cn*3];
            #endif
            for( ; i < len; i++, src += cn )
                s0 += src[0];
            dst[0] = s0;
        }
        else if( k == 2 )
        {
            ST s0 = dst[0], s1 = dst[1];
            for( ; i < len; i++, src += cn )
            {
                s0 += src[0];
                s1 += src[1];
            }
            dst[0] = s0;
            dst[1] = s1;
        }
        else if( k == 3 )
        {
            ST s0 = dst[0], s1 = dst[1], s2 = dst[2];
            for( ; i < len; i++, src += cn )
            {
                s0 += src[0];
                s1 += src[1];
                s2 += src[2];
            }
            dst[0] = s0;
            dst[1] = s1;
            dst[2] = s2;
        }

        // ... and the remaining channels are handled four at a time.
        // NOTE(review): when k was 1..3 above, i has already reached len, so
        // this loop adds nothing for cn > 4; presumably callers restrict
        // cn <= 4 (then either k == cn, or cn == 4 with k == 0) -- verify at
        // the call sites.
        for( ; k < cn; k += 4 )
        {
            src = src0 + i*cn + k;
            ST s0 = dst[k], s1 = dst[k+1], s2 = dst[k+2], s3 = dst[k+3];
            for( ; i < len; i++, src += cn )
            {
                s0 += src[0]; s1 += src[1];
                s2 += src[2]; s3 += src[3];
            }
            dst[k] = s0;
            dst[k+1] = s1;
            dst[k+2] = s2;
            dst[k+3] = s3;
        }
        return len;
    }

    // masked path: accumulate only pixels with mask[i] != 0,
    // counting them in nzm
    int i, nzm = 0;
    if( cn == 1 )
    {
        ST s = dst[0];
        for( i = 0; i < len; i++ )
            if( mask[i] )
            {
                s += src[i];
                nzm++;
            }
        dst[0] = s;
    }
    else if( cn == 3 )
    {
        ST s0 = dst[0], s1 = dst[1], s2 = dst[2];
        for( i = 0; i < len; i++, src += 3 )
            if( mask[i] )
            {
                s0 += src[0];
                s1 += src[1];
                s2 += src[2];
                nzm++;
            }
        dst[0] = s0;
        dst[1] = s1;
        dst[2] = s2;
    }
    else
    {
        // generic channel count: accumulate straight into dst
        for( i = 0; i < len; i++, src += cn )
            if( mask[i] )
            {
                int k = 0;
                #if CV_ENABLE_UNROLLED
                for( ; k <= cn - 4; k += 4 )
                {
                    ST s0, s1;
                    s0 = dst[k] + src[k];
                    s1 = dst[k+1] + src[k+1];
                    dst[k] = s0; dst[k+1] = s1;
                    s0 = dst[k+2] + src[k+2];
                    s1 = dst[k+3] + src[k+3];
                    dst[k+2] = s0; dst[k+3] = s1;
                }
                #endif
                for( ; k < cn; k++ )
                    dst[k] += src[k];
                nzm++;
            }
    }
    return nzm;
}
    455 
    456 
// Depth-specific instantiations of sum_, used through the function-pointer
// table in getSumFunc below. The accumulator type is int for 8/16-bit
// sources and double for 32-bit integer and floating-point sources.
static int sum8u( const uchar* src, const uchar* mask, int* dst, int len, int cn )
{ return sum_(src, mask, dst, len, cn); }

static int sum8s( const schar* src, const uchar* mask, int* dst, int len, int cn )
{ return sum_(src, mask, dst, len, cn); }

static int sum16u( const ushort* src, const uchar* mask, int* dst, int len, int cn )
{ return sum_(src, mask, dst, len, cn); }

static int sum16s( const short* src, const uchar* mask, int* dst, int len, int cn )
{ return sum_(src, mask, dst, len, cn); }

static int sum32s( const int* src, const uchar* mask, double* dst, int len, int cn )
{ return sum_(src, mask, dst, len, cn); }

static int sum32f( const float* src, const uchar* mask, double* dst, int len, int cn )
{ return sum_(src, mask, dst, len, cn); }

static int sum64f( const double* src, const uchar* mask, double* dst, int len, int cn )
{ return sum_(src, mask, dst, len, cn); }
    477 
    478 typedef int (*SumFunc)(const uchar*, const uchar* mask, uchar*, int, int);
    479 
// Returns the sum worker for the given depth (indexed CV_8U..CV_64F),
// or 0 for an unsupported depth. The entries are cast to the generic
// SumFunc signature; the caller must pass accumulator buffers of the
// matching real type (int for 8u/8s/16u/16s, double for 32s/32f/64f).
static SumFunc getSumFunc(int depth)
{
    static SumFunc sumTab[] =
    {
        (SumFunc)GET_OPTIMIZED(sum8u), (SumFunc)sum8s,
        (SumFunc)sum16u, (SumFunc)sum16s,
        (SumFunc)sum32s,
        (SumFunc)GET_OPTIMIZED(sum32f), (SumFunc)sum64f,
        0
    };

    return sumTab[depth];
}
    493 
    494 template<typename T>
    495 static int countNonZero_(const T* src, int len )
    496 {
    497     int i=0, nz = 0;
    498     #if CV_ENABLE_UNROLLED
    499     for(; i <= len - 4; i += 4 )
    500         nz += (src[i] != 0) + (src[i+1] != 0) + (src[i+2] != 0) + (src[i+3] != 0);
    501     #endif
    502     for( ; i < len; i++ )
    503         nz += src[i] != 0;
    504     return nz;
    505 }
    506 
// Counts the non-zero bytes in src[0..len). Both SIMD paths actually count
// the ZERO elements (easier to build with compare-equal) and subtract from
// the number of elements processed.
static int countNonZero8u( const uchar* src, int len )
{
    int i=0, nz = 0;
#if CV_SSE2
    if(USE_SSE2)//5x-6x
    {
        __m128i v_zero = _mm_setzero_si128();
        __m128i sum = _mm_setzero_si128();

        for (; i<=len-16; i+=16)
        {
            __m128i r0 = _mm_loadu_si128((const __m128i*)(src+i));
            // cmpeq yields 0xFF for each zero byte; 0 - 0xFF == 0x01, so the
            // SAD against zero accumulates the number of zero bytes into the
            // two 64-bit lanes of sum
            sum = _mm_add_epi32(sum, _mm_sad_epu8(_mm_sub_epi8(v_zero, _mm_cmpeq_epi8(r0, v_zero)), v_zero));
        }
        // non-zeros = processed - zeros (fold the two SAD lanes first)
        nz = i - _mm_cvtsi128_si32(_mm_add_epi32(sum, _mm_unpackhi_epi64(sum, sum)));
    }
#elif CV_NEON
    // Nested blocking keeps the per-byte zero counters in v_pz from
    // overflowing: at most blockSize1 = 240 elements (15 iterations of 16,
    // so each u8 lane reaches at most 15) are accumulated before widening
    // into the 32-bit lanes of v_nz.
    int len0 = len & -16, blockSize1 = (1 << 8) - 16, blockSize0 = blockSize1 << 6;
    uint32x4_t v_nz = vdupq_n_u32(0u);
    uint8x16_t v_zero = vdupq_n_u8(0), v_1 = vdupq_n_u8(1);
    const uchar * src0 = src;

    while( i < len0 )
    {
        int blockSizei = std::min(len0 - i, blockSize0), j = 0;

        while (j < blockSizei)
        {
            int blockSizej = std::min(blockSizei - j, blockSize1), k = 0;
            uint8x16_t v_pz = v_zero;

            // +1 per zero byte (vceqq gives 0xFF for zeros, masked to 1)
            for( ; k <= blockSizej - 16; k += 16 )
                v_pz = vaddq_u8(v_pz, vandq_u8(vceqq_u8(vld1q_u8(src0 + k), v_zero), v_1));

            // widen the per-byte zero counts u8 -> u16 -> u32 and accumulate
            uint16x8_t v_p1 = vmovl_u8(vget_low_u8(v_pz)), v_p2 = vmovl_u8(vget_high_u8(v_pz));
            v_nz = vaddq_u32(vaddl_u16(vget_low_u16(v_p1), vget_high_u16(v_p1)), v_nz);
            v_nz = vaddq_u32(vaddl_u16(vget_low_u16(v_p2), vget_high_u16(v_p2)), v_nz);

            src0 += blockSizej;
            j += blockSizej;
        }

        i += blockSizei;
    }

    CV_DECL_ALIGNED(16) unsigned int buf[4];
    vst1q_u32(buf, v_nz);
    // non-zeros = processed - zeros
    nz += i - saturate_cast<int>(buf[0] + buf[1] + buf[2] + buf[3]);
#endif
    // scalar tail (and the whole job when no SIMD path is compiled in)
    for( ; i < len; i++ )
        nz += src[i] != 0;
    return nz;
}
    560 
// Counts the non-zero 16-bit elements in src[0..len). The SIMD paths count
// ZERO elements and subtract from the number processed; both advance src so
// the shared scalar tail call at the end sees only the remainder.
static int countNonZero16u( const ushort* src, int len )
{
    int i = 0, nz = 0;
#if CV_SSE2
    if (USE_SSE2)
    {
        __m128i v_zero = _mm_setzero_si128 ();
        __m128i sum = _mm_setzero_si128();

        for ( ; i <= len - 8; i += 8)
        {
            __m128i r0 = _mm_loadu_si128((const __m128i*)(src + i));
            // cmpeq_epi16 yields 0xFFFF per zero element; byte-wise negation
            // gives two 0x01 bytes per zero, so SAD accumulates 2 * (#zeros)
            sum = _mm_add_epi32(sum, _mm_sad_epu8(_mm_sub_epi8(v_zero, _mm_cmpeq_epi16(r0, v_zero)), v_zero));
        }

        // non-zeros = processed - zeros (each zero was counted twice)
        nz = i - (_mm_cvtsi128_si32(_mm_add_epi32(sum, _mm_unpackhi_epi64(sum, sum))) >> 1);
        src += i;
    }
#elif CV_NEON
    // Nested blocking keeps the u16 zero counters in v_pz from overflowing:
    // at most blockSize1 = 32768 elements (4096 iterations, +1 per lane max)
    // are accumulated before widening into the 32-bit lanes of v_nz.
    int len0 = len & -8, blockSize1 = (1 << 15), blockSize0 = blockSize1 << 6;
    uint32x4_t v_nz = vdupq_n_u32(0u);
    uint16x8_t v_zero = vdupq_n_u16(0), v_1 = vdupq_n_u16(1);

    while( i < len0 )
    {
        int blockSizei = std::min(len0 - i, blockSize0), j = 0;

        while (j < blockSizei)
        {
            int blockSizej = std::min(blockSizei - j, blockSize1), k = 0;
            uint16x8_t v_pz = v_zero;

            // +1 per zero element (vceqq gives 0xFFFF for zeros, masked to 1)
            for( ; k <= blockSizej - 8; k += 8 )
                v_pz = vaddq_u16(v_pz, vandq_u16(vceqq_u16(vld1q_u16(src + k), v_zero), v_1));

            v_nz = vaddq_u32(vaddl_u16(vget_low_u16(v_pz), vget_high_u16(v_pz)), v_nz);

            src += blockSizej;
            j += blockSizej;
        }

        i += blockSizei;
    }

    CV_DECL_ALIGNED(16) unsigned int buf[4];
    vst1q_u32(buf, v_nz);
    // non-zeros = processed - zeros
    nz += i - saturate_cast<int>(buf[0] + buf[1] + buf[2] + buf[3]);
#endif
    // scalar tail over the elements the SIMD loops did not cover
    return nz + countNonZero_(src, len - i);
}
    611 
    612 static int countNonZero32s( const int* src, int len )
    613 {
    614     int i = 0, nz = 0;
    615 #if CV_SSE2
    616     if (USE_SSE2)
    617     {
    618         __m128i v_zero = _mm_setzero_si128 ();
    619         __m128i sum = _mm_setzero_si128();
    620 
    621         for ( ; i <= len - 4; i += 4)
    622         {
    623             __m128i r0 = _mm_loadu_si128((const __m128i*)(src + i));
    624             sum = _mm_add_epi32(sum, _mm_sad_epu8(_mm_sub_epi8(v_zero, _mm_cmpeq_epi32(r0, v_zero)), v_zero));
    625         }
    626 
    627         nz = i - (_mm_cvtsi128_si32(_mm_add_epi32(sum, _mm_unpackhi_epi64(sum, sum))) >> 2);
    628         src += i;
    629     }
    630 #elif CV_NEON
    631     int len0 = len & -8, blockSize1 = (1 << 15), blockSize0 = blockSize1 << 6;
    632     uint32x4_t v_nz = vdupq_n_u32(0u);
    633     int32x4_t v_zero = vdupq_n_s32(0.0f);
    634     uint16x8_t v_1 = vdupq_n_u16(1u), v_zerou = vdupq_n_u16(0u);
    635 
    636     while( i < len0 )
    637     {
    638         int blockSizei = std::min(len0 - i, blockSize0), j = 0;
    639 
    640         while (j < blockSizei)
    641         {
    642             int blockSizej = std::min(blockSizei - j, blockSize1), k = 0;
    643             uint16x8_t v_pz = v_zerou;
    644 
    645             for( ; k <= blockSizej - 8; k += 8 )
    646                 v_pz = vaddq_u16(v_pz, vandq_u16(vcombine_u16(vmovn_u32(vceqq_s32(vld1q_s32(src + k), v_zero)),
    647                                                               vmovn_u32(vceqq_s32(vld1q_s32(src + k + 4), v_zero))), v_1));
    648 
    649             v_nz = vaddq_u32(vaddl_u16(vget_low_u16(v_pz), vget_high_u16(v_pz)), v_nz);
    650 
    651             src += blockSizej;
    652             j += blockSizej;
    653         }
    654 
    655         i += blockSizei;
    656     }
    657 
    658     CV_DECL_ALIGNED(16) unsigned int buf[4];
    659     vst1q_u32(buf, v_nz);
    660     nz += i - saturate_cast<int>(buf[0] + buf[1] + buf[2] + buf[3]);
    661 #endif
    662     return nz + countNonZero_(src, len - i);
    663 }
    664 
// Counts the non-zero floats in src[0..len) using exact bit-for-bit
// comparison against 0.0f (the same "== 0" test the scalar path applies).
// The SIMD paths count ZERO elements and subtract from the number
// processed; both advance src for the shared scalar tail call at the end.
static int countNonZero32f( const float* src, int len )
{
    int i = 0, nz = 0;
#if CV_SSE2
    if (USE_SSE2)
    {
        __m128 v_zero_f = _mm_setzero_ps();
        __m128i v_zero = _mm_setzero_si128 ();
        __m128i sum = _mm_setzero_si128();

        for ( ; i <= len - 4; i += 4)
        {
            __m128 r0 = _mm_loadu_ps(src + i);
            // cmpeq_ps yields an all-ones lane per zero element; byte-wise
            // negation gives four 0x01 bytes per zero, so SAD accumulates
            // 4 * (#zeros)
            sum = _mm_add_epi32(sum, _mm_sad_epu8(_mm_sub_epi8(v_zero, _mm_castps_si128(_mm_cmpeq_ps(r0, v_zero_f))), v_zero));
        }

        // non-zeros = processed - zeros (each zero was counted four times)
        nz = i - (_mm_cvtsi128_si32(_mm_add_epi32(sum, _mm_unpackhi_epi64(sum, sum))) >> 2);
        src += i;
    }
#elif CV_NEON
    int len0 = len & -8, blockSize1 = (1 << 15), blockSize0 = blockSize1 << 6;
    uint32x4_t v_nz = vdupq_n_u32(0u);
    float32x4_t v_zero = vdupq_n_f32(0.0f);
    uint16x8_t v_1 = vdupq_n_u16(1u), v_zerou = vdupq_n_u16(0u);

    while( i < len0 )
    {
        int blockSizei = std::min(len0 - i, blockSize0), j = 0;

        while (j < blockSizei)
        {
            int blockSizej = std::min(blockSizei - j, blockSize1), k = 0;
            uint16x8_t v_pz = v_zerou;

            // narrow two 4-lane zero-masks into one u16x8 mask, then add
            // +1 per zero element
            for( ; k <= blockSizej - 8; k += 8 )
                v_pz = vaddq_u16(v_pz, vandq_u16(vcombine_u16(vmovn_u32(vceqq_f32(vld1q_f32(src + k), v_zero)),
                                                              vmovn_u32(vceqq_f32(vld1q_f32(src + k + 4), v_zero))), v_1));

            v_nz = vaddq_u32(vaddl_u16(vget_low_u16(v_pz), vget_high_u16(v_pz)), v_nz);

            src += blockSizej;
            j += blockSizej;
        }

        i += blockSizei;
    }

    CV_DECL_ALIGNED(16) unsigned int buf[4];
    vst1q_u32(buf, v_nz);
    // non-zeros = processed - zeros
    nz += i - saturate_cast<int>(buf[0] + buf[1] + buf[2] + buf[3]);
#endif
    // scalar tail over the elements the SIMD loops did not cover
    return nz + countNonZero_(src, len - i);
}
    718 
// No SIMD path for doubles: defer entirely to the scalar template.
static int countNonZero64f( const double* src, int len )
{
    return countNonZero_(src, len);
}
    723 
    724 typedef int (*CountNonZeroFunc)(const uchar*, int);
    725 
// Returns the countNonZero worker for the given depth (indexed
// CV_8U..CV_64F), or 0 for an unsupported depth. Signed and unsigned
// variants share an implementation (8s reuses the 8u routine, 16s the 16u
// one): "!= 0" gives the same answer for either interpretation of the bits.
static CountNonZeroFunc getCountNonZeroTab(int depth)
{
    static CountNonZeroFunc countNonZeroTab[] =
    {
        (CountNonZeroFunc)GET_OPTIMIZED(countNonZero8u), (CountNonZeroFunc)GET_OPTIMIZED(countNonZero8u),
        (CountNonZeroFunc)GET_OPTIMIZED(countNonZero16u), (CountNonZeroFunc)GET_OPTIMIZED(countNonZero16u),
        (CountNonZeroFunc)GET_OPTIMIZED(countNonZero32s), (CountNonZeroFunc)GET_OPTIMIZED(countNonZero32f),
        (CountNonZeroFunc)GET_OPTIMIZED(countNonZero64f), 0
    };

    return countNonZeroTab[depth];
}
    738 
// Generic fallback for the fused sum + sum-of-squares kernel: processes
// nothing and returns 0 (pixels consumed), so the caller's scalar loop does
// all the work. Specialized for SSE2 below.
template <typename T, typename ST, typename SQT>
struct SumSqr_SIMD
{
    int operator () (const T *, const uchar *, ST *, SQT *, int, int) const
    {
        return 0;
    }
};
    747 
    748 #if CV_SSE2
    749 
template <>
struct SumSqr_SIMD<uchar, int, int>
{
    // SSE2 fused sum + sum-of-squares for unsigned 8-bit data.
    // Unmasked case with 1 or 2 channels only; returns the number of pixels
    // processed, or 0 to fall back to the scalar path.
    int operator () (const uchar * src0, const uchar * mask, int * sum, int * sqsum, int len, int cn) const
    {
        if (mask || (cn != 1 && cn != 2) || !USE_SSE2)
            return 0;

        int x = 0;
        __m128i v_zero = _mm_setzero_si128(), v_sum = v_zero, v_sqsum = v_zero;

        for ( ; x <= len - 16; x += 16)
        {
            __m128i v_src = _mm_loadu_si128((const __m128i *)(src0 + x));
            // zero-extend the low 8 bytes to 16-bit lanes
            __m128i v_half = _mm_unpacklo_epi8(v_src, v_zero);

            // 32-bit squares are assembled by interleaving the low and high
            // 16-bit halves of the 16x16 products
            __m128i v_mullo = _mm_mullo_epi16(v_half, v_half);
            __m128i v_mulhi = _mm_mulhi_epi16(v_half, v_half);
            v_sum = _mm_add_epi32(v_sum, _mm_unpacklo_epi16(v_half, v_zero));
            v_sum = _mm_add_epi32(v_sum, _mm_unpackhi_epi16(v_half, v_zero));
            v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpacklo_epi16(v_mullo, v_mulhi));
            v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpackhi_epi16(v_mullo, v_mulhi));

            // same for the high 8 bytes of the load
            v_half = _mm_unpackhi_epi8(v_src, v_zero);
            v_mullo = _mm_mullo_epi16(v_half, v_half);
            v_mulhi = _mm_mulhi_epi16(v_half, v_half);
            v_sum = _mm_add_epi32(v_sum, _mm_unpacklo_epi16(v_half, v_zero));
            v_sum = _mm_add_epi32(v_sum, _mm_unpackhi_epi16(v_half, v_zero));
            v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpacklo_epi16(v_mullo, v_mulhi));
            v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpackhi_epi16(v_mullo, v_mulhi));
        }

        // tail: 8 elements at a time
        for ( ; x <= len - 8; x += 8)
        {
            __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src0 + x)), v_zero);

            __m128i v_mullo = _mm_mullo_epi16(v_src, v_src);
            __m128i v_mulhi = _mm_mulhi_epi16(v_src, v_src);
            v_sum = _mm_add_epi32(v_sum, _mm_unpacklo_epi16(v_src, v_zero));
            v_sum = _mm_add_epi32(v_sum, _mm_unpackhi_epi16(v_src, v_zero));
            v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpacklo_epi16(v_mullo, v_mulhi));
            v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpackhi_epi16(v_mullo, v_mulhi));
        }

        int CV_DECL_ALIGNED(16) ar[8];
        _mm_store_si128((__m128i*)ar, v_sum);
        _mm_store_si128((__m128i*)(ar + 4), v_sqsum);

        // fold the 4 lanes into the per-channel outputs (cn divides 4)
        for (int i = 0; i < 4; i += cn)
            for (int j = 0; j < cn; ++j)
            {
                sum[j] += ar[j + i];
                sqsum[j] += ar[4 + j + i];
            }

        return x / cn;
    }
};
    808 
// SumSqr_SIMD specialization for signed 8-bit input: accumulates per-channel
// sum and sum-of-squares into 32-bit integer accumulators using SSE2.
template <>
struct SumSqr_SIMD<schar, int, int>
{
    // Vectorizes only the unmasked cn==1 / cn==2 cases; otherwise returns 0
    // and lets the scalar code do everything. On success returns the number
    // of fully processed pixels (x / cn); the caller handles the tail.
    int operator () (const schar * src0, const uchar * mask, int * sum, int * sqsum, int len, int cn) const
    {
        if (mask || (cn != 1 && cn != 2) || !USE_SSE2)
            return 0;

        int x = 0;
        __m128i v_zero = _mm_setzero_si128(), v_sum = v_zero, v_sqsum = v_zero;

        // Main loop: 16 signed bytes per iteration.
        for ( ; x <= len - 16; x += 16)
        {
            __m128i v_src = _mm_loadu_si128((const __m128i *)(src0 + x));
            // Sign-extend bytes to 16-bit lanes: unpack them into the high
            // byte of each lane, then arithmetic-shift right by 8.
            __m128i v_half = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src), 8);

            __m128i v_mullo = _mm_mullo_epi16(v_half, v_half);
            __m128i v_mulhi = _mm_mulhi_epi16(v_half, v_half);
            // Same trick one level up: sign-extend 16-bit lanes to 32 bits.
            v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_half), 16));
            v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_half), 16));
            // Interleave low/high product halves to rebuild full 32-bit
            // squares (squares are non-negative, so no sign fixup is needed).
            v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpacklo_epi16(v_mullo, v_mulhi));
            v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpackhi_epi16(v_mullo, v_mulhi));

            // Second half of the 16-byte load, same treatment.
            v_half = _mm_srai_epi16(_mm_unpackhi_epi8(v_zero, v_src), 8);
            v_mullo = _mm_mullo_epi16(v_half, v_half);
            v_mulhi = _mm_mulhi_epi16(v_half, v_half);
            v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_half), 16));
            v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_half), 16));
            v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpacklo_epi16(v_mullo, v_mulhi));
            v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpackhi_epi16(v_mullo, v_mulhi));
        }

        // Tail step: 8 bytes at a time.
        for ( ; x <= len - 8; x += 8)
        {
            __m128i v_src = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src0 + x))), 8);

            __m128i v_mullo = _mm_mullo_epi16(v_src, v_src);
            __m128i v_mulhi = _mm_mulhi_epi16(v_src, v_src);
            v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16));
            v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16));
            v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpacklo_epi16(v_mullo, v_mulhi));
            v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpackhi_epi16(v_mullo, v_mulhi));
        }

        // Spill the four 32-bit lanes of each accumulator and fold them into
        // the per-channel outputs; for cn==2 the lanes alternate channels 0/1,
        // which the i+=cn / j<cn double loop accounts for.
        int CV_DECL_ALIGNED(16) ar[8];
        _mm_store_si128((__m128i*)ar, v_sum);
        _mm_store_si128((__m128i*)(ar + 4), v_sqsum);

        for (int i = 0; i < 4; i += cn)
            for (int j = 0; j < cn; ++j)
            {
                sum[j] += ar[j + i];
                sqsum[j] += ar[4 + j + i];
            }

        return x / cn;
    }
};
    867 
    868 #endif
    869 
// Accumulates per-channel sum and sum-of-squares over a row of pixels.
//   T   - source element type
//   ST  - sum accumulator type
//   SQT - square-sum accumulator type
// Without a mask, all len pixels contribute and len is returned.
// With a mask, only pixels where mask[i] != 0 contribute and the number of
// non-zero mask entries is returned.
// NOTE(review): the unmasked path assumes cn <= 4 (callers CV_Assert this);
// for cn > 4 with cn % 4 != 0 the trailing k-loop would not re-scan the row,
// because `i` is already at len after the k%4 pass.
template<typename T, typename ST, typename SQT>
static int sumsqr_(const T* src0, const uchar* mask, ST* sum, SQT* sqsum, int len, int cn )
{
    const T* src = src0;

    if( !mask )
    {
        // Let the SIMD specialization (if any) consume a prefix of the row;
        // it returns the number of pixels it handled.
        SumSqr_SIMD<T, ST, SQT> vop;
        int i = vop(src0, mask, sum, sqsum, len, cn), k = cn % 4;
        src += i * cn;

        // Scalar remainder, unrolled per channel count modulo 4. Accumulate
        // in locals and write back once to keep the loop bodies tight.
        if( k == 1 )
        {
            ST s0 = sum[0];
            SQT sq0 = sqsum[0];
            for( ; i < len; i++, src += cn )
            {
                T v = src[0];
                s0 += v; sq0 += (SQT)v*v;
            }
            sum[0] = s0;
            sqsum[0] = sq0;
        }
        else if( k == 2 )
        {
            ST s0 = sum[0], s1 = sum[1];
            SQT sq0 = sqsum[0], sq1 = sqsum[1];
            for( ; i < len; i++, src += cn )
            {
                T v0 = src[0], v1 = src[1];
                s0 += v0; sq0 += (SQT)v0*v0;
                s1 += v1; sq1 += (SQT)v1*v1;
            }
            sum[0] = s0; sum[1] = s1;
            sqsum[0] = sq0; sqsum[1] = sq1;
        }
        else if( k == 3 )
        {
            ST s0 = sum[0], s1 = sum[1], s2 = sum[2];
            SQT sq0 = sqsum[0], sq1 = sqsum[1], sq2 = sqsum[2];
            for( ; i < len; i++, src += cn )
            {
                T v0 = src[0], v1 = src[1], v2 = src[2];
                s0 += v0; sq0 += (SQT)v0*v0;
                s1 += v1; sq1 += (SQT)v1*v1;
                s2 += v2; sq2 += (SQT)v2*v2;
            }
            sum[0] = s0; sum[1] = s1; sum[2] = s2;
            sqsum[0] = sq0; sqsum[1] = sq1; sqsum[2] = sq2;
        }

        // Groups of four channels starting at offset k (for cn == 4 this is
        // the only pass, with i still at the SIMD prefix position).
        for( ; k < cn; k += 4 )
        {
            src = src0 + k;
            ST s0 = sum[k], s1 = sum[k+1], s2 = sum[k+2], s3 = sum[k+3];
            SQT sq0 = sqsum[k], sq1 = sqsum[k+1], sq2 = sqsum[k+2], sq3 = sqsum[k+3];
            for( ; i < len; i++, src += cn )
            {
                T v0, v1;
                v0 = src[0], v1 = src[1];
                s0 += v0; sq0 += (SQT)v0*v0;
                s1 += v1; sq1 += (SQT)v1*v1;
                v0 = src[2], v1 = src[3];
                s2 += v0; sq2 += (SQT)v0*v0;
                s3 += v1; sq3 += (SQT)v1*v1;
            }
            sum[k] = s0; sum[k+1] = s1;
            sum[k+2] = s2; sum[k+3] = s3;
            sqsum[k] = sq0; sqsum[k+1] = sq1;
            sqsum[k+2] = sq2; sqsum[k+3] = sq3;
        }
        return len;
    }

    // Masked path: nzm counts the pixels that actually contributed.
    int i, nzm = 0;

    if( cn == 1 )
    {
        ST s0 = sum[0];
        SQT sq0 = sqsum[0];
        for( i = 0; i < len; i++ )
            if( mask[i] )
            {
                T v = src[i];
                s0 += v; sq0 += (SQT)v*v;
                nzm++;
            }
        sum[0] = s0;
        sqsum[0] = sq0;
    }
    else if( cn == 3 )
    {
        ST s0 = sum[0], s1 = sum[1], s2 = sum[2];
        SQT sq0 = sqsum[0], sq1 = sqsum[1], sq2 = sqsum[2];
        for( i = 0; i < len; i++, src += 3 )
            if( mask[i] )
            {
                T v0 = src[0], v1 = src[1], v2 = src[2];
                s0 += v0; sq0 += (SQT)v0*v0;
                s1 += v1; sq1 += (SQT)v1*v1;
                s2 += v2; sq2 += (SQT)v2*v2;
                nzm++;
            }
        sum[0] = s0; sum[1] = s1; sum[2] = s2;
        sqsum[0] = sq0; sqsum[1] = sq1; sqsum[2] = sq2;
    }
    else
    {
        // Generic masked case: accumulate directly into the output arrays.
        for( i = 0; i < len; i++, src += cn )
            if( mask[i] )
            {
                for( int k = 0; k < cn; k++ )
                {
                    T v = src[k];
                    ST s = sum[k] + v;
                    SQT sq = sqsum[k] + (SQT)v*v;
                    sum[k] = s; sqsum[k] = sq;
                }
                nzm++;
            }
    }
    return nzm;
}
    993 
    994 
// Depth-specific instantiations of sumsqr_ with the accumulator types used
// by the dispatch table below: 8-bit inputs use int/int, 16-bit inputs use
// int sums with double square-sums, and 32-bit/floating inputs use
// double/double.
static int sqsum8u( const uchar* src, const uchar* mask, int* sum, int* sqsum, int len, int cn )
{ return sumsqr_(src, mask, sum, sqsum, len, cn); }

static int sqsum8s( const schar* src, const uchar* mask, int* sum, int* sqsum, int len, int cn )
{ return sumsqr_(src, mask, sum, sqsum, len, cn); }

static int sqsum16u( const ushort* src, const uchar* mask, int* sum, double* sqsum, int len, int cn )
{ return sumsqr_(src, mask, sum, sqsum, len, cn); }

static int sqsum16s( const short* src, const uchar* mask, int* sum, double* sqsum, int len, int cn )
{ return sumsqr_(src, mask, sum, sqsum, len, cn); }

static int sqsum32s( const int* src, const uchar* mask, double* sum, double* sqsum, int len, int cn )
{ return sumsqr_(src, mask, sum, sqsum, len, cn); }

static int sqsum32f( const float* src, const uchar* mask, double* sum, double* sqsum, int len, int cn )
{ return sumsqr_(src, mask, sum, sqsum, len, cn); }

static int sqsum64f( const double* src, const uchar* mask, double* sum, double* sqsum, int len, int cn )
{ return sumsqr_(src, mask, sum, sqsum, len, cn); }
   1015 
   1016 typedef int (*SumSqrFunc)(const uchar*, const uchar* mask, uchar*, uchar*, int, int);
   1017 
   1018 static SumSqrFunc getSumSqrTab(int depth)
   1019 {
   1020     static SumSqrFunc sumSqrTab[] =
   1021     {
   1022         (SumSqrFunc)GET_OPTIMIZED(sqsum8u), (SumSqrFunc)sqsum8s, (SumSqrFunc)sqsum16u, (SumSqrFunc)sqsum16s,
   1023         (SumSqrFunc)sqsum32s, (SumSqrFunc)GET_OPTIMIZED(sqsum32f), (SumSqrFunc)sqsum64f, 0
   1024     };
   1025 
   1026     return sumSqrTab[depth];
   1027 }
   1028 
   1029 #ifdef HAVE_OPENCL
   1030 
   1031 template <typename T> Scalar ocl_part_sum(Mat m)
   1032 {
   1033     CV_Assert(m.rows == 1);
   1034 
   1035     Scalar s = Scalar::all(0);
   1036     int cn = m.channels();
   1037     const T * const ptr = m.ptr<T>(0);
   1038 
   1039     for (int x = 0, w = m.cols * cn; x < w; )
   1040         for (int c = 0; c < cn; ++c, ++x)
   1041             s[c] += ptr[x];
   1042 
   1043     return s;
   1044 }
   1045 
// Operation selector for ocl_sum(): plain sum, sum of absolute values,
// or sum of squares.
enum { OCL_OP_SUM = 0, OCL_OP_SUM_ABS =  1, OCL_OP_SUM_SQR = 2 };
   1047 
// OpenCL implementation of sum / sum-abs / sum-sqr reduction.
// Writes the result into `res`; when calc2 is true a second reduction over
// _src2 is produced as well and stored into `res2` (written through a
// const_cast - NOTE(review): res2 is declared const & but is an out-param;
// callers must pass a mutable Scalar).
// Returns false when the device/kernel cannot handle the request so the
// caller can fall back to the CPU path.
static bool ocl_sum( InputArray _src, Scalar & res, int sum_op, InputArray _mask = noArray(),
                     InputArray _src2 = noArray(), bool calc2 = false, const Scalar & res2 = Scalar() )
{
    CV_Assert(sum_op == OCL_OP_SUM || sum_op == OCL_OP_SUM_ABS || sum_op == OCL_OP_SUM_SQR);

    const ocl::Device & dev = ocl::Device::getDefault();
    bool doubleSupport = dev.doubleFPConfig() > 0,
        haveMask = _mask.kind() != _InputArray::NONE,
        haveSrc2 = _src2.kind() != _InputArray::NONE;
    // kercn: vector width per work item; widened only for unmasked
    // single-channel input where element grouping is unconstrained.
    int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type),
            kercn = cn == 1 && !haveMask ? ocl::predictOptimalVectorWidth(_src, _src2) : 1,
            mcn = std::max(cn, kercn);
    CV_Assert(!haveSrc2 || _src2.type() == type);
    int convert_cn = haveSrc2 ? mcn : cn;

    // Bail out when the device lacks fp64 for CV_64F data, or cn > 4
    // (Scalar only holds four channels).
    if ( (!doubleSupport && depth == CV_64F) || cn > 4 )
        return false;

    // One partial result per compute unit; doubled when calc2 adds a second
    // band of partial sums.
    int ngroups = dev.maxComputeUnits(), dbsize = ngroups * (calc2 ? 2 : 1);
    size_t wgs = dev.maxWorkGroupSize();

    // Accumulator depth: squares need float at minimum, plain sums int.
    int ddepth = std::max(sum_op == OCL_OP_SUM_SQR ? CV_32F : CV_32S, depth),
            dtype = CV_MAKE_TYPE(ddepth, cn);
    CV_Assert(!haveMask || _mask.type() == CV_8UC1);

    // Largest power of two strictly below wgs, used by the kernel's
    // reduction tail.
    int wgs2_aligned = 1;
    while (wgs2_aligned < (int)wgs)
        wgs2_aligned <<= 1;
    wgs2_aligned >>= 1;

    static const char * const opMap[3] = { "OP_SUM", "OP_SUM_ABS", "OP_SUM_SQR" };
    char cvt[2][40];
    String opts = format("-D srcT=%s -D srcT1=%s -D dstT=%s -D dstTK=%s -D dstT1=%s -D ddepth=%d -D cn=%d"
                         " -D convertToDT=%s -D %s -D WGS=%d -D WGS2_ALIGNED=%d%s%s%s%s -D kercn=%d%s%s%s -D convertFromU=%s",
                         ocl::typeToStr(CV_MAKE_TYPE(depth, mcn)), ocl::typeToStr(depth),
                         ocl::typeToStr(dtype), ocl::typeToStr(CV_MAKE_TYPE(ddepth, mcn)),
                         ocl::typeToStr(ddepth), ddepth, cn,
                         ocl::convertTypeStr(depth, ddepth, mcn, cvt[0]),
                         opMap[sum_op], (int)wgs, wgs2_aligned,
                         doubleSupport ? " -D DOUBLE_SUPPORT" : "",
                         haveMask ? " -D HAVE_MASK" : "",
                         _src.isContinuous() ? " -D HAVE_SRC_CONT" : "",
                         haveMask && _mask.isContinuous() ? " -D HAVE_MASK_CONT" : "", kercn,
                         haveSrc2 ? " -D HAVE_SRC2" : "", calc2 ? " -D OP_CALC2" : "",
                         haveSrc2 && _src2.isContinuous() ? " -D HAVE_SRC2_CONT" : "",
                         depth <= CV_32S && ddepth == CV_32S ? ocl::convertTypeStr(CV_8U, ddepth, convert_cn, cvt[1]) : "noconvert");

    ocl::Kernel k("reduce", ocl::core::reduce_oclsrc, opts);
    if (k.empty())
        return false;

    // db holds per-group partial sums (plus a second band when calc2).
    UMat src = _src.getUMat(), src2 = _src2.getUMat(),
        db(1, dbsize, dtype), mask = _mask.getUMat();

    ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src),
            dbarg = ocl::KernelArg::PtrWriteOnly(db),
            maskarg = ocl::KernelArg::ReadOnlyNoSize(mask),
            src2arg = ocl::KernelArg::ReadOnlyNoSize(src2);

    // Argument lists differ only in the optional mask / src2 tail.
    if (haveMask)
    {
        if (haveSrc2)
            k.args(srcarg, src.cols, (int)src.total(), ngroups, dbarg, maskarg, src2arg);
        else
            k.args(srcarg, src.cols, (int)src.total(), ngroups, dbarg, maskarg);
    }
    else
    {
        if (haveSrc2)
            k.args(srcarg, src.cols, (int)src.total(), ngroups, dbarg, src2arg);
        else
            k.args(srcarg, src.cols, (int)src.total(), ngroups, dbarg);
    }

    size_t globalsize = ngroups * wgs;
    if (k.run(1, &globalsize, &wgs, false))
    {
        // Finish the reduction on the host; the reducer is chosen by the
        // accumulator depth (CV_32S / CV_32F / CV_64F).
        typedef Scalar (*part_sum)(Mat m);
        part_sum funcs[3] = { ocl_part_sum<int>, ocl_part_sum<float>, ocl_part_sum<double> },
                func = funcs[ddepth - CV_32S];

        Mat mres = db.getMat(ACCESS_READ);
        if (calc2)
            const_cast<Scalar &>(res2) = func(mres.colRange(ngroups, dbsize));

        res = func(mres.colRange(0, ngroups));
        return true;
    }
    return false;
}
   1138 
   1139 #endif
   1140 
   1141 }
   1142 
// Computes the per-channel sum of all elements of _src (up to 4 channels).
// Tries OpenCL first (for UMat inputs), then IPP (for 2D/continuous data of
// supported types), and finally the generic per-plane CPU path.
cv::Scalar cv::sum( InputArray _src )
{
#ifdef HAVE_OPENCL
    Scalar _res;
    CV_OCL_RUN_(OCL_PERFORMANCE_CHECK(_src.isUMat()) && _src.dims() <= 2,
                ocl_sum(_src, _res, OCL_OP_SUM),
                _res)
#endif

    Mat src = _src.getMat();
    int k, cn = src.channels(), depth = src.depth();

#if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
    CV_IPP_CHECK()
    {
        // IPP works on a single 2D ROI; multi-dimensional data qualifies
        // only when it is continuous and can be viewed as rows x cols.
        size_t total_size = src.total();
        int rows = src.size[0], cols = rows ? (int)(total_size/rows) : 0;
        if( src.dims == 2 || (src.isContinuous() && cols > 0 && (size_t)rows*cols == total_size) )
        {
            IppiSize sz = { cols, rows };
            int type = src.type();
            typedef IppStatus (CV_STDCALL* ippiSumFuncHint)(const void*, int, IppiSize, double *, IppHintAlgorithm);
            typedef IppStatus (CV_STDCALL* ippiSumFuncNoHint)(const void*, int, IppiSize, double *);
            // 32f variants take an algorithm hint, integer variants do not.
            ippiSumFuncHint ippFuncHint =
                type == CV_32FC1 ? (ippiSumFuncHint)ippiSum_32f_C1R :
                type == CV_32FC3 ? (ippiSumFuncHint)ippiSum_32f_C3R :
                type == CV_32FC4 ? (ippiSumFuncHint)ippiSum_32f_C4R :
                0;
            ippiSumFuncNoHint ippFuncNoHint =
                type == CV_8UC1 ? (ippiSumFuncNoHint)ippiSum_8u_C1R :
                type == CV_8UC3 ? (ippiSumFuncNoHint)ippiSum_8u_C3R :
                type == CV_8UC4 ? (ippiSumFuncNoHint)ippiSum_8u_C4R :
                type == CV_16UC1 ? (ippiSumFuncNoHint)ippiSum_16u_C1R :
                type == CV_16UC3 ? (ippiSumFuncNoHint)ippiSum_16u_C3R :
                type == CV_16UC4 ? (ippiSumFuncNoHint)ippiSum_16u_C4R :
                type == CV_16SC1 ? (ippiSumFuncNoHint)ippiSum_16s_C1R :
                type == CV_16SC3 ? (ippiSumFuncNoHint)ippiSum_16s_C3R :
                type == CV_16SC4 ? (ippiSumFuncNoHint)ippiSum_16s_C4R :
                0;
            // Make sure only zero or one version of the function pointer is valid.
            CV_Assert(!ippFuncHint || !ippFuncNoHint);
            if( ippFuncHint || ippFuncNoHint )
            {
                Ipp64f res[4];
                IppStatus ret = ippFuncHint ? ippFuncHint(src.ptr(), (int)src.step[0], sz, res, ippAlgHintAccurate) :
                                ippFuncNoHint(src.ptr(), (int)src.step[0], sz, res);
                if( ret >= 0 )
                {
                    Scalar sc;
                    for( int i = 0; i < cn; i++ )
                        sc[i] = res[i];
                    CV_IMPL_ADD(CV_IMPL_IPP);
                    return sc;
                }
                // IPP failed; clear its error and continue to the CPU path.
                setIppErrorStatus();
            }
        }
    }
#endif
    SumFunc func = getSumFunc(depth);

    CV_Assert( cn <= 4 && func != 0 );

    const Mat* arrays[] = {&src, 0};
    uchar* ptrs[1];
    NAryMatIterator it(arrays, ptrs);
    Scalar s;
    int total = (int)it.size, blockSize = total, intSumBlockSize = 0;
    int j, count = 0;
    AutoBuffer<int> _buf;
    // For depth >= CV_32S the Scalar itself is used as the accumulator;
    // smaller depths accumulate into an int buffer that is flushed in blocks.
    int* buf = (int*)&s[0];
    size_t esz = 0;
    bool blockSum = depth < CV_32S;

    if( blockSum )
    {
        // Block sizes keep the worst-case partial sum within int range:
        // (1<<23)*255 and (1<<15)*65535 both stay below INT_MAX.
        intSumBlockSize = depth <= CV_8S ? (1 << 23) : (1 << 15);
        blockSize = std::min(blockSize, intSumBlockSize);
        _buf.allocate(cn);
        buf = _buf;

        for( k = 0; k < cn; k++ )
            buf[k] = 0;
        esz = src.elemSize();
    }

    for( size_t i = 0; i < it.nplanes; i++, ++it )
    {
        for( j = 0; j < total; j += blockSize )
        {
            int bsz = std::min(total - j, blockSize);
            func( ptrs[0], 0, (uchar*)buf, bsz, cn );
            count += bsz;
            // Flush the int partial sums into the double Scalar before the
            // next block could overflow, and always at the very end.
            if( blockSum && (count + blockSize >= intSumBlockSize || (i+1 >= it.nplanes && j+bsz >= total)) )
            {
                for( k = 0; k < cn; k++ )
                {
                    s[k] += buf[k];
                    buf[k] = 0;
                }
                count = 0;
            }
            ptrs[0] += bsz*esz;
        }
    }
    return s;
}
   1249 
   1250 #ifdef HAVE_OPENCL
   1251 
   1252 namespace cv {
   1253 
   1254 static bool ocl_countNonZero( InputArray _src, int & res )
   1255 {
   1256     int type = _src.type(), depth = CV_MAT_DEPTH(type), kercn = ocl::predictOptimalVectorWidth(_src);
   1257     bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
   1258 
   1259     if (depth == CV_64F && !doubleSupport)
   1260         return false;
   1261 
   1262     int dbsize = ocl::Device::getDefault().maxComputeUnits();
   1263     size_t wgs = ocl::Device::getDefault().maxWorkGroupSize();
   1264 
   1265     int wgs2_aligned = 1;
   1266     while (wgs2_aligned < (int)wgs)
   1267         wgs2_aligned <<= 1;
   1268     wgs2_aligned >>= 1;
   1269 
   1270     ocl::Kernel k("reduce", ocl::core::reduce_oclsrc,
   1271                   format("-D srcT=%s -D srcT1=%s -D cn=1 -D OP_COUNT_NON_ZERO"
   1272                          " -D WGS=%d -D kercn=%d -D WGS2_ALIGNED=%d%s%s",
   1273                          ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)),
   1274                          ocl::typeToStr(depth), (int)wgs, kercn,
   1275                          wgs2_aligned, doubleSupport ? " -D DOUBLE_SUPPORT" : "",
   1276                          _src.isContinuous() ? " -D HAVE_SRC_CONT" : ""));
   1277     if (k.empty())
   1278         return false;
   1279 
   1280     UMat src = _src.getUMat(), db(1, dbsize, CV_32SC1);
   1281     k.args(ocl::KernelArg::ReadOnlyNoSize(src), src.cols, (int)src.total(),
   1282            dbsize, ocl::KernelArg::PtrWriteOnly(db));
   1283 
   1284     size_t globalsize = dbsize * wgs;
   1285     if (k.run(1, &globalsize, &wgs, true))
   1286         return res = saturate_cast<int>(cv::sum(db.getMat(ACCESS_READ))[0]), true;
   1287     return false;
   1288 }
   1289 
   1290 }
   1291 
   1292 #endif
   1293 
// Counts the non-zero elements of a single-channel array.
// Tries OpenCL for UMat inputs; the IPP branch below is compiled out
// (note the trailing `&& 0` in its #if), so the per-plane CPU path is
// otherwise always used.
int cv::countNonZero( InputArray _src )
{
    int type = _src.type(), cn = CV_MAT_CN(type);
    CV_Assert( cn == 1 );

#ifdef HAVE_OPENCL
    int res = -1;
    CV_OCL_RUN_(OCL_PERFORMANCE_CHECK(_src.isUMat()) && _src.dims() <= 2,
                ocl_countNonZero(_src, res),
                res)
#endif

    Mat src = _src.getMat();

#if defined HAVE_IPP && !defined HAVE_IPP_ICV_ONLY && 0
    CV_IPP_CHECK()
    {
        if (src.dims <= 2 || src.isContinuous())
        {
            IppiSize roiSize = { src.cols, src.rows };
            Ipp32s count = 0, srcstep = (Ipp32s)src.step;
            IppStatus status = (IppStatus)-1;

            // Continuous data is treated as a single row.
            if (src.isContinuous())
            {
                roiSize.width = (Ipp32s)src.total();
                roiSize.height = 1;
                srcstep = (Ipp32s)src.total() * CV_ELEM_SIZE(type);
            }

            // ippiCountInRange(0, 0) counts zeros; the complement below
            // gives the number of non-zeros.
            int depth = CV_MAT_DEPTH(type);
            if (depth == CV_8U)
                status = ippiCountInRange_8u_C1R((const Ipp8u *)src.data, srcstep, roiSize, &count, 0, 0);
            else if (depth == CV_32F)
                status = ippiCountInRange_32f_C1R((const Ipp32f *)src.data, srcstep, roiSize, &count, 0, 0);

            if (status >= 0)
            {
                CV_IMPL_ADD(CV_IMPL_IPP);
                return (Ipp32s)src.total() - count;
            }
            setIppErrorStatus();
        }
    }
#endif

    CountNonZeroFunc func = getCountNonZeroTab(src.depth());
    CV_Assert( func != 0 );

    // Generic path: count per plane and accumulate.
    const Mat* arrays[] = {&src, 0};
    uchar* ptrs[1];
    NAryMatIterator it(arrays, ptrs);
    int total = (int)it.size, nz = 0;

    for( size_t i = 0; i < it.nplanes; i++, ++it )
        nz += func( ptrs[0], total );

    return nz;
}
   1353 
// Computes the per-channel mean of _src, optionally restricted to pixels
// where the 8-bit mask is non-zero. Tries IPP first for supported 2D or
// continuous layouts, then falls back to a block-summed CPU path and divides
// by the number of contributing pixels.
cv::Scalar cv::mean( InputArray _src, InputArray _mask )
{
    Mat src = _src.getMat(), mask = _mask.getMat();
    CV_Assert( mask.empty() || mask.type() == CV_8U );

    int k, cn = src.channels(), depth = src.depth();

#if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
    CV_IPP_CHECK()
    {
        size_t total_size = src.total();
        int rows = src.size[0], cols = rows ? (int)(total_size/rows) : 0;
        if( src.dims == 2 || (src.isContinuous() && mask.isContinuous() && cols > 0 && (size_t)rows*cols == total_size) )
        {
            IppiSize sz = { cols, rows };
            int type = src.type();
            if( !mask.empty() )
            {
                // Masked single-channel variants.
                typedef IppStatus (CV_STDCALL* ippiMaskMeanFuncC1)(const void *, int, const void *, int, IppiSize, Ipp64f *);
                ippiMaskMeanFuncC1 ippFuncC1 =
                type == CV_8UC1 ? (ippiMaskMeanFuncC1)ippiMean_8u_C1MR :
                type == CV_16UC1 ? (ippiMaskMeanFuncC1)ippiMean_16u_C1MR :
                type == CV_32FC1 ? (ippiMaskMeanFuncC1)ippiMean_32f_C1MR :
                0;
                if( ippFuncC1 )
                {
                    Ipp64f res;
                    if( ippFuncC1(src.ptr(), (int)src.step[0], mask.ptr(), (int)mask.step[0], sz, &res) >= 0 )
                    {
                        CV_IMPL_ADD(CV_IMPL_IPP);
                        return Scalar(res);
                    }
                    setIppErrorStatus();
                }
                // Masked 3-channel variants process one channel per call
                // (channel index is the 1-based argument before &res).
                typedef IppStatus (CV_STDCALL* ippiMaskMeanFuncC3)(const void *, int, const void *, int, IppiSize, int, Ipp64f *);
                ippiMaskMeanFuncC3 ippFuncC3 =
                type == CV_8UC3 ? (ippiMaskMeanFuncC3)ippiMean_8u_C3CMR :
                type == CV_16UC3 ? (ippiMaskMeanFuncC3)ippiMean_16u_C3CMR :
                type == CV_32FC3 ? (ippiMaskMeanFuncC3)ippiMean_32f_C3CMR :
                0;
                if( ippFuncC3 )
                {
                    Ipp64f res1, res2, res3;
                    if( ippFuncC3(src.ptr(), (int)src.step[0], mask.ptr(), (int)mask.step[0], sz, 1, &res1) >= 0 &&
                        ippFuncC3(src.ptr(), (int)src.step[0], mask.ptr(), (int)mask.step[0], sz, 2, &res2) >= 0 &&
                        ippFuncC3(src.ptr(), (int)src.step[0], mask.ptr(), (int)mask.step[0], sz, 3, &res3) >= 0 )
                    {
                        CV_IMPL_ADD(CV_IMPL_IPP);
                        return Scalar(res1, res2, res3);
                    }
                    setIppErrorStatus();
                }
            }
            else
            {
                // Unmasked variants; 32f versions take an algorithm hint.
                typedef IppStatus (CV_STDCALL* ippiMeanFuncHint)(const void*, int, IppiSize, double *, IppHintAlgorithm);
                typedef IppStatus (CV_STDCALL* ippiMeanFuncNoHint)(const void*, int, IppiSize, double *);
                ippiMeanFuncHint ippFuncHint =
                    type == CV_32FC1 ? (ippiMeanFuncHint)ippiMean_32f_C1R :
                    type == CV_32FC3 ? (ippiMeanFuncHint)ippiMean_32f_C3R :
                    type == CV_32FC4 ? (ippiMeanFuncHint)ippiMean_32f_C4R :
                    0;
                ippiMeanFuncNoHint ippFuncNoHint =
                    type == CV_8UC1 ? (ippiMeanFuncNoHint)ippiMean_8u_C1R :
                    type == CV_8UC3 ? (ippiMeanFuncNoHint)ippiMean_8u_C3R :
                    type == CV_8UC4 ? (ippiMeanFuncNoHint)ippiMean_8u_C4R :
                    type == CV_16UC1 ? (ippiMeanFuncNoHint)ippiMean_16u_C1R :
                    type == CV_16UC3 ? (ippiMeanFuncNoHint)ippiMean_16u_C3R :
                    type == CV_16UC4 ? (ippiMeanFuncNoHint)ippiMean_16u_C4R :
                    type == CV_16SC1 ? (ippiMeanFuncNoHint)ippiMean_16s_C1R :
                    type == CV_16SC3 ? (ippiMeanFuncNoHint)ippiMean_16s_C3R :
                    type == CV_16SC4 ? (ippiMeanFuncNoHint)ippiMean_16s_C4R :
                    0;
                // Make sure only zero or one version of the function pointer is valid
                CV_Assert(!ippFuncHint || !ippFuncNoHint);
                if( ippFuncHint || ippFuncNoHint )
                {
                    Ipp64f res[4];
                    IppStatus ret = ippFuncHint ? ippFuncHint(src.ptr(), (int)src.step[0], sz, res, ippAlgHintAccurate) :
                                    ippFuncNoHint(src.ptr(), (int)src.step[0], sz, res);
                    if( ret >= 0 )
                    {
                        Scalar sc;
                        for( int i = 0; i < cn; i++ )
                            sc[i] = res[i];
                        CV_IMPL_ADD(CV_IMPL_IPP);
                        return sc;
                    }
                    setIppErrorStatus();
                }
            }
        }
    }
#endif

    SumFunc func = getSumFunc(depth);

    CV_Assert( cn <= 4 && func != 0 );

    const Mat* arrays[] = {&src, &mask, 0};
    uchar* ptrs[2];
    NAryMatIterator it(arrays, ptrs);
    Scalar s;
    int total = (int)it.size, blockSize = total, intSumBlockSize = 0;
    int j, count = 0;
    AutoBuffer<int> _buf;
    // For small depths accumulate into an int buffer flushed in blocks to
    // avoid overflow; otherwise accumulate directly in the Scalar.
    int* buf = (int*)&s[0];
    bool blockSum = depth <= CV_16S;
    // nz0 counts all contributing pixels across planes (the divisor below).
    size_t esz = 0, nz0 = 0;

    if( blockSum )
    {
        // Block sizes keep the worst-case int partial sum below INT_MAX.
        intSumBlockSize = depth <= CV_8S ? (1 << 23) : (1 << 15);
        blockSize = std::min(blockSize, intSumBlockSize);
        _buf.allocate(cn);
        buf = _buf;

        for( k = 0; k < cn; k++ )
            buf[k] = 0;
        esz = src.elemSize();
    }

    for( size_t i = 0; i < it.nplanes; i++, ++it )
    {
        for( j = 0; j < total; j += blockSize )
        {
            int bsz = std::min(total - j, blockSize);
            // func returns the number of pixels that contributed (all of
            // bsz when there is no mask).
            int nz = func( ptrs[0], ptrs[1], (uchar*)buf, bsz, cn );
            count += nz;
            nz0 += nz;
            if( blockSum && (count + blockSize >= intSumBlockSize || (i+1 >= it.nplanes && j+bsz >= total)) )
            {
                for( k = 0; k < cn; k++ )
                {
                    s[k] += buf[k];
                    buf[k] = 0;
                }
                count = 0;
            }
            ptrs[0] += bsz*esz;
            if( ptrs[1] )
                ptrs[1] += bsz;
        }
    }
    // Mean = sum / count; an all-zero mask yields Scalar of zeros.
    return s*(nz0 ? 1./nz0 : 0);
}
   1500 
   1501 #ifdef HAVE_OPENCL
   1502 
   1503 namespace cv {
   1504 
// OpenCL implementation of meanStdDev: a device kernel produces per-group
// partial sums and sums of squares, which are reduced on the host.
// Returns false whenever the OpenCL path cannot be used, so the caller
// falls back to the CPU implementation.
static bool ocl_meanStdDev( InputArray _src, OutputArray _mean, OutputArray _sdv, InputArray _mask )
{
    bool haveMask = _mask.kind() != _InputArray::NONE;
    // Without a mask every element contributes; with a mask the non-zero
    // count (nz) is read back from the kernel's output buffer below.
    int nz = haveMask ? -1 : (int)_src.total();
    Scalar mean, stddev;

    {
        int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
        bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0,
                isContinuous = _src.isContinuous(),
                isMaskContinuous = _mask.isContinuous();
        const ocl::Device &defDev = ocl::Device::getDefault();
        int groups = defDev.maxComputeUnits();
        if (defDev.isIntel())
        {
            // Heuristic for Intel GPUs: launch two work-groups per 10-EU
            // sub-slice instead of one per compute unit.
            static const int subSliceEUCount = 10;
            groups = (groups / subSliceEUCount) * 2;
        }
        size_t wgs = defDev.maxWorkGroupSize();

        // Accumulator depths: at least CV_32S for plain sums and at least
        // CV_32F for squared sums, widened further for float/double input.
        int ddepth = std::max(CV_32S, depth), sqddepth = std::max(CV_32F, depth),
                dtype = CV_MAKE_TYPE(ddepth, cn),
                sqdtype = CV_MAKETYPE(sqddepth, cn);
        CV_Assert(!haveMask || _mask.type() == CV_8UC1);

        // Power-of-two helper size for the kernel's tree reduction:
        // first power of two >= wgs, then halved.
        int wgs2_aligned = 1;
        while (wgs2_aligned < (int)wgs)
            wgs2_aligned <<= 1;
        wgs2_aligned >>= 1;

        // Bail out when doubles are required but unsupported, or when the
        // channel count exceeds what the kernel handles.
        if ( (!doubleSupport && depth == CV_64F) || cn > 4 )
            return false;

        // Build-time options encoding types, conversions and layout flags
        // for the meanstddev OpenCL program.
        char cvt[2][40];
        String opts = format("-D srcT=%s -D srcT1=%s -D dstT=%s -D dstT1=%s -D sqddepth=%d"
                             " -D sqdstT=%s -D sqdstT1=%s -D convertToSDT=%s -D cn=%d%s%s"
                             " -D convertToDT=%s -D WGS=%d -D WGS2_ALIGNED=%d%s%s",
                             ocl::typeToStr(type), ocl::typeToStr(depth),
                             ocl::typeToStr(dtype), ocl::typeToStr(ddepth), sqddepth,
                             ocl::typeToStr(sqdtype), ocl::typeToStr(sqddepth),
                             ocl::convertTypeStr(depth, sqddepth, cn, cvt[0]),
                             cn, isContinuous ? " -D HAVE_SRC_CONT" : "",
                             isMaskContinuous ? " -D HAVE_MASK_CONT" : "",
                             ocl::convertTypeStr(depth, ddepth, cn, cvt[1]),
                             (int)wgs, wgs2_aligned, haveMask ? " -D HAVE_MASK" : "",
                             doubleSupport ? " -D DOUBLE_SUPPORT" : "");

        ocl::Kernel k("meanStdDev", ocl::core::meanstddev_oclsrc, opts);
        if (k.empty())
            return false;

        // Device buffer layout (per the reads further down): groups entries of
        // dtype sums, then groups entries of sqdtype squared sums, then -
        // only with a mask - groups int non-zero counts.
        int dbsize = groups * ((haveMask ? CV_ELEM_SIZE1(CV_32S) : 0) +
                               CV_ELEM_SIZE(sqdtype) + CV_ELEM_SIZE(dtype));
        UMat src = _src.getUMat(), db(1, dbsize, CV_8UC1), mask = _mask.getUMat();

        ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src),
                dbarg = ocl::KernelArg::PtrWriteOnly(db),
                maskarg = ocl::KernelArg::ReadOnlyNoSize(mask);

        if (haveMask)
            k.args(srcarg, src.cols, (int)src.total(), groups, dbarg, maskarg);
        else
            k.args(srcarg, src.cols, (int)src.total(), groups, dbarg);

        size_t globalsize = groups * wgs;
        if (!k.run(1, &globalsize, &wgs, false))
            return false;

        // Host-side reduction of the per-group partial results; the function
        // table is indexed by accumulator depth (CV_32S/CV_32F/CV_64F).
        typedef Scalar (* part_sum)(Mat m);
        part_sum funcs[3] = { ocl_part_sum<int>, ocl_part_sum<float>, ocl_part_sum<double> };
        Mat dbm = db.getMat(ACCESS_READ);

        mean = funcs[ddepth - CV_32S](Mat(1, groups, dtype, dbm.ptr()));
        stddev = funcs[sqddepth - CV_32S](Mat(1, groups, sqdtype, dbm.ptr() + groups * CV_ELEM_SIZE(dtype)));

        if (haveMask)
            nz = saturate_cast<int>(funcs[0](Mat(1, groups, CV_32SC1, dbm.ptr() +
                                                 groups * (CV_ELEM_SIZE(dtype) +
                                                           CV_ELEM_SIZE(sqdtype))))[0]);
    }

    // Finish: mean = sum/nz, stddev = sqrt(E[x^2] - E[x]^2) clamped at 0 to
    // guard against tiny negative values from rounding. nz == 0 (empty mask)
    // yields zeros.
    double total = nz != 0 ? 1.0 / nz : 0;
    int k, j, cn = _src.channels();
    for (int i = 0; i < cn; ++i)
    {
        mean[i] *= total;
        stddev[i] = std::sqrt(std::max(stddev[i] * total - mean[i] * mean[i] , 0.));
    }

    // Copy the results into the requested outputs, zero-padding any extra
    // channels of fixed-size destinations.
    for( j = 0; j < 2; j++ )
    {
        const double * const sptr = j == 0 ? &mean[0] : &stddev[0];
        _OutputArray _dst = j == 0 ? _mean : _sdv;
        if( !_dst.needed() )
            continue;

        if( !_dst.fixedSize() )
            _dst.create(cn, 1, CV_64F, -1, true);
        Mat dst = _dst.getMat();
        int dcn = (int)dst.total();
        CV_Assert( dst.type() == CV_64F && dst.isContinuous() &&
                   (dst.cols == 1 || dst.rows == 1) && dcn >= cn );
        double* dptr = dst.ptr<double>();
        for( k = 0; k < cn; k++ )
            dptr[k] = sptr[k];
        for( ; k < dcn; k++ )
            dptr[k] = 0;
    }

    return true;
}
   1616 
   1617 }
   1618 
   1619 #endif
   1620 
// Computes per-channel mean and standard deviation of src, optionally over a
// CV_8UC1 mask. Dispatch order: OpenCL (UMat, <=2 dims) -> IPP (2D or
// continuous data, selected types) -> generic block-wise CPU reduction.
void cv::meanStdDev( InputArray _src, OutputArray _mean, OutputArray _sdv, InputArray _mask )
{
    CV_OCL_RUN(OCL_PERFORMANCE_CHECK(_src.isUMat()) && _src.dims() <= 2,
               ocl_meanStdDev(_src, _mean, _sdv, _mask))

    Mat src = _src.getMat(), mask = _mask.getMat();
    CV_Assert( mask.empty() || mask.type() == CV_8UC1 );

    int k, cn = src.channels(), depth = src.depth();

#if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
    CV_IPP_CHECK()
    {
        size_t total_size = src.total();
        int rows = src.size[0], cols = rows ? (int)(total_size/rows) : 0;
        // IPP works on 2D images; continuous higher-dimensional data is
        // reinterpreted as a single rows x cols image.
        if( src.dims == 2 || (src.isContinuous() && mask.isContinuous() && cols > 0 && (size_t)rows*cols == total_size) )
        {
            // Local scratch used when an output is not requested, so IPP
            // always has somewhere to write.
            Ipp64f mean_temp[3];
            Ipp64f stddev_temp[3];
            Ipp64f *pmean = &mean_temp[0];
            Ipp64f *pstddev = &stddev_temp[0];
            Mat mean, stddev;
            int dcn_mean = -1;
            if( _mean.needed() )
            {
                if( !_mean.fixedSize() )
                    _mean.create(cn, 1, CV_64F, -1, true);
                mean = _mean.getMat();
                dcn_mean = (int)mean.total();
                pmean = mean.ptr<Ipp64f>();
            }
            int dcn_stddev = -1;
            if( _sdv.needed() )
            {
                if( !_sdv.fixedSize() )
                    _sdv.create(cn, 1, CV_64F, -1, true);
                stddev = _sdv.getMat();
                dcn_stddev = (int)stddev.total();
                pstddev = stddev.ptr<Ipp64f>();
            }
            // Zero-pad output channels beyond the source channel count
            // (fixed-size destinations may be larger than cn).
            for( int c = cn; c < dcn_mean; c++ )
                pmean[c] = 0;
            for( int c = cn; c < dcn_stddev; c++ )
                pstddev[c] = 0;
            IppiSize sz = { cols, rows };
            int type = src.type();
            if( !mask.empty() )
            {
                // Masked single-channel variants.
                typedef IppStatus (CV_STDCALL* ippiMaskMeanStdDevFuncC1)(const void *, int, const void *, int, IppiSize, Ipp64f *, Ipp64f *);
                ippiMaskMeanStdDevFuncC1 ippFuncC1 =
                type == CV_8UC1 ? (ippiMaskMeanStdDevFuncC1)ippiMean_StdDev_8u_C1MR :
                type == CV_16UC1 ? (ippiMaskMeanStdDevFuncC1)ippiMean_StdDev_16u_C1MR :
                type == CV_32FC1 ? (ippiMaskMeanStdDevFuncC1)ippiMean_StdDev_32f_C1MR :
                0;
                if( ippFuncC1 )
                {
                    if( ippFuncC1(src.ptr(), (int)src.step[0], mask.ptr(), (int)mask.step[0], sz, pmean, pstddev) >= 0 )
                    {
                        CV_IMPL_ADD(CV_IMPL_IPP);
                        return;
                    }
                    // IPP failed; clear the error and fall through to the
                    // generic implementation.
                    setIppErrorStatus();
                }
                // Masked three-channel variants, invoked once per channel
                // (the 1-based channel-of-interest is the 6th argument).
                typedef IppStatus (CV_STDCALL* ippiMaskMeanStdDevFuncC3)(const void *, int, const void *, int, IppiSize, int, Ipp64f *, Ipp64f *);
                ippiMaskMeanStdDevFuncC3 ippFuncC3 =
                type == CV_8UC3 ? (ippiMaskMeanStdDevFuncC3)ippiMean_StdDev_8u_C3CMR :
                type == CV_16UC3 ? (ippiMaskMeanStdDevFuncC3)ippiMean_StdDev_16u_C3CMR :
                type == CV_32FC3 ? (ippiMaskMeanStdDevFuncC3)ippiMean_StdDev_32f_C3CMR :
                0;
                if( ippFuncC3 )
                {
                    if( ippFuncC3(src.ptr(), (int)src.step[0], mask.ptr(), (int)mask.step[0], sz, 1, &pmean[0], &pstddev[0]) >= 0 &&
                        ippFuncC3(src.ptr(), (int)src.step[0], mask.ptr(), (int)mask.step[0], sz, 2, &pmean[1], &pstddev[1]) >= 0 &&
                        ippFuncC3(src.ptr(), (int)src.step[0], mask.ptr(), (int)mask.step[0], sz, 3, &pmean[2], &pstddev[2]) >= 0 )
                    {
                        CV_IMPL_ADD(CV_IMPL_IPP);
                        return;
                    }
                    setIppErrorStatus();
                }
            }
            else
            {
                // Unmasked single-channel variants.
                typedef IppStatus (CV_STDCALL* ippiMeanStdDevFuncC1)(const void *, int, IppiSize, Ipp64f *, Ipp64f *);
                ippiMeanStdDevFuncC1 ippFuncC1 =
                type == CV_8UC1 ? (ippiMeanStdDevFuncC1)ippiMean_StdDev_8u_C1R :
                type == CV_16UC1 ? (ippiMeanStdDevFuncC1)ippiMean_StdDev_16u_C1R :
#if (IPP_VERSION_X100 >= 801)
                type == CV_32FC1 ? (ippiMeanStdDevFuncC1)ippiMean_StdDev_32f_C1R ://Aug 2013: bug in IPP 7.1, 8.0
#endif
                0;
                if( ippFuncC1 )
                {
                    if( ippFuncC1(src.ptr(), (int)src.step[0], sz, pmean, pstddev) >= 0 )
                    {
                        CV_IMPL_ADD(CV_IMPL_IPP);
                        return;
                    }
                    setIppErrorStatus();
                }
                // Unmasked three-channel variants, one call per channel.
                typedef IppStatus (CV_STDCALL* ippiMeanStdDevFuncC3)(const void *, int, IppiSize, int, Ipp64f *, Ipp64f *);
                ippiMeanStdDevFuncC3 ippFuncC3 =
                type == CV_8UC3 ? (ippiMeanStdDevFuncC3)ippiMean_StdDev_8u_C3CR :
                type == CV_16UC3 ? (ippiMeanStdDevFuncC3)ippiMean_StdDev_16u_C3CR :
                type == CV_32FC3 ? (ippiMeanStdDevFuncC3)ippiMean_StdDev_32f_C3CR :
                0;
                if( ippFuncC3 )
                {
                    if( ippFuncC3(src.ptr(), (int)src.step[0], sz, 1, &pmean[0], &pstddev[0]) >= 0 &&
                        ippFuncC3(src.ptr(), (int)src.step[0], sz, 2, &pmean[1], &pstddev[1]) >= 0 &&
                        ippFuncC3(src.ptr(), (int)src.step[0], sz, 3, &pmean[2], &pstddev[2]) >= 0 )
                    {
                        CV_IMPL_ADD(CV_IMPL_IPP);
                        return;
                    }
                    setIppErrorStatus();
                }
            }
        }
    }
#endif


    // Generic path: per-depth kernel accumulating sums and squared sums.
    SumSqrFunc func = getSumSqrTab(depth);

    CV_Assert( func != 0 );

    const Mat* arrays[] = {&src, &mask, 0};
    uchar* ptrs[2];
    NAryMatIterator it(arrays, ptrs);
    int total = (int)it.size, blockSize = total, intSumBlockSize = 0;
    int j, count = 0, nz0 = 0;
    // _buf holds the double totals s[cn] and sq[cn]; for small depths the
    // int staging buffers sbuf/sqbuf are carved from the same allocation.
    AutoBuffer<double> _buf(cn*4);
    double *s = (double*)_buf, *sq = s + cn;
    int *sbuf = (int*)s, *sqbuf = (int*)sq;
    // For depths up to CV_16S (sums) / CV_8S (squared sums) partial results
    // fit in int accumulators, flushed periodically to avoid overflow.
    bool blockSum = depth <= CV_16S, blockSqSum = depth <= CV_8S;
    // esz stays 0 when !blockSum: then blockSize == total, the inner loop
    // runs once per plane and the iterator advances the pointers instead.
    size_t esz = 0;

    for( k = 0; k < cn; k++ )
        s[k] = sq[k] = 0;

    if( blockSum )
    {
        intSumBlockSize = 1 << 15;
        blockSize = std::min(blockSize, intSumBlockSize);
        sbuf = (int*)(sq + cn);
        if( blockSqSum )
            sqbuf = sbuf + cn;
        for( k = 0; k < cn; k++ )
            sbuf[k] = sqbuf[k] = 0;
        esz = src.elemSize();
    }

    for( size_t i = 0; i < it.nplanes; i++, ++it )
    {
        for( j = 0; j < total; j += blockSize )
        {
            int bsz = std::min(total - j, blockSize);
            // When !blockSum/!blockSqSum, sbuf/sqbuf alias the double totals
            // and the kernel accumulates directly into them.
            int nz = func( ptrs[0], ptrs[1], (uchar*)sbuf, (uchar*)sqbuf, bsz, cn );
            count += nz;
            nz0 += nz;
            // Flush the int staging buffers into the double totals before
            // they could overflow, and once more on the very last block.
            if( blockSum && (count + blockSize >= intSumBlockSize || (i+1 >= it.nplanes && j+bsz >= total)) )
            {
                for( k = 0; k < cn; k++ )
                {
                    s[k] += sbuf[k];
                    sbuf[k] = 0;
                }
                if( blockSqSum )
                {
                    for( k = 0; k < cn; k++ )
                    {
                        sq[k] += sqbuf[k];
                        sqbuf[k] = 0;
                    }
                }
                count = 0;
            }
            ptrs[0] += bsz*esz;
            if( ptrs[1] )
                ptrs[1] += bsz;
        }
    }

    // mean = sum/nz0; stddev = sqrt(E[x^2] - E[x]^2), clamped at 0 against
    // rounding error. nz0 == 0 (empty mask) yields zeros.
    double scale = nz0 ? 1./nz0 : 0.;
    for( k = 0; k < cn; k++ )
    {
        s[k] *= scale;
        sq[k] = std::sqrt(std::max(sq[k]*scale - s[k]*s[k], 0.));
    }

    // Write the requested outputs, zero-padding extra channels of
    // fixed-size destinations.
    for( j = 0; j < 2; j++ )
    {
        const double* sptr = j == 0 ? s : sq;
        _OutputArray _dst = j == 0 ? _mean : _sdv;
        if( !_dst.needed() )
            continue;

        if( !_dst.fixedSize() )
            _dst.create(cn, 1, CV_64F, -1, true);
        Mat dst = _dst.getMat();
        int dcn = (int)dst.total();
        CV_Assert( dst.type() == CV_64F && dst.isContinuous() &&
                   (dst.cols == 1 || dst.rows == 1) && dcn >= cn );
        double* dptr = dst.ptr<double>();
        for( k = 0; k < cn; k++ )
            dptr[k] = sptr[k];
        for( ; k < dcn; k++ )
            dptr[k] = 0;
    }
}
   1832 
   1833 /****************************************************************************************\
   1834 *                                       minMaxLoc                                        *
   1835 \****************************************************************************************/
   1836 
   1837 namespace cv
   1838 {
   1839 
   1840 template<typename T, typename WT> static void
   1841 minMaxIdx_( const T* src, const uchar* mask, WT* _minVal, WT* _maxVal,
   1842             size_t* _minIdx, size_t* _maxIdx, int len, size_t startIdx )
   1843 {
   1844     WT minVal = *_minVal, maxVal = *_maxVal;
   1845     size_t minIdx = *_minIdx, maxIdx = *_maxIdx;
   1846 
   1847     if( !mask )
   1848     {
   1849         for( int i = 0; i < len; i++ )
   1850         {
   1851             T val = src[i];
   1852             if( val < minVal )
   1853             {
   1854                 minVal = val;
   1855                 minIdx = startIdx + i;
   1856             }
   1857             if( val > maxVal )
   1858             {
   1859                 maxVal = val;
   1860                 maxIdx = startIdx + i;
   1861             }
   1862         }
   1863     }
   1864     else
   1865     {
   1866         for( int i = 0; i < len; i++ )
   1867         {
   1868             T val = src[i];
   1869             if( mask[i] && val < minVal )
   1870             {
   1871                 minVal = val;
   1872                 minIdx = startIdx + i;
   1873             }
   1874             if( mask[i] && val > maxVal )
   1875             {
   1876                 maxVal = val;
   1877                 maxIdx = startIdx + i;
   1878             }
   1879         }
   1880     }
   1881 
   1882     *_minIdx = minIdx;
   1883     *_maxIdx = maxIdx;
   1884     *_minVal = minVal;
   1885     *_maxVal = maxVal;
   1886 }
   1887 
   1888 static void minMaxIdx_8u(const uchar* src, const uchar* mask, int* minval, int* maxval,
   1889                          size_t* minidx, size_t* maxidx, int len, size_t startidx )
   1890 { minMaxIdx_(src, mask, minval, maxval, minidx, maxidx, len, startidx ); }
   1891 
   1892 static void minMaxIdx_8s(const schar* src, const uchar* mask, int* minval, int* maxval,
   1893                          size_t* minidx, size_t* maxidx, int len, size_t startidx )
   1894 { minMaxIdx_(src, mask, minval, maxval, minidx, maxidx, len, startidx ); }
   1895 
   1896 static void minMaxIdx_16u(const ushort* src, const uchar* mask, int* minval, int* maxval,
   1897                           size_t* minidx, size_t* maxidx, int len, size_t startidx )
   1898 { minMaxIdx_(src, mask, minval, maxval, minidx, maxidx, len, startidx ); }
   1899 
   1900 static void minMaxIdx_16s(const short* src, const uchar* mask, int* minval, int* maxval,
   1901                           size_t* minidx, size_t* maxidx, int len, size_t startidx )
   1902 { minMaxIdx_(src, mask, minval, maxval, minidx, maxidx, len, startidx ); }
   1903 
   1904 static void minMaxIdx_32s(const int* src, const uchar* mask, int* minval, int* maxval,
   1905                           size_t* minidx, size_t* maxidx, int len, size_t startidx )
   1906 { minMaxIdx_(src, mask, minval, maxval, minidx, maxidx, len, startidx ); }
   1907 
   1908 static void minMaxIdx_32f(const float* src, const uchar* mask, float* minval, float* maxval,
   1909                           size_t* minidx, size_t* maxidx, int len, size_t startidx )
   1910 { minMaxIdx_(src, mask, minval, maxval, minidx, maxidx, len, startidx ); }
   1911 
   1912 static void minMaxIdx_64f(const double* src, const uchar* mask, double* minval, double* maxval,
   1913                           size_t* minidx, size_t* maxidx, int len, size_t startidx )
   1914 { minMaxIdx_(src, mask, minval, maxval, minidx, maxidx, len, startidx ); }
   1915 
// Common function-pointer type all minMaxIdx_* kernels are cast to; the
// actual value/extremum pointer types differ per depth (int* for the
// integer kernels, float*/double* for the floating-point ones), so callers
// must pass buffers matching the selected depth.
typedef void (*MinMaxIdxFunc)(const uchar*, const uchar*, int*, int*, size_t*, size_t*, int, size_t);

// Returns the min/max scan kernel for the given element depth; the table is
// indexed directly by the depth constant (CV_8U..CV_64F), with a trailing 0
// for the unsupported slot.
static MinMaxIdxFunc getMinmaxTab(int depth)
{
    static MinMaxIdxFunc minmaxTab[] =
    {
        (MinMaxIdxFunc)GET_OPTIMIZED(minMaxIdx_8u), (MinMaxIdxFunc)GET_OPTIMIZED(minMaxIdx_8s),
        (MinMaxIdxFunc)GET_OPTIMIZED(minMaxIdx_16u), (MinMaxIdxFunc)GET_OPTIMIZED(minMaxIdx_16s),
        (MinMaxIdxFunc)GET_OPTIMIZED(minMaxIdx_32s),
        (MinMaxIdxFunc)GET_OPTIMIZED(minMaxIdx_32f), (MinMaxIdxFunc)GET_OPTIMIZED(minMaxIdx_64f),
        0
    };

    return minmaxTab[depth];
}
   1931 
   1932 static void ofs2idx(const Mat& a, size_t ofs, int* idx)
   1933 {
   1934     int i, d = a.dims;
   1935     if( ofs > 0 )
   1936     {
   1937         ofs--;
   1938         for( i = d-1; i >= 0; i-- )
   1939         {
   1940             int sz = a.size[i];
   1941             idx[i] = (int)(ofs % sz);
   1942             ofs /= sz;
   1943         }
   1944     }
   1945     else
   1946     {
   1947         for( i = d-1; i >= 0; i-- )
   1948             idx[i] = -1;
   1949     }
   1950 }
   1951 
   1952 #ifdef HAVE_OPENCL
   1953 
// Host-side finish of the OpenCL min/max reduction: merges the per-group
// partial results stored in db into final values and 2D locations. Per the
// pointer arithmetic below, db holds consecutive sections of groupnum
// entries in the order: min values, max values, min locations, max
// locations, secondary max values - each section present only when the
// corresponding output was requested.
template <typename T>
void getMinMaxRes(const Mat & db, double * minVal, double * maxVal,
                  int* minLoc, int* maxLoc,
                  int groupnum, int cols, double * maxVal2)
{
    // UINT_MAX is the kernel's sentinel for "no location found"
    // (e.g. an all-zero mask).
    uint index_max = std::numeric_limits<uint>::max();
    T minval = std::numeric_limits<T>::max();
    // For floating-point T, numeric_limits::min() is the smallest positive
    // value, so use -max() as the initial maximum instead.
    T maxval = std::numeric_limits<T>::min() > 0 ? -std::numeric_limits<T>::max() : std::numeric_limits<T>::min(), maxval2 = maxval;
    uint minloc = index_max, maxloc = index_max;

    // Walk the buffer, picking up only the sections that were computed.
    int index = 0;
    const T * minptr = NULL, * maxptr = NULL, * maxptr2 = NULL;
    const uint * minlocptr = NULL, * maxlocptr = NULL;
    if (minVal || minLoc)
    {
        minptr = db.ptr<T>();
        index += sizeof(T) * groupnum;
    }
    if (maxVal || maxLoc)
    {
        maxptr = (const T *)(db.ptr() + index);
        index += sizeof(T) * groupnum;
    }
    if (minLoc)
    {
        minlocptr = (const uint *)(db.ptr() + index);
        index += sizeof(uint) * groupnum;
    }
    if (maxLoc)
    {
        maxlocptr = (const uint *)(db.ptr() + index);
        index += sizeof(uint) * groupnum;
    }
    if (maxVal2)
        maxptr2 = (const T *)(db.ptr() + index);

    for (int i = 0; i < groupnum; i++)
    {
        // When several groups report the same extremum, keep the smallest
        // linear location so the result matches a single sequential scan.
        if (minptr && minptr[i] <= minval)
        {
            if (minptr[i] == minval)
            {
                if (minlocptr)
                    minloc = std::min(minlocptr[i], minloc);
            }
            else
            {
                if (minlocptr)
                    minloc = minlocptr[i];
                minval = minptr[i];
            }
        }
        if (maxptr && maxptr[i] >= maxval)
        {
            if (maxptr[i] == maxval)
            {
                if (maxlocptr)
                    maxloc = std::min(maxlocptr[i], maxloc);
            }
            else
            {
                if (maxlocptr)
                    maxloc = maxlocptr[i];
                maxval = maxptr[i];
            }
        }
        if (maxptr2 && maxptr2[i] > maxval2)
            maxval2 = maxptr2[i];
    }
    // A still-unset location means no element was examined (empty mask):
    // report zero values and (-1, -1) locations.
    bool zero_mask = (minLoc && minloc == index_max) ||
            (maxLoc && maxloc == index_max);

    if (minVal)
        *minVal = zero_mask ? 0 : (double)minval;
    if (maxVal)
        *maxVal = zero_mask ? 0 : (double)maxval;
    if (maxVal2)
        *maxVal2 = zero_mask ? 0 : (double)maxval2;

    // Convert linear locations to (row, col) using the row length.
    if (minLoc)
    {
        minLoc[0] = zero_mask ? -1 : minloc / cols;
        minLoc[1] = zero_mask ? -1 : minloc % cols;
    }
    if (maxLoc)
    {
        maxLoc[0] = zero_mask ? -1 : maxloc / cols;
        maxLoc[1] = zero_mask ? -1 : maxloc % cols;
    }
}
   2044 
   2045 typedef void (*getMinMaxResFunc)(const Mat & db, double * minVal, double * maxVal,
   2046                                  int * minLoc, int *maxLoc, int gropunum, int cols, double * maxVal2);
   2047 
// OpenCL implementation of minMaxIdx and related reductions (also used with
// absValues / _src2 / maxVal2 extensions - presumably by minMaxLoc/norm
// callers elsewhere in this file; confirm against call sites). A device
// kernel produces per-group partial extrema which getMinMaxRes merges on the
// host. Returns false when the OpenCL path cannot be used so the caller
// falls back to the CPU implementation.
static bool ocl_minMaxIdx( InputArray _src, double* minVal, double* maxVal, int* minLoc, int* maxLoc, InputArray _mask,
                           int ddepth = -1, bool absValues = false, InputArray _src2 = noArray(), double * maxVal2 = NULL)
{
    const ocl::Device & dev = ocl::Device::getDefault();

#ifdef ANDROID
    if (dev.isNVidia())
        return false;
#endif

    bool doubleSupport = dev.doubleFPConfig() > 0, haveMask = !_mask.empty(),
        haveSrc2 = _src2.kind() != _InputArray::NONE;
    // Without a mask, multi-channel data is processed as one flattened
    // channel with a vectorized kernel (see the reshape below).
    int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type),
            kercn = haveMask ? cn : std::min(4, ocl::predictOptimalVectorWidth(_src, _src2));

    // disabled following modes since it occasionally fails on AMD devices (e.g. A10-6800K, sep. 2014)
    if ((haveMask || type == CV_32FC1) && dev.isAMD())
        return false;

    // Locations are only supported for single-channel input.
    CV_Assert( (cn == 1 && (!haveMask || _mask.type() == CV_8U)) ||
              (cn >= 1 && !minLoc && !maxLoc) );

    if (ddepth < 0)
        ddepth = depth;

    CV_Assert(!haveSrc2 || _src2.type() == type);

    if (depth == CV_32S)
        return false;

    if ((depth == CV_64F || ddepth == CV_64F) && !doubleSupport)
        return false;

    int groupnum = dev.maxComputeUnits();
    size_t wgs = dev.maxWorkGroupSize();

    // Power-of-two helper size for the kernel's tree reduction:
    // first power of two >= wgs, then halved.
    int wgs2_aligned = 1;
    while (wgs2_aligned < (int)wgs)
        wgs2_aligned <<= 1;
    wgs2_aligned >>= 1;

    bool needMinVal = minVal || minLoc, needMinLoc = minLoc != NULL,
            needMaxVal = maxVal || maxLoc, needMaxLoc = maxLoc != NULL;

    // in case of mask we must know whether mask is filled with zeros or not
    // so let's calculate min or max location, if it's undefined, so mask is zeros
    if (!(needMaxLoc || needMinLoc) && haveMask)
    {
        if (needMinVal)
            needMinLoc = true;
        else
            needMaxLoc = true;
    }

    // Build-time options: types, conversions, which outputs to compute, and
    // layout/feature flags for the minmaxloc OpenCL program.
    char cvt[2][40];
    String opts = format("-D DEPTH_%d -D srcT1=%s%s -D WGS=%d -D srcT=%s"
                         " -D WGS2_ALIGNED=%d%s%s%s -D kercn=%d%s%s%s%s"
                         " -D dstT1=%s -D dstT=%s -D convertToDT=%s%s%s%s%s -D wdepth=%d -D convertFromU=%s",
                         depth, ocl::typeToStr(depth), haveMask ? " -D HAVE_MASK" : "", (int)wgs,
                         ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)), wgs2_aligned,
                         doubleSupport ? " -D DOUBLE_SUPPORT" : "",
                         _src.isContinuous() ? " -D HAVE_SRC_CONT" : "",
                         _mask.isContinuous() ? " -D HAVE_MASK_CONT" : "", kercn,
                         needMinVal ? " -D NEED_MINVAL" : "", needMaxVal ? " -D NEED_MAXVAL" : "",
                         needMinLoc ? " -D NEED_MINLOC" : "", needMaxLoc ? " -D NEED_MAXLOC" : "",
                         ocl::typeToStr(ddepth), ocl::typeToStr(CV_MAKE_TYPE(ddepth, kercn)),
                         ocl::convertTypeStr(depth, ddepth, kercn, cvt[0]),
                         absValues ? " -D OP_ABS" : "",
                         haveSrc2 ? " -D HAVE_SRC2" : "", maxVal2 ? " -D OP_CALC2" : "",
                         haveSrc2 && _src2.isContinuous() ? " -D HAVE_SRC2_CONT" : "", ddepth,
                         depth <= CV_32S && ddepth == CV_32S ? ocl::convertTypeStr(CV_8U, ddepth, kercn, cvt[1]) : "noconvert");

    ocl::Kernel k("minmaxloc", ocl::core::minmaxloc_oclsrc, opts);
    if (k.empty())
        return false;

    // Device buffer sized for the per-group sections getMinMaxRes expects:
    // min/max values, min/max locations, secondary max values - each only
    // when requested.
    int esz = CV_ELEM_SIZE(ddepth), esz32s = CV_ELEM_SIZE1(CV_32S),
            dbsize = groupnum * ((needMinVal ? esz : 0) + (needMaxVal ? esz : 0) +
                                 (needMinLoc ? esz32s : 0) + (needMaxLoc ? esz32s : 0) +
                                 (maxVal2 ? esz : 0));
    UMat src = _src.getUMat(), src2 = _src2.getUMat(), db(1, dbsize, CV_8UC1), mask = _mask.getUMat();

    if (cn > 1 && !haveMask)
    {
        src = src.reshape(1);
        src2 = src2.reshape(1);
    }

    // Argument lists differ depending on which optional inputs are present.
    if (haveSrc2)
    {
        if (!haveMask)
            k.args(ocl::KernelArg::ReadOnlyNoSize(src), src.cols, (int)src.total(),
                   groupnum, ocl::KernelArg::PtrWriteOnly(db), ocl::KernelArg::ReadOnlyNoSize(src2));
        else
            k.args(ocl::KernelArg::ReadOnlyNoSize(src), src.cols, (int)src.total(),
                   groupnum, ocl::KernelArg::PtrWriteOnly(db), ocl::KernelArg::ReadOnlyNoSize(mask),
                   ocl::KernelArg::ReadOnlyNoSize(src2));
    }
    else
    {
        if (!haveMask)
            k.args(ocl::KernelArg::ReadOnlyNoSize(src), src.cols, (int)src.total(),
                   groupnum, ocl::KernelArg::PtrWriteOnly(db));
        else
            k.args(ocl::KernelArg::ReadOnlyNoSize(src), src.cols, (int)src.total(),
                   groupnum, ocl::KernelArg::PtrWriteOnly(db), ocl::KernelArg::ReadOnlyNoSize(mask));
    }

    size_t globalsize = groupnum * wgs;
    if (!k.run(1, &globalsize, &wgs, true))
        return false;

    // Host-side merge function, selected by accumulator depth.
    static const getMinMaxResFunc functab[7] =
    {
        getMinMaxRes<uchar>,
        getMinMaxRes<char>,
        getMinMaxRes<ushort>,
        getMinMaxRes<short>,
        getMinMaxRes<int>,
        getMinMaxRes<float>,
        getMinMaxRes<double>
    };

    getMinMaxResFunc func = functab[ddepth];

    // locTemp receives a location the caller did not ask for but that was
    // forced on above to detect an all-zero mask.
    int locTemp[2];
    func(db.getMat(ACCESS_READ), minVal, maxVal,
         needMinLoc ? minLoc ? minLoc : locTemp : minLoc,
         needMaxLoc ? maxLoc ? maxLoc : locTemp : maxLoc,
         groupnum, src.cols, maxVal2);

    return true;
}
   2181 
   2182 #endif
   2183 
   2184 }
   2185 
   2186 void cv::minMaxIdx(InputArray _src, double* minVal,
   2187                    double* maxVal, int* minIdx, int* maxIdx,
   2188                    InputArray _mask)
   2189 {
   2190     int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
   2191     CV_Assert( (cn == 1 && (_mask.empty() || _mask.type() == CV_8U)) ||
   2192         (cn > 1 && _mask.empty() && !minIdx && !maxIdx) );
   2193 
   2194     CV_OCL_RUN(OCL_PERFORMANCE_CHECK(_src.isUMat()) && _src.dims() <= 2  && (_mask.empty() || _src.size() == _mask.size()),
   2195                ocl_minMaxIdx(_src, minVal, maxVal, minIdx, maxIdx, _mask))
   2196 
   2197     Mat src = _src.getMat(), mask = _mask.getMat();
   2198 
   2199 #if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
   2200     CV_IPP_CHECK()
   2201     {
   2202         size_t total_size = src.total();
   2203         int rows = src.size[0], cols = rows ? (int)(total_size/rows) : 0;
   2204         if( src.dims == 2 || (src.isContinuous() && mask.isContinuous() && cols > 0 && (size_t)rows*cols == total_size) )
   2205         {
   2206             IppiSize sz = { cols * cn, rows };
   2207 
   2208             if( !mask.empty() )
   2209             {
   2210                 typedef IppStatus (CV_STDCALL* ippiMaskMinMaxIndxFuncC1)(const void *, int, const void *, int,
   2211                                                                          IppiSize, Ipp32f *, Ipp32f *, IppiPoint *, IppiPoint *);
   2212 
   2213                 CV_SUPPRESS_DEPRECATED_START
   2214                 ippiMaskMinMaxIndxFuncC1 ippFuncC1 =
   2215                     type == CV_8UC1 ? (ippiMaskMinMaxIndxFuncC1)ippiMinMaxIndx_8u_C1MR :
   2216                     type == CV_8SC1 ? (ippiMaskMinMaxIndxFuncC1)ippiMinMaxIndx_8s_C1MR :
   2217                     type == CV_16UC1 ? (ippiMaskMinMaxIndxFuncC1)ippiMinMaxIndx_16u_C1MR :
   2218                     type == CV_32FC1 ? (ippiMaskMinMaxIndxFuncC1)ippiMinMaxIndx_32f_C1MR : 0;
   2219                 CV_SUPPRESS_DEPRECATED_END
   2220 
   2221                 if( ippFuncC1 )
   2222                 {
   2223                     Ipp32f min, max;
   2224                     IppiPoint minp, maxp;
   2225                     if( ippFuncC1(src.ptr(), (int)src.step[0], mask.ptr(), (int)mask.step[0], sz, &min, &max, &minp, &maxp) >= 0 )
   2226                     {
   2227                         if( minVal )
   2228                             *minVal = (double)min;
   2229                         if( maxVal )
   2230                             *maxVal = (double)max;
   2231                         if( !minp.x && !minp.y && !maxp.x && !maxp.y && !mask.ptr()[0] )
   2232                             minp.x = maxp.x = -1;
   2233                         if( minIdx )
   2234                         {
   2235                             size_t minidx = minp.y * cols + minp.x + 1;
   2236                             ofs2idx(src, minidx, minIdx);
   2237                         }
   2238                         if( maxIdx )
   2239                         {
   2240                             size_t maxidx = maxp.y * cols + maxp.x + 1;
   2241                             ofs2idx(src, maxidx, maxIdx);
   2242                         }
   2243                         CV_IMPL_ADD(CV_IMPL_IPP);
   2244                         return;
   2245                     }
   2246                     setIppErrorStatus();
   2247                 }
   2248             }
   2249             else
   2250             {
   2251                 typedef IppStatus (CV_STDCALL* ippiMinMaxIndxFuncC1)(const void *, int, IppiSize, Ipp32f *, Ipp32f *, IppiPoint *, IppiPoint *);
   2252 
   2253                 CV_SUPPRESS_DEPRECATED_START
   2254                 ippiMinMaxIndxFuncC1 ippFuncC1 =
   2255                     depth == CV_8U ? (ippiMinMaxIndxFuncC1)ippiMinMaxIndx_8u_C1R :
   2256                     depth == CV_8S ? (ippiMinMaxIndxFuncC1)ippiMinMaxIndx_8s_C1R :
   2257                     depth == CV_16U ? (ippiMinMaxIndxFuncC1)ippiMinMaxIndx_16u_C1R :
   2258                 #if !((defined _MSC_VER && defined _M_IX86) || defined __i386__)
   2259                     depth == CV_32F ? (ippiMinMaxIndxFuncC1)ippiMinMaxIndx_32f_C1R :
   2260                 #endif
   2261                     0;
   2262                 CV_SUPPRESS_DEPRECATED_END
   2263 
   2264                 if( ippFuncC1 )
   2265                 {
   2266                     Ipp32f min, max;
   2267                     IppiPoint minp, maxp;
   2268                     if( ippFuncC1(src.ptr(), (int)src.step[0], sz, &min, &max, &minp, &maxp) >= 0 )
   2269                     {
   2270                         if( minVal )
   2271                             *minVal = (double)min;
   2272                         if( maxVal )
   2273                             *maxVal = (double)max;
   2274                         if( minIdx )
   2275                         {
   2276                             size_t minidx = minp.y * cols + minp.x + 1;
   2277                             ofs2idx(src, minidx, minIdx);
   2278                         }
   2279                         if( maxIdx )
   2280                         {
   2281                             size_t maxidx = maxp.y * cols + maxp.x + 1;
   2282                             ofs2idx(src, maxidx, maxIdx);
   2283                         }
   2284                         CV_IMPL_ADD(CV_IMPL_IPP);
   2285                         return;
   2286                     }
   2287                     setIppErrorStatus();
   2288                 }
   2289             }
   2290         }
   2291     }
   2292 #endif
   2293 
   2294     MinMaxIdxFunc func = getMinmaxTab(depth);
   2295     CV_Assert( func != 0 );
   2296 
   2297     const Mat* arrays[] = {&src, &mask, 0};
   2298     uchar* ptrs[2];
   2299     NAryMatIterator it(arrays, ptrs);
   2300 
   2301     size_t minidx = 0, maxidx = 0;
   2302     int iminval = INT_MAX, imaxval = INT_MIN;
   2303     float fminval = FLT_MAX, fmaxval = -FLT_MAX;
   2304     double dminval = DBL_MAX, dmaxval = -DBL_MAX;
   2305     size_t startidx = 1;
   2306     int *minval = &iminval, *maxval = &imaxval;
   2307     int planeSize = (int)it.size*cn;
   2308 
   2309     if( depth == CV_32F )
   2310         minval = (int*)&fminval, maxval = (int*)&fmaxval;
   2311     else if( depth == CV_64F )
   2312         minval = (int*)&dminval, maxval = (int*)&dmaxval;
   2313 
   2314     for( size_t i = 0; i < it.nplanes; i++, ++it, startidx += planeSize )
   2315         func( ptrs[0], ptrs[1], minval, maxval, &minidx, &maxidx, planeSize, startidx );
   2316 
   2317     if( minidx == 0 )
   2318         dminval = dmaxval = 0;
   2319     else if( depth == CV_32F )
   2320         dminval = fminval, dmaxval = fmaxval;
   2321     else if( depth <= CV_32S )
   2322         dminval = iminval, dmaxval = imaxval;
   2323 
   2324     if( minVal )
   2325         *minVal = dminval;
   2326     if( maxVal )
   2327         *maxVal = dmaxval;
   2328 
   2329     if( minIdx )
   2330         ofs2idx(src, minidx, minIdx);
   2331     if( maxIdx )
   2332         ofs2idx(src, maxidx, maxIdx);
   2333 }
   2334 
   2335 void cv::minMaxLoc( InputArray _img, double* minVal, double* maxVal,
   2336                     Point* minLoc, Point* maxLoc, InputArray mask )
   2337 {
   2338     CV_Assert(_img.dims() <= 2);
   2339 
   2340     minMaxIdx(_img, minVal, maxVal, (int*)minLoc, (int*)maxLoc, mask);
   2341     if( minLoc )
   2342         std::swap(minLoc->x, minLoc->y);
   2343     if( maxLoc )
   2344         std::swap(maxLoc->x, maxLoc->y);
   2345 }
   2346 
   2347 /****************************************************************************************\
   2348 *                                         norm                                           *
   2349 \****************************************************************************************/
   2350 
   2351 namespace cv
   2352 {
   2353 
   2354 template<typename T, typename ST> int
   2355 normInf_(const T* src, const uchar* mask, ST* _result, int len, int cn)
   2356 {
   2357     ST result = *_result;
   2358     if( !mask )
   2359     {
   2360         result = std::max(result, normInf<T, ST>(src, len*cn));
   2361     }
   2362     else
   2363     {
   2364         for( int i = 0; i < len; i++, src += cn )
   2365             if( mask[i] )
   2366             {
   2367                 for( int k = 0; k < cn; k++ )
   2368                     result = std::max(result, ST(cv_abs(src[k])));
   2369             }
   2370     }
   2371     *_result = result;
   2372     return 0;
   2373 }
   2374 
   2375 template<typename T, typename ST> int
   2376 normL1_(const T* src, const uchar* mask, ST* _result, int len, int cn)
   2377 {
   2378     ST result = *_result;
   2379     if( !mask )
   2380     {
   2381         result += normL1<T, ST>(src, len*cn);
   2382     }
   2383     else
   2384     {
   2385         for( int i = 0; i < len; i++, src += cn )
   2386             if( mask[i] )
   2387             {
   2388                 for( int k = 0; k < cn; k++ )
   2389                     result += cv_abs(src[k]);
   2390             }
   2391     }
   2392     *_result = result;
   2393     return 0;
   2394 }
   2395 
   2396 template<typename T, typename ST> int
   2397 normL2_(const T* src, const uchar* mask, ST* _result, int len, int cn)
   2398 {
   2399     ST result = *_result;
   2400     if( !mask )
   2401     {
   2402         result += normL2Sqr<T, ST>(src, len*cn);
   2403     }
   2404     else
   2405     {
   2406         for( int i = 0; i < len; i++, src += cn )
   2407             if( mask[i] )
   2408             {
   2409                 for( int k = 0; k < cn; k++ )
   2410                 {
   2411                     T v = src[k];
   2412                     result += (ST)v*v;
   2413                 }
   2414             }
   2415     }
   2416     *_result = result;
   2417     return 0;
   2418 }
   2419 
   2420 template<typename T, typename ST> int
   2421 normDiffInf_(const T* src1, const T* src2, const uchar* mask, ST* _result, int len, int cn)
   2422 {
   2423     ST result = *_result;
   2424     if( !mask )
   2425     {
   2426         result = std::max(result, normInf<T, ST>(src1, src2, len*cn));
   2427     }
   2428     else
   2429     {
   2430         for( int i = 0; i < len; i++, src1 += cn, src2 += cn )
   2431             if( mask[i] )
   2432             {
   2433                 for( int k = 0; k < cn; k++ )
   2434                     result = std::max(result, (ST)std::abs(src1[k] - src2[k]));
   2435             }
   2436     }
   2437     *_result = result;
   2438     return 0;
   2439 }
   2440 
   2441 template<typename T, typename ST> int
   2442 normDiffL1_(const T* src1, const T* src2, const uchar* mask, ST* _result, int len, int cn)
   2443 {
   2444     ST result = *_result;
   2445     if( !mask )
   2446     {
   2447         result += normL1<T, ST>(src1, src2, len*cn);
   2448     }
   2449     else
   2450     {
   2451         for( int i = 0; i < len; i++, src1 += cn, src2 += cn )
   2452             if( mask[i] )
   2453             {
   2454                 for( int k = 0; k < cn; k++ )
   2455                     result += std::abs(src1[k] - src2[k]);
   2456             }
   2457     }
   2458     *_result = result;
   2459     return 0;
   2460 }
   2461 
   2462 template<typename T, typename ST> int
   2463 normDiffL2_(const T* src1, const T* src2, const uchar* mask, ST* _result, int len, int cn)
   2464 {
   2465     ST result = *_result;
   2466     if( !mask )
   2467     {
   2468         result += normL2Sqr<T, ST>(src1, src2, len*cn);
   2469     }
   2470     else
   2471     {
   2472         for( int i = 0; i < len; i++, src1 += cn, src2 += cn )
   2473             if( mask[i] )
   2474             {
   2475                 for( int k = 0; k < cn; k++ )
   2476                 {
   2477                     ST v = src1[k] - src2[k];
   2478                     result += v*v;
   2479                 }
   2480             }
   2481     }
   2482     *_result = result;
   2483     return 0;
   2484 }
   2485 
// Hamming distance between two bit vectors of `size` bytes (number of
// differing bits); delegates to the optimized HAL popcount-based kernel.
Hamming::ResultType Hamming::operator()( const unsigned char* a, const unsigned char* b, int size ) const
{
    return cv::hal::normHamming(a, b, size);
}
   2490 
// Generates thin non-template wrappers around norm##L##_ / normDiff##L##_
// with concrete element (type) and accumulator (ntype) types, so their
// addresses can be stored in the type-erased NormFunc / NormDiffFunc
// dispatch tables below.
#define CV_DEF_NORM_FUNC(L, suffix, type, ntype) \
    static int norm##L##_##suffix(const type* src, const uchar* mask, ntype* r, int len, int cn) \
{ return norm##L##_(src, mask, r, len, cn); } \
    static int normDiff##L##_##suffix(const type* src1, const type* src2, \
    const uchar* mask, ntype* r, int len, int cn) \
{ return normDiff##L##_(src1, src2, mask, r, (int)len, cn); }

// Instantiates the wrappers for all three norms (Inf/L1/L2) of one element
// type, each with its own accumulator type (int for small integer sums,
// float/double where overflow or precision requires it).
#define CV_DEF_NORM_ALL(suffix, type, inftype, l1type, l2type) \
    CV_DEF_NORM_FUNC(Inf, suffix, type, inftype) \
    CV_DEF_NORM_FUNC(L1, suffix, type, l1type) \
    CV_DEF_NORM_FUNC(L2, suffix, type, l2type)

CV_DEF_NORM_ALL(8u, uchar, int, int, int)
CV_DEF_NORM_ALL(8s, schar, int, int, int)
CV_DEF_NORM_ALL(16u, ushort, int, int, double)
CV_DEF_NORM_ALL(16s, short, int, int, double)
CV_DEF_NORM_ALL(32s, int, int, double, double)
CV_DEF_NORM_ALL(32f, float, float, double, double)
CV_DEF_NORM_ALL(64f, double, double, double, double)


// Type-erased kernel signatures used by the dispatch tables; the source and
// accumulator pointers are passed as uchar* and cast back inside the wrappers.
typedef int (*NormFunc)(const uchar*, const uchar*, uchar*, int, int);
typedef int (*NormDiffFunc)(const uchar*, const uchar*, const uchar*, uchar*, int, int);
   2514 
// Returns the scalar norm kernel for the given norm type and matrix depth.
// The caller passes the compacted norm type (normType >> 1, see cv::norm):
// NORM_INF -> 0, NORM_L1 -> 1, NORM_L2 / NORM_L2SQR -> 2.  The last (8th)
// depth slot is 0: no kernel exists for that depth.
static NormFunc getNormFunc(int normType, int depth)
{
    static NormFunc normTab[3][8] =
    {
        {
            (NormFunc)GET_OPTIMIZED(normInf_8u), (NormFunc)GET_OPTIMIZED(normInf_8s), (NormFunc)GET_OPTIMIZED(normInf_16u), (NormFunc)GET_OPTIMIZED(normInf_16s),
            (NormFunc)GET_OPTIMIZED(normInf_32s), (NormFunc)GET_OPTIMIZED(normInf_32f), (NormFunc)normInf_64f, 0
        },
        {
            (NormFunc)GET_OPTIMIZED(normL1_8u), (NormFunc)GET_OPTIMIZED(normL1_8s), (NormFunc)GET_OPTIMIZED(normL1_16u), (NormFunc)GET_OPTIMIZED(normL1_16s),
            (NormFunc)GET_OPTIMIZED(normL1_32s), (NormFunc)GET_OPTIMIZED(normL1_32f), (NormFunc)normL1_64f, 0
        },
        {
            (NormFunc)GET_OPTIMIZED(normL2_8u), (NormFunc)GET_OPTIMIZED(normL2_8s), (NormFunc)GET_OPTIMIZED(normL2_16u), (NormFunc)GET_OPTIMIZED(normL2_16s),
            (NormFunc)GET_OPTIMIZED(normL2_32s), (NormFunc)GET_OPTIMIZED(normL2_32f), (NormFunc)normL2_64f, 0
        }
    };

    return normTab[normType][depth];
}
   2535 
// Returns the scalar two-array (difference) norm kernel for the given norm
// type and depth; same index conventions as getNormFunc above.
static NormDiffFunc getNormDiffFunc(int normType, int depth)
{
    static NormDiffFunc normDiffTab[3][8] =
    {
        {
            (NormDiffFunc)GET_OPTIMIZED(normDiffInf_8u), (NormDiffFunc)normDiffInf_8s,
            (NormDiffFunc)normDiffInf_16u, (NormDiffFunc)normDiffInf_16s,
            (NormDiffFunc)normDiffInf_32s, (NormDiffFunc)GET_OPTIMIZED(normDiffInf_32f),
            (NormDiffFunc)normDiffInf_64f, 0
        },
        {
            (NormDiffFunc)GET_OPTIMIZED(normDiffL1_8u), (NormDiffFunc)normDiffL1_8s,
            (NormDiffFunc)normDiffL1_16u, (NormDiffFunc)normDiffL1_16s,
            (NormDiffFunc)normDiffL1_32s, (NormDiffFunc)GET_OPTIMIZED(normDiffL1_32f),
            (NormDiffFunc)normDiffL1_64f, 0
        },
        {
            (NormDiffFunc)GET_OPTIMIZED(normDiffL2_8u), (NormDiffFunc)normDiffL2_8s,
            (NormDiffFunc)normDiffL2_16u, (NormDiffFunc)normDiffL2_16s,
            (NormDiffFunc)normDiffL2_32s, (NormDiffFunc)GET_OPTIMIZED(normDiffL2_32f),
            (NormDiffFunc)normDiffL2_64f, 0
        }
    };

    return normDiffTab[normType][depth];
}
   2562 
   2563 #ifdef HAVE_OPENCL
   2564 
// OpenCL implementation of cv::norm() for a single array.  Returns false when
// the case is unsupported (unhandled norm type, or CV_64F without device
// double support), in which case the caller falls back to the CPU path.
static bool ocl_norm( InputArray _src, int normType, InputArray _mask, double & result )
{
    const ocl::Device & d = ocl::Device::getDefault();

#ifdef ANDROID
    // The OpenCL path is disabled on Android NVidia devices.
    if (d.isNVidia())
        return false;
#endif

    int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
    bool doubleSupport = d.doubleFPConfig() > 0,
            haveMask = _mask.kind() != _InputArray::NONE;

    if ( !(normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2 || normType == NORM_L2SQR) ||
         (!doubleSupport && depth == CV_64F))
        return false;

    UMat src = _src.getUMat();

    if (normType == NORM_INF)
    {
        // L-infinity norm == maximum absolute value, so reuse the min/max
        // kernel; the |x| step (last argument) is skipped for unsigned inputs.
        if (!ocl_minMaxIdx(_src, NULL, &result, NULL, NULL, _mask,
                           std::max(depth, CV_32S), depth != CV_8U && depth != CV_16U))
            return false;
    }
    else if (normType == NORM_L1 || normType == NORM_L2 || normType == NORM_L2SQR)
    {
        Scalar sc;
        bool unstype = depth == CV_8U || depth == CV_16U;  // unsigned: |x| == x

        // Without a mask the matrix is flattened to one channel so the
        // per-channel sums below reduce to a single accumulator.
        if ( !ocl_sum(haveMask ? src : src.reshape(1), sc, normType == NORM_L2 || normType == NORM_L2SQR ?
                    OCL_OP_SUM_SQR : (unstype ? OCL_OP_SUM : OCL_OP_SUM_ABS), _mask) )
            return false;

        if (!haveMask)
            cn = 1;

        // Fold the per-channel partial sums into a single scalar.
        double s = 0.0;
        for (int i = 0; i < cn; ++i)
            s += sc[i];

        // NORM_L2 needs the final square root; L1 and L2SQR are the sums as-is.
        result = normType == NORM_L1 || normType == NORM_L2SQR ? s : std::sqrt(s);
    }

    return true;
}
   2611 
   2612 #endif
   2613 
   2614 }
   2615 
/* Computes the absolute norm of a single array.
   normType selects NORM_INF / NORM_L1 / NORM_L2 / NORM_L2SQR, or
   NORM_HAMMING / NORM_HAMMING2 (the latter two only for 8-bit single-channel
   data).  An optional CV_8U mask restricts the computation to pixels whose
   mask byte is non-zero.  The function tries, in order: OpenCL, IPP,
   specialized continuous-data fast paths, then the generic scalar kernels. */
double cv::norm( InputArray _src, int normType, InputArray _mask )
{
    // Keep only the norm-type bits (drops flags such as NORM_RELATIVE).
    normType &= NORM_TYPE_MASK;
    CV_Assert( normType == NORM_INF || normType == NORM_L1 ||
               normType == NORM_L2 || normType == NORM_L2SQR ||
               ((normType == NORM_HAMMING || normType == NORM_HAMMING2) && _src.type() == CV_8U) );

#ifdef HAVE_OPENCL
    double _result = 0;
    CV_OCL_RUN_(OCL_PERFORMANCE_CHECK(_src.isUMat()) && _src.dims() <= 2,
                ocl_norm(_src, normType, _mask, _result),
                _result)
#endif

    Mat src = _src.getMat(), mask = _mask.getMat();
    int depth = src.depth(), cn = src.channels();

#if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
    CV_IPP_CHECK()
    {
        size_t total_size = src.total();
        int rows = src.size[0], cols = rows ? (int)(total_size/rows) : 0;

        // IPP path: 2D data (or continuous data foldable into a rows x cols
        // matrix) and one of the classic norms only.
        if( (src.dims == 2 || (src.isContinuous() && mask.isContinuous()))
            && cols > 0 && (size_t)rows*cols == total_size
            && (normType == NORM_INF || normType == NORM_L1 ||
                normType == NORM_L2 || normType == NORM_L2SQR) )
        {
            IppiSize sz = { cols, rows };
            int type = src.type();
            if( !mask.empty() )
            {
                // Masked path: only single-channel IPP variants are wired up.
                typedef IppStatus (CV_STDCALL* ippiMaskNormFuncC1)(const void *, int, const void *, int, IppiSize, Ipp64f *);
                ippiMaskNormFuncC1 ippFuncC1 =
                    normType == NORM_INF ?
                    (type == CV_8UC1 ? (ippiMaskNormFuncC1)ippiNorm_Inf_8u_C1MR :
                    type == CV_8SC1 ? (ippiMaskNormFuncC1)ippiNorm_Inf_8s_C1MR :
    //                type == CV_16UC1 ? (ippiMaskNormFuncC1)ippiNorm_Inf_16u_C1MR :
                    type == CV_32FC1 ? (ippiMaskNormFuncC1)ippiNorm_Inf_32f_C1MR :
                    0) :
                normType == NORM_L1 ?
                    (type == CV_8UC1 ? (ippiMaskNormFuncC1)ippiNorm_L1_8u_C1MR :
                    type == CV_8SC1 ? (ippiMaskNormFuncC1)ippiNorm_L1_8s_C1MR :
                    type == CV_16UC1 ? (ippiMaskNormFuncC1)ippiNorm_L1_16u_C1MR :
                    type == CV_32FC1 ? (ippiMaskNormFuncC1)ippiNorm_L1_32f_C1MR :
                    0) :
                normType == NORM_L2 || normType == NORM_L2SQR ?
                    (type == CV_8UC1 ? (ippiMaskNormFuncC1)ippiNorm_L2_8u_C1MR :
                    type == CV_8SC1 ? (ippiMaskNormFuncC1)ippiNorm_L2_8s_C1MR :
                    type == CV_16UC1 ? (ippiMaskNormFuncC1)ippiNorm_L2_16u_C1MR :
                    type == CV_32FC1 ? (ippiMaskNormFuncC1)ippiNorm_L2_32f_C1MR :
                    0) : 0;
                if( ippFuncC1 )
                {
                    Ipp64f norm;
                    if( ippFuncC1(src.ptr(), (int)src.step[0], mask.ptr(), (int)mask.step[0], sz, &norm) >= 0 )
                    {
                        CV_IMPL_ADD(CV_IMPL_IPP);
                        // ippiNorm_L2 returns the L2 norm itself, so square it
                        // when the caller asked for NORM_L2SQR.
                        return normType == NORM_L2SQR ? (double)(norm * norm) : (double)norm;
                    }

                    setIppErrorStatus();
                }
                // NOTE: the masked 3-channel IPP path below is intentionally disabled.
                /*typedef IppStatus (CV_STDCALL* ippiMaskNormFuncC3)(const void *, int, const void *, int, IppiSize, int, Ipp64f *);
                ippiMaskNormFuncC3 ippFuncC3 =
                    normType == NORM_INF ?
                    (type == CV_8UC3 ? (ippiMaskNormFuncC3)ippiNorm_Inf_8u_C3CMR :
                    type == CV_8SC3 ? (ippiMaskNormFuncC3)ippiNorm_Inf_8s_C3CMR :
                    type == CV_16UC3 ? (ippiMaskNormFuncC3)ippiNorm_Inf_16u_C3CMR :
                    type == CV_32FC3 ? (ippiMaskNormFuncC3)ippiNorm_Inf_32f_C3CMR :
                    0) :
                normType == NORM_L1 ?
                    (type == CV_8UC3 ? (ippiMaskNormFuncC3)ippiNorm_L1_8u_C3CMR :
                    type == CV_8SC3 ? (ippiMaskNormFuncC3)ippiNorm_L1_8s_C3CMR :
                    type == CV_16UC3 ? (ippiMaskNormFuncC3)ippiNorm_L1_16u_C3CMR :
                    type == CV_32FC3 ? (ippiMaskNormFuncC3)ippiNorm_L1_32f_C3CMR :
                    0) :
                normType == NORM_L2 || normType == NORM_L2SQR ?
                    (type == CV_8UC3 ? (ippiMaskNormFuncC3)ippiNorm_L2_8u_C3CMR :
                    type == CV_8SC3 ? (ippiMaskNormFuncC3)ippiNorm_L2_8s_C3CMR :
                    type == CV_16UC3 ? (ippiMaskNormFuncC3)ippiNorm_L2_16u_C3CMR :
                    type == CV_32FC3 ? (ippiMaskNormFuncC3)ippiNorm_L2_32f_C3CMR :
                    0) : 0;
                if( ippFuncC3 )
                {
                    Ipp64f norm1, norm2, norm3;
                    if( ippFuncC3(src.data, (int)src.step[0], mask.data, (int)mask.step[0], sz, 1, &norm1) >= 0 &&
                        ippFuncC3(src.data, (int)src.step[0], mask.data, (int)mask.step[0], sz, 2, &norm2) >= 0 &&
                        ippFuncC3(src.data, (int)src.step[0], mask.data, (int)mask.step[0], sz, 3, &norm3) >= 0)
                    {
                        Ipp64f norm =
                            normType == NORM_INF ? std::max(std::max(norm1, norm2), norm3) :
                            normType == NORM_L1 ? norm1 + norm2 + norm3 :
                            normType == NORM_L2 || normType == NORM_L2SQR ? std::sqrt(norm1 * norm1 + norm2 * norm2 + norm3 * norm3) :
                            0;
                        CV_IMPL_ADD(CV_IMPL_IPP);
                        return normType == NORM_L2SQR ? (double)(norm * norm) : (double)norm;
                    }
                    setIppErrorStatus();
                }*/
            }
            else
            {
                // Unmasked path.  IPP offers two calling conventions; for each
                // type exactly one of the "hint" / "no-hint" tables may match.
                typedef IppStatus (CV_STDCALL* ippiNormFuncHint)(const void *, int, IppiSize, Ipp64f *, IppHintAlgorithm hint);
                typedef IppStatus (CV_STDCALL* ippiNormFuncNoHint)(const void *, int, IppiSize, Ipp64f *);
                ippiNormFuncHint ippFuncHint =
                    normType == NORM_L1 ?
                    (type == CV_32FC1 ? (ippiNormFuncHint)ippiNorm_L1_32f_C1R :
                    type == CV_32FC3 ? (ippiNormFuncHint)ippiNorm_L1_32f_C3R :
                    type == CV_32FC4 ? (ippiNormFuncHint)ippiNorm_L1_32f_C4R :
                    0) :
                    normType == NORM_L2 || normType == NORM_L2SQR ?
                    (type == CV_32FC1 ? (ippiNormFuncHint)ippiNorm_L2_32f_C1R :
                    type == CV_32FC3 ? (ippiNormFuncHint)ippiNorm_L2_32f_C3R :
                    type == CV_32FC4 ? (ippiNormFuncHint)ippiNorm_L2_32f_C4R :
                    0) : 0;
                ippiNormFuncNoHint ippFuncNoHint =
                    normType == NORM_INF ?
                    (type == CV_8UC1 ? (ippiNormFuncNoHint)ippiNorm_Inf_8u_C1R :
                    type == CV_8UC3 ? (ippiNormFuncNoHint)ippiNorm_Inf_8u_C3R :
                    type == CV_8UC4 ? (ippiNormFuncNoHint)ippiNorm_Inf_8u_C4R :
                    type == CV_16UC1 ? (ippiNormFuncNoHint)ippiNorm_Inf_16u_C1R :
                    type == CV_16UC3 ? (ippiNormFuncNoHint)ippiNorm_Inf_16u_C3R :
                    type == CV_16UC4 ? (ippiNormFuncNoHint)ippiNorm_Inf_16u_C4R :
                    type == CV_16SC1 ? (ippiNormFuncNoHint)ippiNorm_Inf_16s_C1R :
#if (IPP_VERSION_X100 >= 801)
                    type == CV_16SC3 ? (ippiNormFuncNoHint)ippiNorm_Inf_16s_C3R : //Aug 2013: problem in IPP 7.1, 8.0 : -32768
                    type == CV_16SC4 ? (ippiNormFuncNoHint)ippiNorm_Inf_16s_C4R : //Aug 2013: problem in IPP 7.1, 8.0 : -32768
#endif
                    type == CV_32FC1 ? (ippiNormFuncNoHint)ippiNorm_Inf_32f_C1R :
                    type == CV_32FC3 ? (ippiNormFuncNoHint)ippiNorm_Inf_32f_C3R :
                    type == CV_32FC4 ? (ippiNormFuncNoHint)ippiNorm_Inf_32f_C4R :
                    0) :
                    normType == NORM_L1 ?
                    (type == CV_8UC1 ? (ippiNormFuncNoHint)ippiNorm_L1_8u_C1R :
                    type == CV_8UC3 ? (ippiNormFuncNoHint)ippiNorm_L1_8u_C3R :
                    type == CV_8UC4 ? (ippiNormFuncNoHint)ippiNorm_L1_8u_C4R :
                    type == CV_16UC1 ? (ippiNormFuncNoHint)ippiNorm_L1_16u_C1R :
                    type == CV_16UC3 ? (ippiNormFuncNoHint)ippiNorm_L1_16u_C3R :
                    type == CV_16UC4 ? (ippiNormFuncNoHint)ippiNorm_L1_16u_C4R :
                    type == CV_16SC1 ? (ippiNormFuncNoHint)ippiNorm_L1_16s_C1R :
                    type == CV_16SC3 ? (ippiNormFuncNoHint)ippiNorm_L1_16s_C3R :
                    type == CV_16SC4 ? (ippiNormFuncNoHint)ippiNorm_L1_16s_C4R :
                    0) :
                    normType == NORM_L2 || normType == NORM_L2SQR ?
                    (type == CV_8UC1 ? (ippiNormFuncNoHint)ippiNorm_L2_8u_C1R :
                    type == CV_8UC3 ? (ippiNormFuncNoHint)ippiNorm_L2_8u_C3R :
                    type == CV_8UC4 ? (ippiNormFuncNoHint)ippiNorm_L2_8u_C4R :
                    type == CV_16UC1 ? (ippiNormFuncNoHint)ippiNorm_L2_16u_C1R :
                    type == CV_16UC3 ? (ippiNormFuncNoHint)ippiNorm_L2_16u_C3R :
                    type == CV_16UC4 ? (ippiNormFuncNoHint)ippiNorm_L2_16u_C4R :
                    type == CV_16SC1 ? (ippiNormFuncNoHint)ippiNorm_L2_16s_C1R :
                    type == CV_16SC3 ? (ippiNormFuncNoHint)ippiNorm_L2_16s_C3R :
                    type == CV_16SC4 ? (ippiNormFuncNoHint)ippiNorm_L2_16s_C4R :
                    0) : 0;
                // Make sure only zero or one version of the function pointer is valid
                CV_Assert(!ippFuncHint || !ippFuncNoHint);
                if( ippFuncHint || ippFuncNoHint )
                {
                    Ipp64f norm_array[4];
                    IppStatus ret = ippFuncHint ? ippFuncHint(src.ptr(), (int)src.step[0], sz, norm_array, ippAlgHintAccurate) :
                                    ippFuncNoHint(src.ptr(), (int)src.step[0], sz, norm_array);
                    if( ret >= 0 )
                    {
                        // Combine per-channel IPP results: max for INF, sum for
                        // L1, sum of squares for L2/L2SQR (IPP returns per-channel
                        // L2 norms, hence the squaring here).
                        Ipp64f norm = (normType == NORM_L2 || normType == NORM_L2SQR) ? norm_array[0] * norm_array[0] : norm_array[0];
                        for( int i = 1; i < cn; i++ )
                        {
                            norm =
                                normType == NORM_INF ? std::max(norm, norm_array[i]) :
                                normType == NORM_L1 ? norm + norm_array[i] :
                                normType == NORM_L2 || normType == NORM_L2SQR ? norm + norm_array[i] * norm_array[i] :
                                0;
                        }
                        CV_IMPL_ADD(CV_IMPL_IPP);
                        // For L2 the accumulated value is a sum of squares, so
                        // take the root; L2SQR returns it as-is.
                        return normType == NORM_L2 ? (double)std::sqrt(norm) : (double)norm;
                    }
                    setIppErrorStatus();
                }
            }
        }
    }
#endif

    // Fast paths for continuous, unmasked data that fits into a single
    // int-sized span: call the flat kernels directly, without the iterator.
    if( src.isContinuous() && mask.empty() )
    {
        size_t len = src.total()*cn;
        if( len == (size_t)(int)len )
        {
            if( depth == CV_32F )
            {
                const float* data = src.ptr<float>();

                if( normType == NORM_L2 )
                {
                    double result = 0;
                    GET_OPTIMIZED(normL2_32f)(data, 0, &result, (int)len, 1);
                    return std::sqrt(result);
                }
                if( normType == NORM_L2SQR )
                {
                    double result = 0;
                    GET_OPTIMIZED(normL2_32f)(data, 0, &result, (int)len, 1);
                    return result;
                }
                if( normType == NORM_L1 )
                {
                    double result = 0;
                    GET_OPTIMIZED(normL1_32f)(data, 0, &result, (int)len, 1);
                    return result;
                }
                if( normType == NORM_INF )
                {
                    float result = 0;
                    GET_OPTIMIZED(normInf_32f)(data, 0, &result, (int)len, 1);
                    return result;
                }
            }
            if( depth == CV_8U )
            {
                const uchar* data = src.ptr<uchar>();

                if( normType == NORM_HAMMING )
                {
                    return hal::normHamming(data, (int)len);
                }

                if( normType == NORM_HAMMING2 )
                {
                    // Cell size 2: pairs of bits are compared as units.
                    return hal::normHamming(data, (int)len, 2);
                }
            }
        }
    }

    CV_Assert( mask.empty() || mask.type() == CV_8U );

    if( normType == NORM_HAMMING || normType == NORM_HAMMING2 )
    {
        if( !mask.empty() )
        {
            // The mask is applied bitwise: zeroed-out pixels contribute no set
            // bits, then the unmasked Hamming norm is taken of the result.
            Mat temp;
            bitwise_and(src, mask, temp);
            return norm(temp, normType);
        }
        int cellSize = normType == NORM_HAMMING ? 1 : 2;

        const Mat* arrays[] = {&src, 0};
        uchar* ptrs[1];
        NAryMatIterator it(arrays, ptrs);
        int total = (int)it.size;
        int result = 0;

        // Accumulate the bit count plane by plane.
        for( size_t i = 0; i < it.nplanes; i++, ++it )
        {
            result += hal::normHamming(ptrs[0], total, cellSize);
        }

        return result;
    }

    // Generic scalar path.  normType >> 1 compacts NORM_INF/L1/L2(SQR) to
    // the 0/1/2 row index expected by getNormFunc.
    NormFunc func = getNormFunc(normType >> 1, depth);
    CV_Assert( func != 0 );

    const Mat* arrays[] = {&src, &mask, 0};
    uchar* ptrs[2];
    // The kernels accumulate in int, float or double depending on depth;
    // the union lets one storage slot be reinterpreted accordingly.
    union
    {
        double d;
        int i;
        float f;
    }
    result;
    result.d = 0;
    NAryMatIterator it(arrays, ptrs);
    int j, total = (int)it.size, blockSize = total, intSumBlockSize = 0, count = 0;
    // For small integer types the L1/L2 kernels accumulate into an int; the
    // data is processed in blocks small enough that the int cannot overflow,
    // and the partial sum is flushed into the double accumulator per block.
    bool blockSum = (normType == NORM_L1 && depth <= CV_16S) ||
            ((normType == NORM_L2 || normType == NORM_L2SQR) && depth <= CV_8S);
    int isum = 0;
    int *ibuf = &result.i;
    size_t esz = 0;

    if( blockSum )
    {
        intSumBlockSize = (normType == NORM_L1 && depth <= CV_8S ? (1 << 23) : (1 << 15))/cn;
        blockSize = std::min(blockSize, intSumBlockSize);
        ibuf = &isum;
        esz = src.elemSize();
    }

    for( size_t i = 0; i < it.nplanes; i++, ++it )
    {
        for( j = 0; j < total; j += blockSize )
        {
            int bsz = std::min(total - j, blockSize);
            func( ptrs[0], ptrs[1], (uchar*)ibuf, bsz, cn );
            count += bsz;
            // Flush the int partial sum before it could overflow, and at the
            // very end of the data.
            if( blockSum && (count + blockSize >= intSumBlockSize || (i+1 >= it.nplanes && j+bsz >= total)) )
            {
                result.d += isum;
                isum = 0;
                count = 0;
            }
            ptrs[0] += bsz*esz;
            if( ptrs[1] )
                ptrs[1] += bsz;
        }
    }

    // NORM_INF kernels wrote their result in the depth's native accumulator
    // type; widen it to double through the union.
    if( normType == NORM_INF )
    {
        if( depth == CV_64F )
            ;
        else if( depth == CV_32F )
            result.d = result.f;
        else
            result.d = result.i;
    }
    else if( normType == NORM_L2 )
        result.d = std::sqrt(result.d);

    return result.d;
}
   2938 
   2939 #ifdef HAVE_OPENCL
   2940 
   2941 namespace cv {
   2942 
// OpenCL implementation of norm(src1, src2, normType[, mask]).
// Accumulates per-channel reductions into 'result'; the caller is expected
// to pass result initialized to 0 (see CV_OCL_RUN_ in cv::norm below).
// Returns false when the OpenCL path cannot or should not be used, so the
// caller falls through to the CPU implementation.
static bool ocl_norm( InputArray _src1, InputArray _src2, int normType, InputArray _mask, double & result )
{
#ifdef ANDROID
    // NOTE(review): OpenCL path disabled on Android + NVidia devices —
    // presumably a driver workaround; confirm before removing.
    if (ocl::Device::getDefault().isNVidia())
        return false;
#endif

    Scalar sc1, sc2;
    int type = _src1.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
    // Strip the NORM_RELATIVE flag; when set, the denominator norm of _src2
    // is computed alongside (sc2 / s2) and applied at the end.
    bool relative = (normType & NORM_RELATIVE) != 0;
    normType &= ~NORM_RELATIVE;
    bool normsum = normType == NORM_L1 || normType == NORM_L2 || normType == NORM_L2SQR;

    if (normsum)
    {
        // L1/L2/L2SQR reduce to a (squared) sum of |src1 - src2|.
        if (!ocl_sum(_src1, sc1, normType == NORM_L2 || normType == NORM_L2SQR ?
                     OCL_OP_SUM_SQR : OCL_OP_SUM, _mask, _src2, relative, sc2))
            return false;
    }
    else
    {
        // NORM_INF reduces to the max of |src1 - src2|; ocl_minMaxIdx writes
        // a single value, so only one "channel" is accumulated below.
        if (!ocl_minMaxIdx(_src1, NULL, &sc1[0], NULL, NULL, _mask, std::max(CV_32S, depth),
                           false, _src2, relative ? &sc2[0] : NULL))
            return false;
        cn = 1;
    }

    // Fold the per-channel partial results into scalar totals.
    double s2 = 0;
    for (int i = 0; i < cn; ++i)
    {
        result += sc1[i];
        if (relative)
            s2 += sc2[i];
    }

    if (normType == NORM_L2)
    {
        result = std::sqrt(result);
        if (relative)
            s2 = std::sqrt(s2);
    }

    // NORM_RELATIVE: ||src1 - src2|| / ||src2||, with epsilon to avoid /0.
    if (relative)
        result /= (s2 + DBL_EPSILON);

    return true;
}
   2990 
   2991 }
   2992 
   2993 #endif
   2994 
/* Norm of the difference (or relative difference norm) of two arrays of
   identical size and type: ||src1 - src2|| under NORM_INF/L1/L2/L2SQR/
   HAMMING/HAMMING2, optionally restricted by an 8-bit mask.

   Dispatch order:
     1. OpenCL (HAVE_OPENCL) when the inputs are UMats;
     2. NORM_RELATIVE: try IPP relative-norm primitives, else recurse as
        norm(src1, src2) / (norm(src2) + eps);
     3. IPP NormDiff primitives (HAVE_IPP >= 7) for common 2D/continuous
        layouts;
     4. a direct fast path for continuous, unmasked CV_32F data;
     5. Hamming norms via hal::normHamming on XORed planes;
     6. the generic NormDiffFunc-based blockwise reduction. */
double cv::norm( InputArray _src1, InputArray _src2, int normType, InputArray _mask )
{
    CV_Assert( _src1.sameSize(_src2) && _src1.type() == _src2.type() );

#ifdef HAVE_OPENCL
    // ocl_norm accumulates into _result, so it must start at 0.
    double _result = 0;
    CV_OCL_RUN_(OCL_PERFORMANCE_CHECK(_src1.isUMat()),
                ocl_norm(_src1, _src2, normType, _mask, _result),
                _result)
#endif

    if( normType & CV_RELATIVE )
    {
#if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
        CV_IPP_CHECK()
        {
            Mat src1 = _src1.getMat(), src2 = _src2.getMat(), mask = _mask.getMat();

            normType &= NORM_TYPE_MASK;
            CV_Assert( normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2 || normType == NORM_L2SQR ||
                    ((normType == NORM_HAMMING || normType == NORM_HAMMING2) && src1.type() == CV_8U) );
            // IPP works on 2D images; collapse an n-D continuous array into
            // rows x cols only when the element count factors exactly.
            size_t total_size = src1.total();
            int rows = src1.size[0], cols = rows ? (int)(total_size/rows) : 0;
            if( (src1.dims == 2 || (src1.isContinuous() && src2.isContinuous() && mask.isContinuous()))
                && cols > 0 && (size_t)rows*cols == total_size
                && (normType == NORM_INF || normType == NORM_L1 ||
                    normType == NORM_L2 || normType == NORM_L2SQR) )
            {
                IppiSize sz = { cols, rows };
                int type = src1.type();
                if( !mask.empty() )
                {
                    // Masked single-channel relative-norm primitives.
                    typedef IppStatus (CV_STDCALL* ippiMaskNormRelFuncC1)(const void *, int, const void *, int, const void *, int, IppiSize, Ipp64f *);
                    ippiMaskNormRelFuncC1 ippFuncC1 =
                        normType == NORM_INF ?
                        (type == CV_8UC1 ? (ippiMaskNormRelFuncC1)ippiNormRel_Inf_8u_C1MR :
#ifndef __APPLE__
                        type == CV_8SC1 ? (ippiMaskNormRelFuncC1)ippiNormRel_Inf_8s_C1MR :
#endif
                        type == CV_16UC1 ? (ippiMaskNormRelFuncC1)ippiNormRel_Inf_16u_C1MR :
                        type == CV_32FC1 ? (ippiMaskNormRelFuncC1)ippiNormRel_Inf_32f_C1MR :
                        0) :
                        normType == NORM_L1 ?
                        (type == CV_8UC1 ? (ippiMaskNormRelFuncC1)ippiNormRel_L1_8u_C1MR :
#ifndef __APPLE__
                        type == CV_8SC1 ? (ippiMaskNormRelFuncC1)ippiNormRel_L1_8s_C1MR :
#endif
                        type == CV_16UC1 ? (ippiMaskNormRelFuncC1)ippiNormRel_L1_16u_C1MR :
                        type == CV_32FC1 ? (ippiMaskNormRelFuncC1)ippiNormRel_L1_32f_C1MR :
                        0) :
                        normType == NORM_L2 || normType == NORM_L2SQR ?
                        (type == CV_8UC1 ? (ippiMaskNormRelFuncC1)ippiNormRel_L2_8u_C1MR :
                        type == CV_8SC1 ? (ippiMaskNormRelFuncC1)ippiNormRel_L2_8s_C1MR :
                        type == CV_16UC1 ? (ippiMaskNormRelFuncC1)ippiNormRel_L2_16u_C1MR :
                        type == CV_32FC1 ? (ippiMaskNormRelFuncC1)ippiNormRel_L2_32f_C1MR :
                        0) : 0;
                    if( ippFuncC1 )
                    {
                        Ipp64f norm;
                        if( ippFuncC1(src1.ptr(), (int)src1.step[0], src2.ptr(), (int)src2.step[0], mask.ptr(), (int)mask.step[0], sz, &norm) >= 0 )
                        {
                            CV_IMPL_ADD(CV_IMPL_IPP);
                            // IPP computes L2; square it for L2SQR.
                            return normType == NORM_L2SQR ? (double)(norm * norm) : (double)norm;
                        }
                        setIppErrorStatus();
                    }
                }
                else
                {
                    // Unmasked relative-norm primitives; 32f variants take an
                    // accuracy hint, the rest do not.
                    typedef IppStatus (CV_STDCALL* ippiNormRelFuncNoHint)(const void *, int, const void *, int, IppiSize, Ipp64f *);
                    typedef IppStatus (CV_STDCALL* ippiNormRelFuncHint)(const void *, int, const void *, int, IppiSize, Ipp64f *, IppHintAlgorithm hint);
                    ippiNormRelFuncNoHint ippFuncNoHint =
                        normType == NORM_INF ?
                        (type == CV_8UC1 ? (ippiNormRelFuncNoHint)ippiNormRel_Inf_8u_C1R :
                        type == CV_16UC1 ? (ippiNormRelFuncNoHint)ippiNormRel_Inf_16u_C1R :
                        type == CV_16SC1 ? (ippiNormRelFuncNoHint)ippiNormRel_Inf_16s_C1R :
                        type == CV_32FC1 ? (ippiNormRelFuncNoHint)ippiNormRel_Inf_32f_C1R :
                        0) :
                        normType == NORM_L1 ?
                        (type == CV_8UC1 ? (ippiNormRelFuncNoHint)ippiNormRel_L1_8u_C1R :
                        type == CV_16UC1 ? (ippiNormRelFuncNoHint)ippiNormRel_L1_16u_C1R :
                        type == CV_16SC1 ? (ippiNormRelFuncNoHint)ippiNormRel_L1_16s_C1R :
                        0) :
                        normType == NORM_L2 || normType == NORM_L2SQR ?
                        (type == CV_8UC1 ? (ippiNormRelFuncNoHint)ippiNormRel_L2_8u_C1R :
                        type == CV_16UC1 ? (ippiNormRelFuncNoHint)ippiNormRel_L2_16u_C1R :
                        type == CV_16SC1 ? (ippiNormRelFuncNoHint)ippiNormRel_L2_16s_C1R :
                        0) : 0;
                    ippiNormRelFuncHint ippFuncHint =
                        normType == NORM_L1 ?
                        (type == CV_32FC1 ? (ippiNormRelFuncHint)ippiNormRel_L1_32f_C1R :
                        0) :
                        normType == NORM_L2 || normType == NORM_L2SQR ?
                        (type == CV_32FC1 ? (ippiNormRelFuncHint)ippiNormRel_L2_32f_C1R :
                        0) : 0;
                    if (ippFuncNoHint)
                    {
                        Ipp64f norm;
                        if( ippFuncNoHint(src1.ptr(), (int)src1.step[0], src2.ptr(), (int)src2.step[0], sz, &norm) >= 0 )
                        {
                            CV_IMPL_ADD(CV_IMPL_IPP);
                            return (double)norm;
                        }
                        setIppErrorStatus();
                    }
                    if (ippFuncHint)
                    {
                        Ipp64f norm;
                        if( ippFuncHint(src1.ptr(), (int)src1.step[0], src2.ptr(), (int)src2.step[0], sz, &norm, ippAlgHintAccurate) >= 0 )
                        {
                            CV_IMPL_ADD(CV_IMPL_IPP);
                            return (double)norm;
                        }
                        setIppErrorStatus();
                    }
                }
            }
        }
#endif
        // Generic relative norm: ||src1-src2|| / (||src2|| + eps).
        return norm(_src1, _src2, normType & ~CV_RELATIVE, _mask)/(norm(_src2, normType, _mask) + DBL_EPSILON);
    }

    Mat src1 = _src1.getMat(), src2 = _src2.getMat(), mask = _mask.getMat();
    int depth = src1.depth(), cn = src1.channels();

    // Keep only the base norm-type bits (NORM_INF..NORM_HAMMING2 live in the
    // low 3 bits; flags such as NORM_RELATIVE were handled above).
    normType &= 7;
    CV_Assert( normType == NORM_INF || normType == NORM_L1 ||
               normType == NORM_L2 || normType == NORM_L2SQR ||
              ((normType == NORM_HAMMING || normType == NORM_HAMMING2) && src1.type() == CV_8U) );

#if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
    CV_IPP_CHECK()
    {
        // Same 2D-collapse precondition as the relative branch above.
        size_t total_size = src1.total();
        int rows = src1.size[0], cols = rows ? (int)(total_size/rows) : 0;
        if( (src1.dims == 2 || (src1.isContinuous() && src2.isContinuous() && mask.isContinuous()))
            && cols > 0 && (size_t)rows*cols == total_size
            && (normType == NORM_INF || normType == NORM_L1 ||
                normType == NORM_L2 || normType == NORM_L2SQR) )
        {
            IppiSize sz = { cols, rows };
            int type = src1.type();
            if( !mask.empty() )
            {
                // Masked single-channel difference-norm primitives.
                typedef IppStatus (CV_STDCALL* ippiMaskNormDiffFuncC1)(const void *, int, const void *, int, const void *, int, IppiSize, Ipp64f *);
                ippiMaskNormDiffFuncC1 ippFuncC1 =
                    normType == NORM_INF ?
                    (type == CV_8UC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_Inf_8u_C1MR :
                    type == CV_8SC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_Inf_8s_C1MR :
                    type == CV_16UC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_Inf_16u_C1MR :
                    type == CV_32FC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_Inf_32f_C1MR :
                    0) :
                    normType == NORM_L1 ?
                    (type == CV_8UC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_L1_8u_C1MR :
#ifndef __APPLE__
                    type == CV_8SC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_L1_8s_C1MR :
#endif
                    type == CV_16UC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_L1_16u_C1MR :
                    type == CV_32FC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_L1_32f_C1MR :
                    0) :
                    normType == NORM_L2 || normType == NORM_L2SQR ?
                    (type == CV_8UC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_L2_8u_C1MR :
                    type == CV_8SC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_L2_8s_C1MR :
                    type == CV_16UC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_L2_16u_C1MR :
                    type == CV_32FC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_L2_32f_C1MR :
                    0) : 0;
                if( ippFuncC1 )
                {
                    Ipp64f norm;
                    if( ippFuncC1(src1.ptr(), (int)src1.step[0], src2.ptr(), (int)src2.step[0], mask.ptr(), (int)mask.step[0], sz, &norm) >= 0 )
                    {
                        CV_IMPL_ADD(CV_IMPL_IPP);
                        return normType == NORM_L2SQR ? (double)(norm * norm) : (double)norm;
                    }
                    setIppErrorStatus();
                }
#ifndef __APPLE__
                // Masked 3-channel primitives: one call per channel (the last
                // int argument selects the channel, 1-based), then the three
                // per-channel norms are combined below.
                typedef IppStatus (CV_STDCALL* ippiMaskNormDiffFuncC3)(const void *, int, const void *, int, const void *, int, IppiSize, int, Ipp64f *);
                ippiMaskNormDiffFuncC3 ippFuncC3 =
                    normType == NORM_INF ?
                    (type == CV_8UC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_Inf_8u_C3CMR :
                    type == CV_8SC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_Inf_8s_C3CMR :
                    type == CV_16UC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_Inf_16u_C3CMR :
                    type == CV_32FC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_Inf_32f_C3CMR :
                    0) :
                    normType == NORM_L1 ?
                    (type == CV_8UC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_L1_8u_C3CMR :
                    type == CV_8SC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_L1_8s_C3CMR :
                    type == CV_16UC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_L1_16u_C3CMR :
                    type == CV_32FC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_L1_32f_C3CMR :
                    0) :
                    normType == NORM_L2 || normType == NORM_L2SQR ?
                    (type == CV_8UC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_L2_8u_C3CMR :
                    type == CV_8SC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_L2_8s_C3CMR :
                    type == CV_16UC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_L2_16u_C3CMR :
                    type == CV_32FC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_L2_32f_C3CMR :
                    0) : 0;
                if( ippFuncC3 )
                {
                    Ipp64f norm1, norm2, norm3;
                    if( ippFuncC3(src1.data, (int)src1.step[0], src2.data, (int)src2.step[0], mask.data, (int)mask.step[0], sz, 1, &norm1) >= 0 &&
                        ippFuncC3(src1.data, (int)src1.step[0], src2.data, (int)src2.step[0], mask.data, (int)mask.step[0], sz, 2, &norm2) >= 0 &&
                        ippFuncC3(src1.data, (int)src1.step[0], src2.data, (int)src2.step[0], mask.data, (int)mask.step[0], sz, 3, &norm3) >= 0)
                    {
                        Ipp64f norm =
                            normType == NORM_INF ? std::max(std::max(norm1, norm2), norm3) :
                            normType == NORM_L1 ? norm1 + norm2 + norm3 :
                            normType == NORM_L2 || normType == NORM_L2SQR ? std::sqrt(norm1 * norm1 + norm2 * norm2 + norm3 * norm3) :
                            0;
                        CV_IMPL_ADD(CV_IMPL_IPP);
                        return normType == NORM_L2SQR ? (double)(norm * norm) : (double)norm;
                    }
                    setIppErrorStatus();
                }
#endif
            }
            else
            {
                // Unmasked primitives; the hint / no-hint tables are mutually
                // exclusive by construction (32f types only in the hint table).
                typedef IppStatus (CV_STDCALL* ippiNormDiffFuncHint)(const void *, int, const void *, int, IppiSize, Ipp64f *, IppHintAlgorithm hint);
                typedef IppStatus (CV_STDCALL* ippiNormDiffFuncNoHint)(const void *, int, const void *, int, IppiSize, Ipp64f *);
                ippiNormDiffFuncHint ippFuncHint =
                    normType == NORM_L1 ?
                    (type == CV_32FC1 ? (ippiNormDiffFuncHint)ippiNormDiff_L1_32f_C1R :
                    type == CV_32FC3 ? (ippiNormDiffFuncHint)ippiNormDiff_L1_32f_C3R :
                    type == CV_32FC4 ? (ippiNormDiffFuncHint)ippiNormDiff_L1_32f_C4R :
                    0) :
                    normType == NORM_L2 || normType == NORM_L2SQR ?
                    (type == CV_32FC1 ? (ippiNormDiffFuncHint)ippiNormDiff_L2_32f_C1R :
                    type == CV_32FC3 ? (ippiNormDiffFuncHint)ippiNormDiff_L2_32f_C3R :
                    type == CV_32FC4 ? (ippiNormDiffFuncHint)ippiNormDiff_L2_32f_C4R :
                    0) : 0;
                ippiNormDiffFuncNoHint ippFuncNoHint =
                    normType == NORM_INF ?
                    (type == CV_8UC1 ? (ippiNormDiffFuncNoHint)ippiNormDiff_Inf_8u_C1R :
                    type == CV_8UC3 ? (ippiNormDiffFuncNoHint)ippiNormDiff_Inf_8u_C3R :
                    type == CV_8UC4 ? (ippiNormDiffFuncNoHint)ippiNormDiff_Inf_8u_C4R :
                    type == CV_16UC1 ? (ippiNormDiffFuncNoHint)ippiNormDiff_Inf_16u_C1R :
                    type == CV_16UC3 ? (ippiNormDiffFuncNoHint)ippiNormDiff_Inf_16u_C3R :
                    type == CV_16UC4 ? (ippiNormDiffFuncNoHint)ippiNormDiff_Inf_16u_C4R :
                    type == CV_16SC1 ? (ippiNormDiffFuncNoHint)ippiNormDiff_Inf_16s_C1R :
#if (IPP_VERSION_X100 >= 801)
                    type == CV_16SC3 ? (ippiNormDiffFuncNoHint)ippiNormDiff_Inf_16s_C3R : //Aug 2013: problem in IPP 7.1, 8.0 : -32768
                    type == CV_16SC4 ? (ippiNormDiffFuncNoHint)ippiNormDiff_Inf_16s_C4R : //Aug 2013: problem in IPP 7.1, 8.0 : -32768
#endif
                    type == CV_32FC1 ? (ippiNormDiffFuncNoHint)ippiNormDiff_Inf_32f_C1R :
                    type == CV_32FC3 ? (ippiNormDiffFuncNoHint)ippiNormDiff_Inf_32f_C3R :
                    type == CV_32FC4 ? (ippiNormDiffFuncNoHint)ippiNormDiff_Inf_32f_C4R :
                    0) :
                    normType == NORM_L1 ?
                    (type == CV_8UC1 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L1_8u_C1R :
                    type == CV_8UC3 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L1_8u_C3R :
                    type == CV_8UC4 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L1_8u_C4R :
                    type == CV_16UC1 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L1_16u_C1R :
                    type == CV_16UC3 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L1_16u_C3R :
                    type == CV_16UC4 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L1_16u_C4R :
#if !(IPP_VERSION_X100 == 802 && (!defined(IPP_VERSION_UPDATE) || IPP_VERSION_UPDATE <= 1)) // Oct 2014: Accuracy issue with IPP 8.2 / 8.2.1
                    type == CV_16SC1 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L1_16s_C1R :
#endif
                    type == CV_16SC3 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L1_16s_C3R :
                    type == CV_16SC4 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L1_16s_C4R :
                    0) :
                    normType == NORM_L2 || normType == NORM_L2SQR ?
                    (type == CV_8UC1 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L2_8u_C1R :
                    type == CV_8UC3 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L2_8u_C3R :
                    type == CV_8UC4 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L2_8u_C4R :
                    type == CV_16UC1 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L2_16u_C1R :
                    type == CV_16UC3 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L2_16u_C3R :
                    type == CV_16UC4 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L2_16u_C4R :
                    type == CV_16SC1 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L2_16s_C1R :
                    type == CV_16SC3 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L2_16s_C3R :
                    type == CV_16SC4 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L2_16s_C4R :
                    0) : 0;
                // Make sure only zero or one version of the function pointer is valid
                CV_Assert(!ippFuncHint || !ippFuncNoHint);
                if( ippFuncHint || ippFuncNoHint )
                {
                    Ipp64f norm_array[4];
                    IppStatus ret = ippFuncHint ? ippFuncHint(src1.ptr(), (int)src1.step[0], src2.ptr(), (int)src2.step[0], sz, norm_array, ippAlgHintAccurate) :
                                    ippFuncNoHint(src1.ptr(), (int)src1.step[0], src2.ptr(), (int)src2.step[0], sz, norm_array);
                    if( ret >= 0 )
                    {
                        // Combine the per-channel norms; for L2/L2SQR the
                        // accumulator holds squared values until the end.
                        Ipp64f norm = (normType == NORM_L2 || normType == NORM_L2SQR) ? norm_array[0] * norm_array[0] : norm_array[0];
                        for( int i = 1; i < src1.channels(); i++ )
                        {
                            norm =
                                normType == NORM_INF ? std::max(norm, norm_array[i]) :
                                normType == NORM_L1 ? norm + norm_array[i] :
                                normType == NORM_L2 || normType == NORM_L2SQR ? norm + norm_array[i] * norm_array[i] :
                                0;
                        }
                        CV_IMPL_ADD(CV_IMPL_IPP);
                        return normType == NORM_L2 ? (double)std::sqrt(norm) : (double)norm;
                    }
                    setIppErrorStatus();
                }
            }
        }
    }
#endif

    // Fast path: continuous, unmasked CV_32F data handled by the optimized
    // flat normDiff* kernels (element count must fit in an int).
    if( src1.isContinuous() && src2.isContinuous() && mask.empty() )
    {
        size_t len = src1.total()*src1.channels();
        if( len == (size_t)(int)len )
        {
            if( src1.depth() == CV_32F )
            {
                const float* data1 = src1.ptr<float>();
                const float* data2 = src2.ptr<float>();

                if( normType == NORM_L2 )
                {
                    double result = 0;
                    GET_OPTIMIZED(normDiffL2_32f)(data1, data2, 0, &result, (int)len, 1);
                    return std::sqrt(result);
                }
                if( normType == NORM_L2SQR )
                {
                    double result = 0;
                    GET_OPTIMIZED(normDiffL2_32f)(data1, data2, 0, &result, (int)len, 1);
                    return result;
                }
                if( normType == NORM_L1 )
                {
                    double result = 0;
                    GET_OPTIMIZED(normDiffL1_32f)(data1, data2, 0, &result, (int)len, 1);
                    return result;
                }
                if( normType == NORM_INF )
                {
                    float result = 0;
                    GET_OPTIMIZED(normDiffInf_32f)(data1, data2, 0, &result, (int)len, 1);
                    return result;
                }
            }
        }
    }

    CV_Assert( mask.empty() || mask.type() == CV_8U );

    if( normType == NORM_HAMMING || normType == NORM_HAMMING2 )
    {
        if( !mask.empty() )
        {
            // Masked Hamming: xor the inputs, zero out masked-off bytes, then
            // delegate to the single-array Hamming norm.
            Mat temp;
            bitwise_xor(src1, src2, temp);
            bitwise_and(temp, mask, temp);
            return norm(temp, normType);
        }
        // cellSize 2 = NORM_HAMMING2 (bit pairs counted as single units).
        int cellSize = normType == NORM_HAMMING ? 1 : 2;

        const Mat* arrays[] = {&src1, &src2, 0};
        uchar* ptrs[2];
        NAryMatIterator it(arrays, ptrs);
        int total = (int)it.size;
        int result = 0;

        for( size_t i = 0; i < it.nplanes; i++, ++it )
        {
            result += hal::normHamming(ptrs[0], ptrs[1], total, cellSize);
        }

        return result;
    }

    // Generic path: per-type kernel accumulates into a union whose active
    // member depends on depth/norm type (int/unsigned for integer inputs,
    // float for 32F inf-norm, double otherwise).
    NormDiffFunc func = getNormDiffFunc(normType >> 1, depth);
    CV_Assert( func != 0 );

    const Mat* arrays[] = {&src1, &src2, &mask, 0};
    uchar* ptrs[3];
    union
    {
        double d;
        float f;
        int i;
        unsigned u;
    }
    result;
    result.d = 0;
    NAryMatIterator it(arrays, ptrs);
    int j, total = (int)it.size, blockSize = total, intSumBlockSize = 0, count = 0;
    // For small integer depths, sum into an unsigned accumulator in blocks
    // small enough not to overflow, flushing into result.d periodically.
    bool blockSum = (normType == NORM_L1 && depth <= CV_16S) ||
            ((normType == NORM_L2 || normType == NORM_L2SQR) && depth <= CV_8S);
    unsigned isum = 0;
    unsigned *ibuf = &result.u;
    size_t esz = 0;

    if( blockSum )
    {
        intSumBlockSize = normType == NORM_L1 && depth <= CV_8S ? (1 << 23) : (1 << 15);
        blockSize = std::min(blockSize, intSumBlockSize);
        ibuf = &isum;
        esz = src1.elemSize();
    }

    for( size_t i = 0; i < it.nplanes; i++, ++it )
    {
        for( j = 0; j < total; j += blockSize )
        {
            int bsz = std::min(total - j, blockSize);
            func( ptrs[0], ptrs[1], ptrs[2], (uchar*)ibuf, bsz, cn );
            count += bsz;
            // Flush when the next block might overflow, or at the very end.
            if( blockSum && (count + blockSize >= intSumBlockSize || (i+1 >= it.nplanes && j+bsz >= total)) )
            {
                result.d += isum;
                isum = 0;
                count = 0;
            }
            ptrs[0] += bsz*esz;
            ptrs[1] += bsz*esz;
            if( ptrs[2] )
                ptrs[2] += bsz;
        }
    }

    // Convert the accumulator's active member to double.
    if( normType == NORM_INF )
    {
        if( depth == CV_64F )
            ;
        else if( depth == CV_32F )
            result.d = result.f;
        else
            result.d = result.u;
    }
    else if( normType == NORM_L2 )
        result.d = std::sqrt(result.d);

    return result.d;
}
   3424 
   3425 
   3426 ///////////////////////////////////// batch distance ///////////////////////////////////////
   3427 
   3428 namespace cv
   3429 {
   3430 
   3431 template<typename _Tp, typename _Rt>
   3432 void batchDistL1_(const _Tp* src1, const _Tp* src2, size_t step2,
   3433                   int nvecs, int len, _Rt* dist, const uchar* mask)
   3434 {
   3435     step2 /= sizeof(src2[0]);
   3436     if( !mask )
   3437     {
   3438         for( int i = 0; i < nvecs; i++ )
   3439             dist[i] = normL1<_Tp, _Rt>(src1, src2 + step2*i, len);
   3440     }
   3441     else
   3442     {
   3443         _Rt val0 = std::numeric_limits<_Rt>::max();
   3444         for( int i = 0; i < nvecs; i++ )
   3445             dist[i] = mask[i] ? normL1<_Tp, _Rt>(src1, src2 + step2*i, len) : val0;
   3446     }
   3447 }
   3448 
   3449 template<typename _Tp, typename _Rt>
   3450 void batchDistL2Sqr_(const _Tp* src1, const _Tp* src2, size_t step2,
   3451                      int nvecs, int len, _Rt* dist, const uchar* mask)
   3452 {
   3453     step2 /= sizeof(src2[0]);
   3454     if( !mask )
   3455     {
   3456         for( int i = 0; i < nvecs; i++ )
   3457             dist[i] = normL2Sqr<_Tp, _Rt>(src1, src2 + step2*i, len);
   3458     }
   3459     else
   3460     {
   3461         _Rt val0 = std::numeric_limits<_Rt>::max();
   3462         for( int i = 0; i < nvecs; i++ )
   3463             dist[i] = mask[i] ? normL2Sqr<_Tp, _Rt>(src1, src2 + step2*i, len) : val0;
   3464     }
   3465 }
   3466 
   3467 template<typename _Tp, typename _Rt>
   3468 void batchDistL2_(const _Tp* src1, const _Tp* src2, size_t step2,
   3469                   int nvecs, int len, _Rt* dist, const uchar* mask)
   3470 {
   3471     step2 /= sizeof(src2[0]);
   3472     if( !mask )
   3473     {
   3474         for( int i = 0; i < nvecs; i++ )
   3475             dist[i] = std::sqrt(normL2Sqr<_Tp, _Rt>(src1, src2 + step2*i, len));
   3476     }
   3477     else
   3478     {
   3479         _Rt val0 = std::numeric_limits<_Rt>::max();
   3480         for( int i = 0; i < nvecs; i++ )
   3481             dist[i] = mask[i] ? std::sqrt(normL2Sqr<_Tp, _Rt>(src1, src2 + step2*i, len)) : val0;
   3482     }
   3483 }
   3484 
   3485 static void batchDistHamming(const uchar* src1, const uchar* src2, size_t step2,
   3486                              int nvecs, int len, int* dist, const uchar* mask)
   3487 {
   3488     step2 /= sizeof(src2[0]);
   3489     if( !mask )
   3490     {
   3491         for( int i = 0; i < nvecs; i++ )
   3492              dist[i] = hal::normHamming(src1, src2 + step2*i, len);
   3493     }
   3494     else
   3495     {
   3496         int val0 = INT_MAX;
   3497         for( int i = 0; i < nvecs; i++ )
   3498         {
   3499             if (mask[i])
   3500                 dist[i] = hal::normHamming(src1, src2 + step2*i, len);
   3501             else
   3502                 dist[i] = val0;
   3503         }
   3504     }
   3505 }
   3506 
   3507 static void batchDistHamming2(const uchar* src1, const uchar* src2, size_t step2,
   3508                               int nvecs, int len, int* dist, const uchar* mask)
   3509 {
   3510     step2 /= sizeof(src2[0]);
   3511     if( !mask )
   3512     {
   3513         for( int i = 0; i < nvecs; i++ )
   3514             dist[i] = hal::normHamming(src1, src2 + step2*i, len, 2);
   3515     }
   3516     else
   3517     {
   3518         int val0 = INT_MAX;
   3519         for( int i = 0; i < nvecs; i++ )
   3520         {
   3521             if (mask[i])
   3522                 dist[i] = hal::normHamming(src1, src2 + step2*i, len, 2);
   3523             else
   3524                 dist[i] = val0;
   3525         }
   3526     }
   3527 }
   3528 
// Non-template thunk with the uniform BatchDistFunc signature:
// L1 norm, 8-bit input, 32-bit integer distances.
static void batchDistL1_8u32s(const uchar* src1, const uchar* src2, size_t step2,
                               int nvecs, int len, int* dist, const uchar* mask)
{
    batchDistL1_<uchar, int>(src1, src2, step2, nvecs, len, dist, mask);
}
   3534 
// Non-template thunk with the uniform BatchDistFunc signature:
// L1 norm, 8-bit input, float distances.
static void batchDistL1_8u32f(const uchar* src1, const uchar* src2, size_t step2,
                               int nvecs, int len, float* dist, const uchar* mask)
{
    batchDistL1_<uchar, float>(src1, src2, step2, nvecs, len, dist, mask);
}
   3540 
// Non-template thunk with the uniform BatchDistFunc signature:
// squared L2 norm, 8-bit input, 32-bit integer distances.
static void batchDistL2Sqr_8u32s(const uchar* src1, const uchar* src2, size_t step2,
                                  int nvecs, int len, int* dist, const uchar* mask)
{
    batchDistL2Sqr_<uchar, int>(src1, src2, step2, nvecs, len, dist, mask);
}
   3546 
// Non-template thunk with the uniform BatchDistFunc signature:
// squared L2 norm, 8-bit input, float distances.
static void batchDistL2Sqr_8u32f(const uchar* src1, const uchar* src2, size_t step2,
                                  int nvecs, int len, float* dist, const uchar* mask)
{
    batchDistL2Sqr_<uchar, float>(src1, src2, step2, nvecs, len, dist, mask);
}
   3552 
// Non-template thunk with the uniform BatchDistFunc signature:
// L2 norm, 8-bit input, float distances.
static void batchDistL2_8u32f(const uchar* src1, const uchar* src2, size_t step2,
                               int nvecs, int len, float* dist, const uchar* mask)
{
    batchDistL2_<uchar, float>(src1, src2, step2, nvecs, len, dist, mask);
}
   3558 
// Non-template thunk with the uniform BatchDistFunc signature:
// L1 norm, float input, float distances.
static void batchDistL1_32f(const float* src1, const float* src2, size_t step2,
                             int nvecs, int len, float* dist, const uchar* mask)
{
    batchDistL1_<float, float>(src1, src2, step2, nvecs, len, dist, mask);
}
   3564 
// Non-template thunk with the uniform BatchDistFunc signature:
// squared L2 norm, float input, float distances.
static void batchDistL2Sqr_32f(const float* src1, const float* src2, size_t step2,
                                int nvecs, int len, float* dist, const uchar* mask)
{
    batchDistL2Sqr_<float, float>(src1, src2, step2, nvecs, len, dist, mask);
}
   3570 
// Non-template thunk with the uniform BatchDistFunc signature:
// L2 norm, float input, float distances.
static void batchDistL2_32f(const float* src1, const float* src2, size_t step2,
                             int nvecs, int len, float* dist, const uchar* mask)
{
    batchDistL2_<float, float>(src1, src2, step2, nvecs, len, dist, mask);
}
   3576 
// Uniform signature for all batchDist* kernels above. The element and
// distance types are erased to uchar* so that cv::batchDistance can pick
// one of the thunks via a single function pointer.
typedef void (*BatchDistFunc)(const uchar* src1, const uchar* src2, size_t step2,
                              int nvecs, int len, uchar* dist, const uchar* mask);
   3579 
   3580 
// Parallel loop body for cv::batchDistance: each operator() call processes a
// contiguous range of src1 rows (query vectors), computing their distances to
// every row of src2 through the type-erased kernel `func`. When K > 0 it
// additionally maintains, per query row, the K smallest distances and the
// corresponding src2 row indices in `dist`/`nidx`.
struct BatchDistInvoker : public ParallelLoopBody
{
    BatchDistInvoker( const Mat& _src1, const Mat& _src2,
                      Mat& _dist, Mat& _nidx, int _K,
                      const Mat& _mask, int _update,
                      BatchDistFunc _func)
    {
        src1 = &_src1;
        src2 = &_src2;
        dist = &_dist;
        nidx = &_nidx;
        K = _K;
        mask = &_mask;
        update = _update;
        func = _func;
    }

    void operator()(const Range& range) const
    {
        // Scratch row of raw distances; only used in K-nearest mode, where
        // dist holds the running top-K instead of a full distance row.
        AutoBuffer<int> buf(src2->rows);
        int* bufptr = buf;

        for( int i = range.start; i < range.end; i++ )
        {
            // Without K the kernel writes straight into the output row;
            // an empty mask Mat means "no mask" (null pointer passed down).
            func(src1->ptr(i), src2->ptr(), src2->step, src2->rows, src2->cols,
                 K > 0 ? (uchar*)bufptr : dist->ptr(i), mask->data ? mask->ptr(i) : 0);

            if( K > 0 )
            {
                int* nidxptr = nidx->ptr<int>(i);
                // since positive float's can be compared just like int's,
                // we handle both CV_32S and CV_32F cases with a single branch
                int* distptr = (int*)dist->ptr(i);

                int j, k;

                // Insertion into the sorted (ascending) top-K lists.
                for( j = 0; j < src2->rows; j++ )
                {
                    int d = bufptr[j];
                    if( d < distptr[K-1] )
                    {
                        // Shift larger entries one slot right to make room.
                        for( k = K-2; k >= 0 && distptr[k] > d; k-- )
                        {
                            nidxptr[k+1] = nidxptr[k];
                            distptr[k+1] = distptr[k];
                        }
                        // `update` offsets the stored index when src2 is a
                        // chunk of a larger descriptor set.
                        nidxptr[k+1] = j + update;
                        distptr[k+1] = d;
                    }
                }
            }
        }
    }

    const Mat *src1;
    const Mat *src2;
    Mat *dist;
    Mat *nidx;
    const Mat *mask;
    int K;
    int update;
    BatchDistFunc func;
};
   3644 
   3645 }
   3646 
// Computes distances between every row (vector) of src1 and every row of
// src2 under the given norm. With K > 0 only the K nearest src2 rows are
// kept per src1 row (distances in _dist, row indices in _nidx); otherwise
// the full src1.rows x src2.rows distance matrix is written to _dist.
// `update` offsets stored indices (for chunked src2 processing); with
// crosscheck == true only mutual nearest neighbours are reported.
void cv::batchDistance( InputArray _src1, InputArray _src2,
                        OutputArray _dist, int dtype, OutputArray _nidx,
                        int normType, int K, InputArray _mask,
                        int update, bool crosscheck )
{
    Mat src1 = _src1.getMat(), src2 = _src2.getMat(), mask = _mask.getMat();
    int type = src1.type();
    CV_Assert( type == src2.type() && src1.cols == src2.cols &&
               (type == CV_32F || type == CV_8U));
    CV_Assert( _nidx.needed() == (K > 0) );

    // Default output depth: integer for Hamming norms, float otherwise.
    if( dtype == -1 )
    {
        dtype = normType == NORM_HAMMING || normType == NORM_HAMMING2 ? CV_32S : CV_32F;
    }
    CV_Assert( (type == CV_8U && dtype == CV_32S) || dtype == CV_32F);

    K = std::min(K, src2.rows);

    _dist.create(src1.rows, (K > 0 ? K : src2.rows), dtype);
    Mat dist = _dist.getMat(), nidx;
    if( _nidx.needed() )
    {
        _nidx.create(dist.size(), CV_32S);
        nidx = _nidx.getMat();
    }

    // Fresh K-NN search: seed distances with "infinity" sentinels and
    // indices with -1 so the first candidates always win. With update != 0
    // the existing contents are kept and merged into.
    if( update == 0 && K > 0 )
    {
        dist = Scalar::all(dtype == CV_32S ? (double)INT_MAX : (double)FLT_MAX);
        nidx = Scalar::all(-1);
    }

    if( crosscheck )
    {
        CV_Assert( K == 1 && update == 0 && mask.empty() );
        // Reverse search: nearest src1 row for every src2 row.
        Mat tdist, tidx;
        batchDistance(src2, src1, tdist, dtype, tidx, normType, K, mask, 0, false);

        // if an idx-th element from src1 appeared to be the nearest to i-th element of src2,
        // we update the minimum mutual distance between idx-th element of src1 and the whole src2 set.
        // As a result, if nidx[idx] = i*, it means that idx-th element of src1 is the nearest
        // to i*-th element of src2 and i*-th element of src2 is the closest to idx-th element of src1.
        // If nidx[idx] = -1, it means that there is no such ideal couple for it in src2.
        // This O(N) procedure is called cross-check and it helps to eliminate some false matches.
        // NOTE(review): this indexing assumes every tidx entry is a valid
        // src1 row index (>= 0); verify the degenerate src1 case before
        // relying on that here.
        if( dtype == CV_32S )
        {
            for( int i = 0; i < tdist.rows; i++ )
            {
                int idx = tidx.at<int>(i);
                int d = tdist.at<int>(i), d0 = dist.at<int>(idx);
                if( d < d0 )
                {
                    dist.at<int>(idx) = d;
                    nidx.at<int>(idx) = i + update;
                }
            }
        }
        else
        {
            for( int i = 0; i < tdist.rows; i++ )
            {
                int idx = tidx.at<int>(i);
                float d = tdist.at<float>(i), d0 = dist.at<float>(idx);
                if( d < d0 )
                {
                    dist.at<float>(idx) = d;
                    nidx.at<int>(idx) = i + update;
                }
            }
        }
        return;
    }

    // Select the type-erased kernel matching (element type, norm, out depth).
    BatchDistFunc func = 0;
    if( type == CV_8U )
    {
        if( normType == NORM_L1 && dtype == CV_32S )
            func = (BatchDistFunc)batchDistL1_8u32s;
        else if( normType == NORM_L1 && dtype == CV_32F )
            func = (BatchDistFunc)batchDistL1_8u32f;
        else if( normType == NORM_L2SQR && dtype == CV_32S )
            func = (BatchDistFunc)batchDistL2Sqr_8u32s;
        else if( normType == NORM_L2SQR && dtype == CV_32F )
            func = (BatchDistFunc)batchDistL2Sqr_8u32f;
        else if( normType == NORM_L2 && dtype == CV_32F )
            func = (BatchDistFunc)batchDistL2_8u32f;
        else if( normType == NORM_HAMMING && dtype == CV_32S )
            func = (BatchDistFunc)batchDistHamming;
        else if( normType == NORM_HAMMING2 && dtype == CV_32S )
            func = (BatchDistFunc)batchDistHamming2;
    }
    else if( type == CV_32F && dtype == CV_32F )
    {
        if( normType == NORM_L1 )
            func = (BatchDistFunc)batchDistL1_32f;
        else if( normType == NORM_L2SQR )
            func = (BatchDistFunc)batchDistL2Sqr_32f;
        else if( normType == NORM_L2 )
            func = (BatchDistFunc)batchDistL2_32f;
    }

    if( func == 0 )
        CV_Error_(CV_StsUnsupportedFormat,
                  ("The combination of type=%d, dtype=%d and normType=%d is not supported",
                   type, dtype, normType));

    // One task per src1 row; rows are independent, so this parallelizes
    // trivially.
    parallel_for_(Range(0, src1.rows),
                  BatchDistInvoker(src1, src2, dist, nidx, K, mask, update, func));
}
   3757 
   3758 
   3759 void cv::findNonZero( InputArray _src, OutputArray _idx )
   3760 {
   3761     Mat src = _src.getMat();
   3762     CV_Assert( src.type() == CV_8UC1 );
   3763     int n = countNonZero(src);
   3764     if( n == 0 )
   3765     {
   3766         _idx.release();
   3767         return;
   3768     }
   3769     if( _idx.kind() == _InputArray::MAT && !_idx.getMatRef().isContinuous() )
   3770         _idx.release();
   3771     _idx.create(n, 1, CV_32SC2);
   3772     Mat idx = _idx.getMat();
   3773     CV_Assert(idx.isContinuous());
   3774     Point* idx_ptr = idx.ptr<Point>();
   3775 
   3776     for( int i = 0; i < src.rows; i++ )
   3777     {
   3778         const uchar* bin_ptr = src.ptr(i);
   3779         for( int j = 0; j < src.cols; j++ )
   3780             if( bin_ptr[j] )
   3781                 *idx_ptr++ = Point(j, i);
   3782     }
   3783 }
   3784 
   3785 double cv::PSNR(InputArray _src1, InputArray _src2)
   3786 {
   3787     CV_Assert( _src1.depth() == CV_8U );
   3788     double diff = std::sqrt(norm(_src1, _src2, NORM_L2SQR)/(_src1.total()*_src1.channels()));
   3789     return 20*log10(255./(diff+DBL_EPSILON));
   3790 }
   3791 
   3792 
   3793 CV_IMPL CvScalar cvSum( const CvArr* srcarr )
   3794 {
   3795     cv::Scalar sum = cv::sum(cv::cvarrToMat(srcarr, false, true, 1));
   3796     if( CV_IS_IMAGE(srcarr) )
   3797     {
   3798         int coi = cvGetImageCOI((IplImage*)srcarr);
   3799         if( coi )
   3800         {
   3801             CV_Assert( 0 < coi && coi <= 4 );
   3802             sum = cv::Scalar(sum[coi-1]);
   3803         }
   3804     }
   3805     return sum;
   3806 }
   3807 
   3808 CV_IMPL int cvCountNonZero( const CvArr* imgarr )
   3809 {
   3810     cv::Mat img = cv::cvarrToMat(imgarr, false, true, 1);
   3811     if( img.channels() > 1 )
   3812         cv::extractImageCOI(imgarr, img);
   3813     return countNonZero(img);
   3814 }
   3815 
   3816 
   3817 CV_IMPL  CvScalar
   3818 cvAvg( const void* imgarr, const void* maskarr )
   3819 {
   3820     cv::Mat img = cv::cvarrToMat(imgarr, false, true, 1);
   3821     cv::Scalar mean = !maskarr ? cv::mean(img) : cv::mean(img, cv::cvarrToMat(maskarr));
   3822     if( CV_IS_IMAGE(imgarr) )
   3823     {
   3824         int coi = cvGetImageCOI((IplImage*)imgarr);
   3825         if( coi )
   3826         {
   3827             CV_Assert( 0 < coi && coi <= 4 );
   3828             mean = cv::Scalar(mean[coi-1]);
   3829         }
   3830     }
   3831     return mean;
   3832 }
   3833 
   3834 
   3835 CV_IMPL  void
   3836 cvAvgSdv( const CvArr* imgarr, CvScalar* _mean, CvScalar* _sdv, const void* maskarr )
   3837 {
   3838     cv::Scalar mean, sdv;
   3839 
   3840     cv::Mat mask;
   3841     if( maskarr )
   3842         mask = cv::cvarrToMat(maskarr);
   3843 
   3844     cv::meanStdDev(cv::cvarrToMat(imgarr, false, true, 1), mean, sdv, mask );
   3845 
   3846     if( CV_IS_IMAGE(imgarr) )
   3847     {
   3848         int coi = cvGetImageCOI((IplImage*)imgarr);
   3849         if( coi )
   3850         {
   3851             CV_Assert( 0 < coi && coi <= 4 );
   3852             mean = cv::Scalar(mean[coi-1]);
   3853             sdv = cv::Scalar(sdv[coi-1]);
   3854         }
   3855     }
   3856 
   3857     if( _mean )
   3858         *(cv::Scalar*)_mean = mean;
   3859     if( _sdv )
   3860         *(cv::Scalar*)_sdv = sdv;
   3861 }
   3862 
   3863 
   3864 CV_IMPL void
   3865 cvMinMaxLoc( const void* imgarr, double* _minVal, double* _maxVal,
   3866              CvPoint* _minLoc, CvPoint* _maxLoc, const void* maskarr )
   3867 {
   3868     cv::Mat mask, img = cv::cvarrToMat(imgarr, false, true, 1);
   3869     if( maskarr )
   3870         mask = cv::cvarrToMat(maskarr);
   3871     if( img.channels() > 1 )
   3872         cv::extractImageCOI(imgarr, img);
   3873 
   3874     cv::minMaxLoc( img, _minVal, _maxVal,
   3875                    (cv::Point*)_minLoc, (cv::Point*)_maxLoc, mask );
   3876 }
   3877 
   3878 
   3879 CV_IMPL  double
   3880 cvNorm( const void* imgA, const void* imgB, int normType, const void* maskarr )
   3881 {
   3882     cv::Mat a, mask;
   3883     if( !imgA )
   3884     {
   3885         imgA = imgB;
   3886         imgB = 0;
   3887     }
   3888 
   3889     a = cv::cvarrToMat(imgA, false, true, 1);
   3890     if( maskarr )
   3891         mask = cv::cvarrToMat(maskarr);
   3892 
   3893     if( a.channels() > 1 && CV_IS_IMAGE(imgA) && cvGetImageCOI((const IplImage*)imgA) > 0 )
   3894         cv::extractImageCOI(imgA, a);
   3895 
   3896     if( !imgB )
   3897         return !maskarr ? cv::norm(a, normType) : cv::norm(a, normType, mask);
   3898 
   3899     cv::Mat b = cv::cvarrToMat(imgB, false, true, 1);
   3900     if( b.channels() > 1 && CV_IS_IMAGE(imgB) && cvGetImageCOI((const IplImage*)imgB) > 0 )
   3901         cv::extractImageCOI(imgB, b);
   3902 
   3903     return !maskarr ? cv::norm(a, b, normType) : cv::norm(a, b, normType, mask);
   3904 }
   3905