/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

/* ////////////////////////////////////////////////////////////////////
//
//  Arithmetic and logical operations: +, -, *, /, &, |, ^, ~, abs ...
//
// */

#include "precomp.hpp"
#include "opencl_kernels_core.hpp"

namespace cv
{

struct NOP {};

#if CV_SSE2 || CV_NEON

#define FUNCTOR_TEMPLATE(name)          \
    template<typename T> struct name {}

FUNCTOR_TEMPLATE(VLoadStore128);
#if CV_SSE2
FUNCTOR_TEMPLATE(VLoadStore64);
FUNCTOR_TEMPLATE(VLoadStore128Aligned);
#if CV_AVX2
FUNCTOR_TEMPLATE(VLoadStore256);
FUNCTOR_TEMPLATE(VLoadStore256Aligned);
#endif
#endif

#endif

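// vBinOp: generic element-wise binary operation over a 2D array, applied row
// by row. Each row is processed in up to four passes: a wide SIMD pass
// (256-bit AVX2, or 128-bit SSE2/NEON through the VLoadStore*/VOp functors),
// a 64-bit SSE2 pass for short tails, an optionally unrolled scalar pass and
// a plain scalar remainder loop. Usage sketch, mirroring the kernels defined
// further below in this file:
//
//     vBinOp<uchar, OpAdd<uchar>, IF_SIMD(VAdd<uchar>)>(
//         src1, step1, src2, step2, dst, step, sz);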
template<typename T, class Op, class VOp>
void vBinOp(const T* src1, size_t step1, const T* src2, size_t step2, T* dst, size_t step, Size sz)
{
#if CV_SSE2 || CV_NEON
    VOp vop;
#endif
    Op op;

    for( ; sz.height--; src1 = (const T *)((const uchar *)src1 + step1),
                        src2 = (const T *)((const uchar *)src2 + step2),
                        dst = (T *)((uchar *)dst + step) )
    {
        int x = 0;

#if CV_NEON || CV_SSE2
#if CV_AVX2
        if( USE_AVX2 )
        {
            for( ; x <= sz.width - 32/(int)sizeof(T); x += 32/sizeof(T) )
            {
                typename VLoadStore256<T>::reg_type r0 = VLoadStore256<T>::load(src1 + x);
                r0 = vop(r0, VLoadStore256<T>::load(src2 + x));
                VLoadStore256<T>::store(dst + x, r0);
            }
        }
#else
#if CV_SSE2
        if( USE_SSE2 )
        {
#endif // CV_SSE2
            for( ; x <= sz.width - 32/(int)sizeof(T); x += 32/sizeof(T) )
            {
                typename VLoadStore128<T>::reg_type r0 = VLoadStore128<T>::load(src1 + x               );
                typename VLoadStore128<T>::reg_type r1 = VLoadStore128<T>::load(src1 + x + 16/sizeof(T));
                r0 = vop(r0, VLoadStore128<T>::load(src2 + x               ));
                r1 = vop(r1, VLoadStore128<T>::load(src2 + x + 16/sizeof(T)));
                VLoadStore128<T>::store(dst + x               , r0);
                VLoadStore128<T>::store(dst + x + 16/sizeof(T), r1);
            }
#if CV_SSE2
        }
#endif // CV_SSE2
#endif // CV_AVX2
#endif // CV_NEON || CV_SSE2

#if CV_AVX2
        // nothing
#elif CV_SSE2
        if( USE_SSE2 )
        {
            for( ; x <= sz.width - 8/(int)sizeof(T); x += 8/sizeof(T) )
            {
                typename VLoadStore64<T>::reg_type r = VLoadStore64<T>::load(src1 + x);
                r = vop(r, VLoadStore64<T>::load(src2 + x));
                VLoadStore64<T>::store(dst + x, r);
            }
        }
#endif

#if CV_ENABLE_UNROLLED
        for( ; x <= sz.width - 4; x += 4 )
        {
            T v0 = op(src1[x], src2[x]);
            T v1 = op(src1[x+1], src2[x+1]);
            dst[x] = v0; dst[x+1] = v1;
            v0 = op(src1[x+2], src2[x+2]);
            v1 = op(src1[x+3], src2[x+3]);
            dst[x+2] = v0; dst[x+3] = v1;
        }
#endif

        for( ; x < sz.width; x++ )
            dst[x] = op(src1[x], src2[x]);
    }
}

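// vBinOp32: the same row-by-row scheme as vBinOp, specialized for 32-bit
// element types (int, float). Besides the unaligned SIMD pass it adds an
// aligned fast path, taken when src1, src2 and dst are all 32-byte (AVX2)
// or 16-byte (SSE2) aligned.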
template<typename T, class Op, class Op32>
void vBinOp32(const T* src1, size_t step1, const T* src2, size_t step2,
              T* dst, size_t step, Size sz)
{
#if CV_SSE2 || CV_NEON
    Op32 op32;
#endif
    Op op;

    for( ; sz.height--; src1 = (const T *)((const uchar *)src1 + step1),
                        src2 = (const T *)((const uchar *)src2 + step2),
                        dst = (T *)((uchar *)dst + step) )
    {
        int x = 0;

#if CV_AVX2
        if( USE_AVX2 )
        {
            if( (((size_t)src1|(size_t)src2|(size_t)dst)&31) == 0 )
            {
                for( ; x <= sz.width - 8; x += 8 )
                {
                    typename VLoadStore256Aligned<T>::reg_type r0 = VLoadStore256Aligned<T>::load(src1 + x);
                    r0 = op32(r0, VLoadStore256Aligned<T>::load(src2 + x));
                    VLoadStore256Aligned<T>::store(dst + x, r0);
                }
            }
        }
#elif CV_SSE2
        if( USE_SSE2 )
        {
            if( (((size_t)src1|(size_t)src2|(size_t)dst)&15) == 0 )
            {
                for( ; x <= sz.width - 8; x += 8 )
                {
                    typename VLoadStore128Aligned<T>::reg_type r0 = VLoadStore128Aligned<T>::load(src1 + x    );
                    typename VLoadStore128Aligned<T>::reg_type r1 = VLoadStore128Aligned<T>::load(src1 + x + 4);
                    r0 = op32(r0, VLoadStore128Aligned<T>::load(src2 + x    ));
                    r1 = op32(r1, VLoadStore128Aligned<T>::load(src2 + x + 4));
                    VLoadStore128Aligned<T>::store(dst + x    , r0);
                    VLoadStore128Aligned<T>::store(dst + x + 4, r1);
                }
            }
        }
#endif // CV_AVX2

#if CV_NEON || CV_SSE2
#if CV_AVX2
        if( USE_AVX2 )
        {
            for( ; x <= sz.width - 8; x += 8 )
            {
                typename VLoadStore256<T>::reg_type r0 = VLoadStore256<T>::load(src1 + x);
                r0 = op32(r0, VLoadStore256<T>::load(src2 + x));
                VLoadStore256<T>::store(dst + x, r0);
            }
        }
#else
#if CV_SSE2
        if( USE_SSE2 )
        {
#endif // CV_SSE2
            for( ; x <= sz.width - 8; x += 8 )
            {
                typename VLoadStore128<T>::reg_type r0 = VLoadStore128<T>::load(src1 + x    );
                typename VLoadStore128<T>::reg_type r1 = VLoadStore128<T>::load(src1 + x + 4);
                r0 = op32(r0, VLoadStore128<T>::load(src2 + x    ));
                r1 = op32(r1, VLoadStore128<T>::load(src2 + x + 4));
                VLoadStore128<T>::store(dst + x    , r0);
                VLoadStore128<T>::store(dst + x + 4, r1);
            }
#if CV_SSE2
        }
#endif // CV_SSE2
#endif // CV_AVX2
#endif // CV_NEON || CV_SSE2

#if CV_ENABLE_UNROLLED
        for( ; x <= sz.width - 4; x += 4 )
        {
            T v0 = op(src1[x], src2[x]);
            T v1 = op(src1[x+1], src2[x+1]);
            dst[x] = v0; dst[x+1] = v1;
            v0 = op(src1[x+2], src2[x+2]);
            v1 = op(src1[x+3], src2[x+3]);
            dst[x+2] = v0; dst[x+3] = v1;
        }
#endif

        for( ; x < sz.width; x++ )
            dst[x] = op(src1[x], src2[x]);
    }
}


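// vBinOp64: variant for 64-bit (double) elements. Only the aligned AVX2/SSE2
// path is vectorized; there is no NEON path for doubles, and the remainder is
// handled by an unrolled scalar loop followed by a plain scalar loop.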
template<typename T, class Op, class Op64>
void vBinOp64(const T* src1, size_t step1, const T* src2, size_t step2,
               T* dst, size_t step, Size sz)
{
#if CV_SSE2
    Op64 op64;
#endif
    Op op;

    for( ; sz.height--; src1 = (const T *)((const uchar *)src1 + step1),
                        src2 = (const T *)((const uchar *)src2 + step2),
                        dst = (T *)((uchar *)dst + step) )
    {
        int x = 0;

#if CV_AVX2
        if( USE_AVX2 )
        {
            if( (((size_t)src1|(size_t)src2|(size_t)dst)&31) == 0 )
            {
                for( ; x <= sz.width - 4; x += 4 )
                {
                    typename VLoadStore256Aligned<T>::reg_type r0 = VLoadStore256Aligned<T>::load(src1 + x);
                    r0 = op64(r0, VLoadStore256Aligned<T>::load(src2 + x));
                    VLoadStore256Aligned<T>::store(dst + x, r0);
                }
            }
        }
#elif CV_SSE2
        if( USE_SSE2 )
        {
            if( (((size_t)src1|(size_t)src2|(size_t)dst)&15) == 0 )
            {
                for( ; x <= sz.width - 4; x += 4 )
                {
                    typename VLoadStore128Aligned<T>::reg_type r0 = VLoadStore128Aligned<T>::load(src1 + x    );
                    typename VLoadStore128Aligned<T>::reg_type r1 = VLoadStore128Aligned<T>::load(src1 + x + 2);
                    r0 = op64(r0, VLoadStore128Aligned<T>::load(src2 + x    ));
                    r1 = op64(r1, VLoadStore128Aligned<T>::load(src2 + x + 2));
                    VLoadStore128Aligned<T>::store(dst + x    , r0);
                    VLoadStore128Aligned<T>::store(dst + x + 2, r1);
                }
            }
        }
#endif

        for( ; x <= sz.width - 4; x += 4 )
        {
            T v0 = op(src1[x], src2[x]);
            T v1 = op(src1[x+1], src2[x+1]);
            dst[x] = v0; dst[x+1] = v1;
            v0 = op(src1[x+2], src2[x+2]);
            v1 = op(src1[x+3], src2[x+3]);
            dst[x+2] = v0; dst[x+3] = v1;
        }

        for( ; x < sz.width; x++ )
            dst[x] = op(src1[x], src2[x]);
    }
}

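// The macros below generate the per-type SIMD functor specializations used by
// vBinOp/vBinOp32/vBinOp64. FUNCTOR_LOADSTORE(_CAST) binds an element type to
// a register type and its load/store intrinsics; FUNCTOR_CLOSURE_2arg and
// FUNCTOR_CLOSURE_1arg wrap a binary (or unary) intrinsic expression in a
// functor. Equivalent sets of specializations follow for AVX2, SSE2 and NEON.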
#if CV_AVX2

#define FUNCTOR_LOADSTORE_CAST(name, template_arg, register_type, load_body, store_body)         \
    template <>                                                                                  \
    struct name<template_arg>{                                                                   \
        typedef register_type reg_type;                                                          \
        static reg_type load(const template_arg * p) { return load_body ((const reg_type *)p); } \
        static void store(template_arg * p, reg_type v) { store_body ((reg_type *)p, v); }       \
    }

#define FUNCTOR_LOADSTORE(name, template_arg, register_type, load_body, store_body) \
    template <>                                                                     \
    struct name<template_arg>{                                                      \
        typedef register_type reg_type;                                             \
        static reg_type load(const template_arg * p) { return load_body (p); }      \
        static void store(template_arg * p, reg_type v) { store_body (p, v); }      \
    }

#define FUNCTOR_CLOSURE_2arg(name, template_arg, body)                         \
    template<>                                                                 \
    struct name<template_arg>                                                  \
    {                                                                          \
        VLoadStore256<template_arg>::reg_type operator()(                      \
                        const VLoadStore256<template_arg>::reg_type & a,       \
                        const VLoadStore256<template_arg>::reg_type & b) const \
        {                                                                      \
            body;                                                              \
        }                                                                      \
    }

#define FUNCTOR_CLOSURE_1arg(name, template_arg, body)                         \
    template<>                                                                 \
    struct name<template_arg>                                                  \
    {                                                                          \
        VLoadStore256<template_arg>::reg_type operator()(                      \
                        const VLoadStore256<template_arg>::reg_type & a,       \
                        const VLoadStore256<template_arg>::reg_type &  ) const \
        {                                                                      \
            body;                                                              \
        }                                                                      \
    }

FUNCTOR_LOADSTORE_CAST(VLoadStore256,  uchar, __m256i, _mm256_loadu_si256, _mm256_storeu_si256);
FUNCTOR_LOADSTORE_CAST(VLoadStore256,  schar, __m256i, _mm256_loadu_si256, _mm256_storeu_si256);
FUNCTOR_LOADSTORE_CAST(VLoadStore256, ushort, __m256i, _mm256_loadu_si256, _mm256_storeu_si256);
FUNCTOR_LOADSTORE_CAST(VLoadStore256,  short, __m256i, _mm256_loadu_si256, _mm256_storeu_si256);
FUNCTOR_LOADSTORE_CAST(VLoadStore256,    int, __m256i, _mm256_loadu_si256, _mm256_storeu_si256);
FUNCTOR_LOADSTORE(     VLoadStore256,  float, __m256 , _mm256_loadu_ps   , _mm256_storeu_ps   );
FUNCTOR_LOADSTORE(     VLoadStore256, double, __m256d, _mm256_loadu_pd   , _mm256_storeu_pd   );

FUNCTOR_LOADSTORE_CAST(VLoadStore256Aligned,    int, __m256i, _mm256_load_si256, _mm256_store_si256);
FUNCTOR_LOADSTORE(     VLoadStore256Aligned,  float, __m256 , _mm256_load_ps   , _mm256_store_ps   );
FUNCTOR_LOADSTORE(     VLoadStore256Aligned, double, __m256d, _mm256_load_pd   , _mm256_store_pd   );

FUNCTOR_TEMPLATE(VAdd);
FUNCTOR_CLOSURE_2arg(VAdd,  uchar, return _mm256_adds_epu8 (a, b));
FUNCTOR_CLOSURE_2arg(VAdd,  schar, return _mm256_adds_epi8 (a, b));
FUNCTOR_CLOSURE_2arg(VAdd, ushort, return _mm256_adds_epu16(a, b));
FUNCTOR_CLOSURE_2arg(VAdd,  short, return _mm256_adds_epi16(a, b));
FUNCTOR_CLOSURE_2arg(VAdd,    int, return _mm256_add_epi32 (a, b));
FUNCTOR_CLOSURE_2arg(VAdd,  float, return _mm256_add_ps    (a, b));
FUNCTOR_CLOSURE_2arg(VAdd, double, return _mm256_add_pd    (a, b));

FUNCTOR_TEMPLATE(VSub);
FUNCTOR_CLOSURE_2arg(VSub,  uchar, return _mm256_subs_epu8 (a, b));
FUNCTOR_CLOSURE_2arg(VSub,  schar, return _mm256_subs_epi8 (a, b));
FUNCTOR_CLOSURE_2arg(VSub, ushort, return _mm256_subs_epu16(a, b));
FUNCTOR_CLOSURE_2arg(VSub,  short, return _mm256_subs_epi16(a, b));
FUNCTOR_CLOSURE_2arg(VSub,    int, return _mm256_sub_epi32 (a, b));
FUNCTOR_CLOSURE_2arg(VSub,  float, return _mm256_sub_ps    (a, b));
FUNCTOR_CLOSURE_2arg(VSub, double, return _mm256_sub_pd    (a, b));

FUNCTOR_TEMPLATE(VMin);
FUNCTOR_CLOSURE_2arg(VMin,  uchar, return _mm256_min_epu8 (a, b));
FUNCTOR_CLOSURE_2arg(VMin,  schar, return _mm256_min_epi8 (a, b));
FUNCTOR_CLOSURE_2arg(VMin, ushort, return _mm256_min_epi16(a, b));
FUNCTOR_CLOSURE_2arg(VMin,  short, return _mm256_min_epi16(a, b));
FUNCTOR_CLOSURE_2arg(VMin,    int, return _mm256_min_epi32(a, b));
FUNCTOR_CLOSURE_2arg(VMin,  float, return _mm256_min_ps   (a, b));
FUNCTOR_CLOSURE_2arg(VMin, double, return _mm256_min_pd   (a, b));

FUNCTOR_TEMPLATE(VMax);
FUNCTOR_CLOSURE_2arg(VMax,  uchar, return _mm256_max_epu8 (a, b));
FUNCTOR_CLOSURE_2arg(VMax,  schar, return _mm256_max_epi8 (a, b));
FUNCTOR_CLOSURE_2arg(VMax, ushort, return _mm256_max_epu16(a, b));
FUNCTOR_CLOSURE_2arg(VMax,  short, return _mm256_max_epi16(a, b));
FUNCTOR_CLOSURE_2arg(VMax,    int, return _mm256_max_epi32(a, b));
FUNCTOR_CLOSURE_2arg(VMax,  float, return _mm256_max_ps   (a, b));
FUNCTOR_CLOSURE_2arg(VMax, double, return _mm256_max_pd   (a, b));


static unsigned int CV_DECL_ALIGNED(32) v32f_absmask[] = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff,
                                                           0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff };
static unsigned int CV_DECL_ALIGNED(32) v64f_absmask[] = { 0xffffffff, 0x7fffffff, 0xffffffff, 0x7fffffff,
                                                           0xffffffff, 0x7fffffff, 0xffffffff, 0x7fffffff };

FUNCTOR_TEMPLATE(VAbsDiff);
FUNCTOR_CLOSURE_2arg(VAbsDiff,  uchar,
        return _mm256_add_epi8(_mm256_subs_epu8(a, b), _mm256_subs_epu8(b, a));
    );
FUNCTOR_CLOSURE_2arg(VAbsDiff,  schar,
        __m256i d = _mm256_subs_epi8(a, b);
        __m256i m = _mm256_cmpgt_epi8(b, a);
        return _mm256_subs_epi8(_mm256_xor_si256(d, m), m);
    );
FUNCTOR_CLOSURE_2arg(VAbsDiff, ushort,
        return _mm256_add_epi16(_mm256_subs_epu16(a, b), _mm256_subs_epu16(b, a));
    );
FUNCTOR_CLOSURE_2arg(VAbsDiff,  short,
        __m256i M = _mm256_max_epi16(a, b);
        __m256i m = _mm256_min_epi16(a, b);
        return _mm256_subs_epi16(M, m);
    );
FUNCTOR_CLOSURE_2arg(VAbsDiff,    int,
        __m256i d = _mm256_sub_epi32(a, b);
        __m256i m = _mm256_cmpgt_epi32(b, a);
        return _mm256_sub_epi32(_mm256_xor_si256(d, m), m);
    );
FUNCTOR_CLOSURE_2arg(VAbsDiff,  float,
        return _mm256_and_ps(_mm256_sub_ps(a, b), *(const __m256*)v32f_absmask);
    );
FUNCTOR_CLOSURE_2arg(VAbsDiff, double,
        return _mm256_and_pd(_mm256_sub_pd(a, b), *(const __m256d*)v64f_absmask);
    );

FUNCTOR_TEMPLATE(VAnd);
FUNCTOR_CLOSURE_2arg(VAnd, uchar, return _mm256_and_si256(a, b));
FUNCTOR_TEMPLATE(VOr);
FUNCTOR_CLOSURE_2arg(VOr , uchar, return _mm256_or_si256 (a, b));
FUNCTOR_TEMPLATE(VXor);
FUNCTOR_CLOSURE_2arg(VXor, uchar, return _mm256_xor_si256(a, b));
FUNCTOR_TEMPLATE(VNot);
FUNCTOR_CLOSURE_1arg(VNot, uchar, return _mm256_xor_si256(_mm256_set1_epi32(-1), a));

#elif CV_SSE2

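// The SSE2 specializations mirror the AVX2 set with 128-bit registers. SSE2
// has no min/max intrinsics for schar, ushort or int, so VMin/VMax for those
// types are emulated with compare/xor (or saturating add/subtract) sequences.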
#define FUNCTOR_LOADSTORE_CAST(name, template_arg, register_type, load_body, store_body)\
    template <>                                                                                  \
    struct name<template_arg>{                                                                   \
        typedef register_type reg_type;                                                          \
        static reg_type load(const template_arg * p) { return load_body ((const reg_type *)p); } \
        static void store(template_arg * p, reg_type v) { store_body ((reg_type *)p, v); }       \
    }

#define FUNCTOR_LOADSTORE(name, template_arg, register_type, load_body, store_body)\
    template <>                                                                \
    struct name<template_arg>{                                                 \
        typedef register_type reg_type;                                        \
        static reg_type load(const template_arg * p) { return load_body (p); } \
        static void store(template_arg * p, reg_type v) { store_body (p, v); } \
    }

#define FUNCTOR_CLOSURE_2arg(name, template_arg, body)\
    template<>                                                                 \
    struct name<template_arg>                                                  \
    {                                                                          \
        VLoadStore128<template_arg>::reg_type operator()(                      \
                        const VLoadStore128<template_arg>::reg_type & a,       \
                        const VLoadStore128<template_arg>::reg_type & b) const \
        {                                                                      \
            body;                                                              \
        }                                                                      \
    }

#define FUNCTOR_CLOSURE_1arg(name, template_arg, body)\
    template<>                                                                 \
    struct name<template_arg>                                                  \
    {                                                                          \
        VLoadStore128<template_arg>::reg_type operator()(                      \
                        const VLoadStore128<template_arg>::reg_type & a,       \
                        const VLoadStore128<template_arg>::reg_type &  ) const \
        {                                                                      \
            body;                                                              \
        }                                                                      \
    }

FUNCTOR_LOADSTORE_CAST(VLoadStore128,  uchar, __m128i, _mm_loadu_si128, _mm_storeu_si128);
FUNCTOR_LOADSTORE_CAST(VLoadStore128,  schar, __m128i, _mm_loadu_si128, _mm_storeu_si128);
FUNCTOR_LOADSTORE_CAST(VLoadStore128, ushort, __m128i, _mm_loadu_si128, _mm_storeu_si128);
FUNCTOR_LOADSTORE_CAST(VLoadStore128,  short, __m128i, _mm_loadu_si128, _mm_storeu_si128);
FUNCTOR_LOADSTORE_CAST(VLoadStore128,    int, __m128i, _mm_loadu_si128, _mm_storeu_si128);
FUNCTOR_LOADSTORE(     VLoadStore128,  float, __m128 , _mm_loadu_ps   , _mm_storeu_ps   );
FUNCTOR_LOADSTORE(     VLoadStore128, double, __m128d, _mm_loadu_pd   , _mm_storeu_pd   );

FUNCTOR_LOADSTORE_CAST(VLoadStore64,  uchar, __m128i, _mm_loadl_epi64, _mm_storel_epi64);
FUNCTOR_LOADSTORE_CAST(VLoadStore64,  schar, __m128i, _mm_loadl_epi64, _mm_storel_epi64);
FUNCTOR_LOADSTORE_CAST(VLoadStore64, ushort, __m128i, _mm_loadl_epi64, _mm_storel_epi64);
FUNCTOR_LOADSTORE_CAST(VLoadStore64,  short, __m128i, _mm_loadl_epi64, _mm_storel_epi64);

FUNCTOR_LOADSTORE_CAST(VLoadStore128Aligned,    int, __m128i, _mm_load_si128, _mm_store_si128);
FUNCTOR_LOADSTORE(     VLoadStore128Aligned,  float, __m128 , _mm_load_ps   , _mm_store_ps   );
FUNCTOR_LOADSTORE(     VLoadStore128Aligned, double, __m128d, _mm_load_pd   , _mm_store_pd   );

FUNCTOR_TEMPLATE(VAdd);
FUNCTOR_CLOSURE_2arg(VAdd,  uchar, return _mm_adds_epu8 (a, b));
FUNCTOR_CLOSURE_2arg(VAdd,  schar, return _mm_adds_epi8 (a, b));
FUNCTOR_CLOSURE_2arg(VAdd, ushort, return _mm_adds_epu16(a, b));
FUNCTOR_CLOSURE_2arg(VAdd,  short, return _mm_adds_epi16(a, b));
FUNCTOR_CLOSURE_2arg(VAdd,    int, return _mm_add_epi32 (a, b));
FUNCTOR_CLOSURE_2arg(VAdd,  float, return _mm_add_ps    (a, b));
FUNCTOR_CLOSURE_2arg(VAdd, double, return _mm_add_pd    (a, b));

FUNCTOR_TEMPLATE(VSub);
FUNCTOR_CLOSURE_2arg(VSub,  uchar, return _mm_subs_epu8 (a, b));
FUNCTOR_CLOSURE_2arg(VSub,  schar, return _mm_subs_epi8 (a, b));
FUNCTOR_CLOSURE_2arg(VSub, ushort, return _mm_subs_epu16(a, b));
FUNCTOR_CLOSURE_2arg(VSub,  short, return _mm_subs_epi16(a, b));
FUNCTOR_CLOSURE_2arg(VSub,    int, return _mm_sub_epi32 (a, b));
FUNCTOR_CLOSURE_2arg(VSub,  float, return _mm_sub_ps    (a, b));
FUNCTOR_CLOSURE_2arg(VSub, double, return _mm_sub_pd    (a, b));

FUNCTOR_TEMPLATE(VMin);
FUNCTOR_CLOSURE_2arg(VMin, uchar, return _mm_min_epu8(a, b));
FUNCTOR_CLOSURE_2arg(VMin, schar,
        __m128i m = _mm_cmpgt_epi8(a, b);
        return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m));
    );
FUNCTOR_CLOSURE_2arg(VMin, ushort, return _mm_subs_epu16(a, _mm_subs_epu16(a, b)));
FUNCTOR_CLOSURE_2arg(VMin,  short, return _mm_min_epi16(a, b));
FUNCTOR_CLOSURE_2arg(VMin,    int,
        __m128i m = _mm_cmpgt_epi32(a, b);
        return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m));
    );
FUNCTOR_CLOSURE_2arg(VMin,  float, return _mm_min_ps(a, b));
FUNCTOR_CLOSURE_2arg(VMin, double, return _mm_min_pd(a, b));

FUNCTOR_TEMPLATE(VMax);
FUNCTOR_CLOSURE_2arg(VMax, uchar, return _mm_max_epu8(a, b));
FUNCTOR_CLOSURE_2arg(VMax, schar,
        __m128i m = _mm_cmpgt_epi8(b, a);
        return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m));
    );
FUNCTOR_CLOSURE_2arg(VMax, ushort, return _mm_adds_epu16(_mm_subs_epu16(a, b), b));
FUNCTOR_CLOSURE_2arg(VMax,  short, return _mm_max_epi16(a, b));
FUNCTOR_CLOSURE_2arg(VMax,    int,
        __m128i m = _mm_cmpgt_epi32(b, a);
        return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m));
    );
FUNCTOR_CLOSURE_2arg(VMax,  float, return _mm_max_ps(a, b));
FUNCTOR_CLOSURE_2arg(VMax, double, return _mm_max_pd(a, b));


static unsigned int CV_DECL_ALIGNED(16) v32f_absmask[] = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff };
static unsigned int CV_DECL_ALIGNED(16) v64f_absmask[] = { 0xffffffff, 0x7fffffff, 0xffffffff, 0x7fffffff };

FUNCTOR_TEMPLATE(VAbsDiff);
FUNCTOR_CLOSURE_2arg(VAbsDiff,  uchar,
        return _mm_add_epi8(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a));
    );
FUNCTOR_CLOSURE_2arg(VAbsDiff,  schar,
        __m128i d = _mm_subs_epi8(a, b);
        __m128i m = _mm_cmpgt_epi8(b, a);
        return _mm_subs_epi8(_mm_xor_si128(d, m), m);
    );
FUNCTOR_CLOSURE_2arg(VAbsDiff, ushort,
        return _mm_add_epi16(_mm_subs_epu16(a, b), _mm_subs_epu16(b, a));
    );
FUNCTOR_CLOSURE_2arg(VAbsDiff,  short,
        __m128i M = _mm_max_epi16(a, b);
        __m128i m = _mm_min_epi16(a, b);
        return _mm_subs_epi16(M, m);
    );
FUNCTOR_CLOSURE_2arg(VAbsDiff,    int,
        __m128i d = _mm_sub_epi32(a, b);
        __m128i m = _mm_cmpgt_epi32(b, a);
        return _mm_sub_epi32(_mm_xor_si128(d, m), m);
    );
FUNCTOR_CLOSURE_2arg(VAbsDiff,  float,
        return _mm_and_ps(_mm_sub_ps(a,b), *(const __m128*)v32f_absmask);
    );
FUNCTOR_CLOSURE_2arg(VAbsDiff, double,
        return _mm_and_pd(_mm_sub_pd(a,b), *(const __m128d*)v64f_absmask);
    );

FUNCTOR_TEMPLATE(VAnd);
FUNCTOR_CLOSURE_2arg(VAnd, uchar, return _mm_and_si128(a, b));
FUNCTOR_TEMPLATE(VOr);
FUNCTOR_CLOSURE_2arg(VOr , uchar, return _mm_or_si128 (a, b));
FUNCTOR_TEMPLATE(VXor);
FUNCTOR_CLOSURE_2arg(VXor, uchar, return _mm_xor_si128(a, b));
FUNCTOR_TEMPLATE(VNot);
FUNCTOR_CLOSURE_1arg(VNot, uchar, return _mm_xor_si128(_mm_set1_epi32(-1), a));
#endif

#if CV_NEON

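// NEON specializations. The 8- and 16-bit add/sub functors use the saturating
// vqadd/vqsub forms to match OpenCV's saturating arithmetic for those types.
// There is no NEON double support here, so vBinOp64 stays scalar on ARM.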
#define FUNCTOR_LOADSTORE(name, template_arg, register_type, load_body, store_body)\
    template <>                                                                \
    struct name<template_arg>{                                                 \
        typedef register_type reg_type;                                        \
        static reg_type load(const template_arg * p) { return load_body (p);}; \
        static void store(template_arg * p, reg_type v) { store_body (p, v);}; \
    }

#define FUNCTOR_CLOSURE_2arg(name, template_arg, body)\
    template<>                                                         \
    struct name<template_arg>                                          \
    {                                                                  \
        VLoadStore128<template_arg>::reg_type operator()(              \
                        VLoadStore128<template_arg>::reg_type a,       \
                        VLoadStore128<template_arg>::reg_type b) const \
        {                                                              \
            return body;                                               \
        };                                                             \
    }

#define FUNCTOR_CLOSURE_1arg(name, template_arg, body)\
    template<>                                                         \
    struct name<template_arg>                                          \
    {                                                                  \
        VLoadStore128<template_arg>::reg_type operator()(              \
                        VLoadStore128<template_arg>::reg_type a,       \
                        VLoadStore128<template_arg>::reg_type  ) const \
        {                                                              \
            return body;                                               \
        };                                                             \
    }

FUNCTOR_LOADSTORE(VLoadStore128,  uchar,  uint8x16_t, vld1q_u8 , vst1q_u8 );
FUNCTOR_LOADSTORE(VLoadStore128,  schar,   int8x16_t, vld1q_s8 , vst1q_s8 );
FUNCTOR_LOADSTORE(VLoadStore128, ushort,  uint16x8_t, vld1q_u16, vst1q_u16);
FUNCTOR_LOADSTORE(VLoadStore128,  short,   int16x8_t, vld1q_s16, vst1q_s16);
FUNCTOR_LOADSTORE(VLoadStore128,    int,   int32x4_t, vld1q_s32, vst1q_s32);
FUNCTOR_LOADSTORE(VLoadStore128,  float, float32x4_t, vld1q_f32, vst1q_f32);

FUNCTOR_TEMPLATE(VAdd);
FUNCTOR_CLOSURE_2arg(VAdd,  uchar, vqaddq_u8 (a, b));
FUNCTOR_CLOSURE_2arg(VAdd,  schar, vqaddq_s8 (a, b));
FUNCTOR_CLOSURE_2arg(VAdd, ushort, vqaddq_u16(a, b));
FUNCTOR_CLOSURE_2arg(VAdd,  short, vqaddq_s16(a, b));
FUNCTOR_CLOSURE_2arg(VAdd,    int, vaddq_s32 (a, b));
FUNCTOR_CLOSURE_2arg(VAdd,  float, vaddq_f32 (a, b));

FUNCTOR_TEMPLATE(VSub);
FUNCTOR_CLOSURE_2arg(VSub,  uchar, vqsubq_u8 (a, b));
FUNCTOR_CLOSURE_2arg(VSub,  schar, vqsubq_s8 (a, b));
FUNCTOR_CLOSURE_2arg(VSub, ushort, vqsubq_u16(a, b));
FUNCTOR_CLOSURE_2arg(VSub,  short, vqsubq_s16(a, b));
FUNCTOR_CLOSURE_2arg(VSub,    int, vsubq_s32 (a, b));
FUNCTOR_CLOSURE_2arg(VSub,  float, vsubq_f32 (a, b));

FUNCTOR_TEMPLATE(VMin);
FUNCTOR_CLOSURE_2arg(VMin,  uchar, vminq_u8 (a, b));
FUNCTOR_CLOSURE_2arg(VMin,  schar, vminq_s8 (a, b));
FUNCTOR_CLOSURE_2arg(VMin, ushort, vminq_u16(a, b));
FUNCTOR_CLOSURE_2arg(VMin,  short, vminq_s16(a, b));
FUNCTOR_CLOSURE_2arg(VMin,    int, vminq_s32(a, b));
FUNCTOR_CLOSURE_2arg(VMin,  float, vminq_f32(a, b));

FUNCTOR_TEMPLATE(VMax);
FUNCTOR_CLOSURE_2arg(VMax,  uchar, vmaxq_u8 (a, b));
FUNCTOR_CLOSURE_2arg(VMax,  schar, vmaxq_s8 (a, b));
FUNCTOR_CLOSURE_2arg(VMax, ushort, vmaxq_u16(a, b));
FUNCTOR_CLOSURE_2arg(VMax,  short, vmaxq_s16(a, b));
FUNCTOR_CLOSURE_2arg(VMax,    int, vmaxq_s32(a, b));
FUNCTOR_CLOSURE_2arg(VMax,  float, vmaxq_f32(a, b));

FUNCTOR_TEMPLATE(VAbsDiff);
FUNCTOR_CLOSURE_2arg(VAbsDiff,  uchar, vabdq_u8  (a, b));
FUNCTOR_CLOSURE_2arg(VAbsDiff,  schar, vqabsq_s8 (vqsubq_s8(a, b)));
FUNCTOR_CLOSURE_2arg(VAbsDiff, ushort, vabdq_u16 (a, b));
FUNCTOR_CLOSURE_2arg(VAbsDiff,  short, vqabsq_s16(vqsubq_s16(a, b)));
FUNCTOR_CLOSURE_2arg(VAbsDiff,    int, vabdq_s32 (a, b));
FUNCTOR_CLOSURE_2arg(VAbsDiff,  float, vabdq_f32 (a, b));

FUNCTOR_TEMPLATE(VAnd);
FUNCTOR_CLOSURE_2arg(VAnd, uchar, vandq_u8(a, b));
FUNCTOR_TEMPLATE(VOr);
FUNCTOR_CLOSURE_2arg(VOr , uchar, vorrq_u8(a, b));
FUNCTOR_TEMPLATE(VXor);
FUNCTOR_CLOSURE_2arg(VXor, uchar, veorq_u8(a, b));
FUNCTOR_TEMPLATE(VNot);
FUNCTOR_CLOSURE_1arg(VNot, uchar, vmvnq_u8(a   ));
#endif

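// IF_SIMD(op) selects the vector functor when a SIMD backend is available and
// degenerates to the empty NOP struct otherwise, so the same
// vBinOp<T, Op, IF_SIMD(VOp<T>)> instantiations compile on every target.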
#if CV_SSE2 || CV_NEON
#define IF_SIMD(op) op
#else
#define IF_SIMD(op) NOP
#endif

template<> inline uchar OpAdd<uchar>::operator ()(uchar a, uchar b) const
{ return CV_FAST_CAST_8U(a + b); }
template<> inline uchar OpSub<uchar>::operator ()(uchar a, uchar b) const
{ return CV_FAST_CAST_8U(a - b); }

template<typename T> struct OpAbsDiff
{
    typedef T type1;
    typedef T type2;
    typedef T rtype;
    T operator()(T a, T b) const { return (T)std::abs(a - b); }
};

template<> inline short OpAbsDiff<short>::operator ()(short a, short b) const
{ return saturate_cast<short>(std::abs(a - b)); }

template<> inline schar OpAbsDiff<schar>::operator ()(schar a, schar b) const
{ return saturate_cast<schar>(std::abs(a - b)); }

template<typename T, typename WT=T> struct OpAbsDiffS
{
    typedef T type1;
    typedef WT type2;
    typedef T rtype;
    T operator()(T a, WT b) const { return saturate_cast<T>(std::abs(a - b)); }
};

template<typename T> struct OpAnd
{
    typedef T type1;
    typedef T type2;
    typedef T rtype;
    T operator()( T a, T b ) const { return a & b; }
};

template<typename T> struct OpOr
{
    typedef T type1;
    typedef T type2;
    typedef T rtype;
    T operator()( T a, T b ) const { return a | b; }
};

template<typename T> struct OpXor
{
    typedef T type1;
    typedef T type2;
    typedef T rtype;
    T operator()( T a, T b ) const { return a ^ b; }
};

template<typename T> struct OpNot
{
    typedef T type1;
    typedef T type2;
    typedef T rtype;
    T operator()( T a, T ) const { return ~a; }
};

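// With IPP enabled, fixSteps() normalizes the row strides for the degenerate
// single-row case: callers may pass arbitrary step values when height == 1,
// so the steps are rewritten as width * elemSize before the IPP primitives
// below are invoked. (This is the apparent intent; the helper is only built
// together with the IPP branches.)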
#if (ARITHM_USE_IPP == 1)
static inline void fixSteps(Size sz, size_t elemSize, size_t& step1, size_t& step2, size_t& step)
{
    if( sz.height == 1 )
        step1 = step2 = step = sz.width*elemSize;
}
#endif

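// Type-specialized low-level kernels (add8u, add8s, ..., sub64f). Each one
// first tries the corresponding IPP primitive when ARITHM_USE_IPP is enabled
// and otherwise falls back to the templated vBinOp/vBinOp32/vBinOp64 path
// with the matching Op/VOp functor pair. These per-type kernels are the
// building blocks used by the higher-level arithmetic dispatch in this module.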
static void add8u( const uchar* src1, size_t step1,
                   const uchar* src2, size_t step2,
                   uchar* dst, size_t step, Size sz, void* )
{
#if (ARITHM_USE_IPP == 1)
    CV_IPP_CHECK()
    {
        fixSteps(sz, sizeof(dst[0]), step1, step2, step);
        if (0 <= ippiAdd_8u_C1RSfs(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz), 0))
        {
            CV_IMPL_ADD(CV_IMPL_IPP);
            return;
        }
        setIppErrorStatus();
    }
#endif
    (vBinOp<uchar, OpAdd<uchar>, IF_SIMD(VAdd<uchar>)>(src1, step1, src2, step2, dst, step, sz));
}

static void add8s( const schar* src1, size_t step1,
                   const schar* src2, size_t step2,
                   schar* dst, size_t step, Size sz, void* )
{
    vBinOp<schar, OpAdd<schar>, IF_SIMD(VAdd<schar>)>(src1, step1, src2, step2, dst, step, sz);
}

static void add16u( const ushort* src1, size_t step1,
                    const ushort* src2, size_t step2,
                    ushort* dst, size_t step, Size sz, void* )
{
#if (ARITHM_USE_IPP == 1)
    CV_IPP_CHECK()
    {
        fixSteps(sz, sizeof(dst[0]), step1, step2, step);
        if (0 <= ippiAdd_16u_C1RSfs(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz), 0))
        {
            CV_IMPL_ADD(CV_IMPL_IPP);
            return;
        }
        setIppErrorStatus();
    }
#endif
    (vBinOp<ushort, OpAdd<ushort>, IF_SIMD(VAdd<ushort>)>(src1, step1, src2, step2, dst, step, sz));
}

static void add16s( const short* src1, size_t step1,
                    const short* src2, size_t step2,
                    short* dst, size_t step, Size sz, void* )
{
#if (ARITHM_USE_IPP == 1)
    CV_IPP_CHECK()
    {
        fixSteps(sz, sizeof(dst[0]), step1, step2, step);
        if (0 <= ippiAdd_16s_C1RSfs(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz), 0))
        {
            CV_IMPL_ADD(CV_IMPL_IPP);
            return;
        }
        setIppErrorStatus();
    }
#endif
    (vBinOp<short, OpAdd<short>, IF_SIMD(VAdd<short>)>(src1, step1, src2, step2, dst, step, sz));
}

static void add32s( const int* src1, size_t step1,
                    const int* src2, size_t step2,
                    int* dst, size_t step, Size sz, void* )
{
    vBinOp32<int, OpAdd<int>, IF_SIMD(VAdd<int>)>(src1, step1, src2, step2, dst, step, sz);
}

static void add32f( const float* src1, size_t step1,
                    const float* src2, size_t step2,
                    float* dst, size_t step, Size sz, void* )
{
#if (ARITHM_USE_IPP == 1)
    CV_IPP_CHECK()
    {
        fixSteps(sz, sizeof(dst[0]), step1, step2, step);
        if (0 <= ippiAdd_32f_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz)))
        {
            CV_IMPL_ADD(CV_IMPL_IPP);
            return;
        }
        setIppErrorStatus();
    }
#endif
    (vBinOp32<float, OpAdd<float>, IF_SIMD(VAdd<float>)>(src1, step1, src2, step2, dst, step, sz));
}

static void add64f( const double* src1, size_t step1,
                    const double* src2, size_t step2,
                    double* dst, size_t step, Size sz, void* )
{
    vBinOp64<double, OpAdd<double>, IF_SIMD(VAdd<double>)>(src1, step1, src2, step2, dst, step, sz);
}

static void sub8u( const uchar* src1, size_t step1,
                   const uchar* src2, size_t step2,
                   uchar* dst, size_t step, Size sz, void* )
{
#if (ARITHM_USE_IPP == 1)
    CV_IPP_CHECK()
    {
        fixSteps(sz, sizeof(dst[0]), step1, step2, step);
        if (0 <= ippiSub_8u_C1RSfs(src2, (int)step2, src1, (int)step1, dst, (int)step, ippiSize(sz), 0))
        {
            CV_IMPL_ADD(CV_IMPL_IPP);
            return;
        }
        setIppErrorStatus();
    }
#endif
    (vBinOp<uchar, OpSub<uchar>, IF_SIMD(VSub<uchar>)>(src1, step1, src2, step2, dst, step, sz));
}

static void sub8s( const schar* src1, size_t step1,
                   const schar* src2, size_t step2,
                   schar* dst, size_t step, Size sz, void* )
{
    vBinOp<schar, OpSub<schar>, IF_SIMD(VSub<schar>)>(src1, step1, src2, step2, dst, step, sz);
}

static void sub16u( const ushort* src1, size_t step1,
                    const ushort* src2, size_t step2,
                    ushort* dst, size_t step, Size sz, void* )
{
#if (ARITHM_USE_IPP == 1)
    CV_IPP_CHECK()
    {
        fixSteps(sz, sizeof(dst[0]), step1, step2, step);
        if (0 <= ippiSub_16u_C1RSfs(src2, (int)step2, src1, (int)step1, dst, (int)step, ippiSize(sz), 0))
        {
            CV_IMPL_ADD(CV_IMPL_IPP);
            return;
        }
        setIppErrorStatus();
    }
#endif
    (vBinOp<ushort, OpSub<ushort>, IF_SIMD(VSub<ushort>)>(src1, step1, src2, step2, dst, step, sz));
}

static void sub16s( const short* src1, size_t step1,
                    const short* src2, size_t step2,
                    short* dst, size_t step, Size sz, void* )
{
#if (ARITHM_USE_IPP == 1)
    CV_IPP_CHECK()
    {
        fixSteps(sz, sizeof(dst[0]), step1, step2, step);
        if (0 <= ippiSub_16s_C1RSfs(src2, (int)step2, src1, (int)step1, dst, (int)step, ippiSize(sz), 0))
        {
            CV_IMPL_ADD(CV_IMPL_IPP);
            return;
        }
        setIppErrorStatus();
    }
#endif
    (vBinOp<short, OpSub<short>, IF_SIMD(VSub<short>)>(src1, step1, src2, step2, dst, step, sz));
}

static void sub32s( const int* src1, size_t step1,
                    const int* src2, size_t step2,
                    int* dst, size_t step, Size sz, void* )
{
    vBinOp32<int, OpSub<int>, IF_SIMD(VSub<int>)>(src1, step1, src2, step2, dst, step, sz);
}

static void sub32f( const float* src1, size_t step1,
                   const float* src2, size_t step2,
                   float* dst, size_t step, Size sz, void* )
{
#if (ARITHM_USE_IPP == 1)
    CV_IPP_CHECK()
    {
        fixSteps(sz, sizeof(dst[0]), step1, step2, step);
        if (0 <= ippiSub_32f_C1R(src2, (int)step2, src1, (int)step1, dst, (int)step, ippiSize(sz)))
        {
            CV_IMPL_ADD(CV_IMPL_IPP);
            return;
        }
        setIppErrorStatus();
    }
#endif
    (vBinOp32<float, OpSub<float>, IF_SIMD(VSub<float>)>(src1, step1, src2, step2, dst, step, sz));
}

static void sub64f( const double* src1, size_t step1,
                    const double* src2, size_t step2,
                    double* dst, size_t step, Size sz, void* )
{
    vBinOp64<double, OpSub<double>, IF_SIMD(VSub<double>)>(src1, step1, src2, step2, dst, step, sz);
}

template<> inline uchar OpMin<uchar>::operator ()(uchar a, uchar b) const { return CV_MIN_8U(a, b); }
template<> inline uchar OpMax<uchar>::operator ()(uchar a, uchar b) const { return CV_MAX_8U(a, b); }

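// min*/max* kernels. The IPP branch calls ippsMinEvery/ippsMaxEvery one row
// at a time; the 8u variants advance the row pointers with "s1 += step1"
// directly (uchar pointer arithmetic is already in bytes), while the wider
// types cast through uchar* to achieve the same byte-stride stepping.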
    952 static void max8u( const uchar* src1, size_t step1,
    953                    const uchar* src2, size_t step2,
    954                    uchar* dst, size_t step, Size sz, void* )
    955 {
    956 #if (ARITHM_USE_IPP == 1)
    957     CV_IPP_CHECK()
    958     {
    959         uchar* s1 = (uchar*)src1;
    960         uchar* s2 = (uchar*)src2;
    961         uchar* d  = dst;
    962         fixSteps(sz, sizeof(dst[0]), step1, step2, step);
    963         int i = 0;
    964         for(; i < sz.height; i++)
    965         {
    966             if (0 > ippsMaxEvery_8u(s1, s2, d, sz.width))
    967                 break;
    968             s1 += step1;
    969             s2 += step2;
    970             d  += step;
    971         }
    972         if (i == sz.height)
    973         {
    974             CV_IMPL_ADD(CV_IMPL_IPP);
    975             return;
    976         }
    977         setIppErrorStatus();
    978     }
    979 #endif
    980     vBinOp<uchar, OpMax<uchar>, IF_SIMD(VMax<uchar>)>(src1, step1, src2, step2, dst, step, sz);
    981 }
    982 
    983 static void max8s( const schar* src1, size_t step1,
    984                    const schar* src2, size_t step2,
    985                    schar* dst, size_t step, Size sz, void* )
    986 {
    987     vBinOp<schar, OpMax<schar>, IF_SIMD(VMax<schar>)>(src1, step1, src2, step2, dst, step, sz);
    988 }
    989 
    990 static void max16u( const ushort* src1, size_t step1,
    991                     const ushort* src2, size_t step2,
    992                     ushort* dst, size_t step, Size sz, void* )
    993 {
    994 #if (ARITHM_USE_IPP == 1)
    995     CV_IPP_CHECK()
    996     {
    997         ushort* s1 = (ushort*)src1;
    998         ushort* s2 = (ushort*)src2;
    999         ushort* d  = dst;
   1000         fixSteps(sz, sizeof(dst[0]), step1, step2, step);
   1001         int i = 0;
   1002         for(; i < sz.height; i++)
   1003         {
   1004             if (0 > ippsMaxEvery_16u(s1, s2, d, sz.width))
   1005                 break;
   1006             s1 = (ushort*)((uchar*)s1 + step1);
   1007             s2 = (ushort*)((uchar*)s2 + step2);
   1008             d  = (ushort*)((uchar*)d + step);
   1009         }
   1010         if (i == sz.height)
   1011         {
   1012             CV_IMPL_ADD(CV_IMPL_IPP);
   1013             return;
   1014         }
   1015         setIppErrorStatus();
   1016     }
   1017 #endif
   1018     vBinOp<ushort, OpMax<ushort>, IF_SIMD(VMax<ushort>)>(src1, step1, src2, step2, dst, step, sz);
   1019 }
   1020 
   1021 static void max16s( const short* src1, size_t step1,
   1022                     const short* src2, size_t step2,
   1023                     short* dst, size_t step, Size sz, void* )
   1024 {
   1025     vBinOp<short, OpMax<short>, IF_SIMD(VMax<short>)>(src1, step1, src2, step2, dst, step, sz);
   1026 }
   1027 
   1028 static void max32s( const int* src1, size_t step1,
   1029                     const int* src2, size_t step2,
   1030                     int* dst, size_t step, Size sz, void* )
   1031 {
   1032     vBinOp32<int, OpMax<int>, IF_SIMD(VMax<int>)>(src1, step1, src2, step2, dst, step, sz);
   1033 }
   1034 
   1035 static void max32f( const float* src1, size_t step1,
   1036                     const float* src2, size_t step2,
   1037                     float* dst, size_t step, Size sz, void* )
   1038 {
   1039 #if (ARITHM_USE_IPP == 1)
   1040     CV_IPP_CHECK()
   1041     {
   1042         float* s1 = (float*)src1;
   1043         float* s2 = (float*)src2;
   1044         float* d  = dst;
   1045         fixSteps(sz, sizeof(dst[0]), step1, step2, step);
   1046         int i = 0;
   1047         for(; i < sz.height; i++)
   1048         {
   1049             if (0 > ippsMaxEvery_32f(s1, s2, d, sz.width))
   1050                 break;
   1051             s1 = (float*)((uchar*)s1 + step1);
   1052             s2 = (float*)((uchar*)s2 + step2);
   1053             d  = (float*)((uchar*)d + step);
   1054         }
   1055         if (i == sz.height)
   1056         {
   1057             CV_IMPL_ADD(CV_IMPL_IPP);
   1058             return;
   1059         }
   1060         setIppErrorStatus();
   1061     }
   1062 #endif
   1063     vBinOp32<float, OpMax<float>, IF_SIMD(VMax<float>)>(src1, step1, src2, step2, dst, step, sz);
   1064 }
   1065 
   1066 static void max64f( const double* src1, size_t step1,
   1067                     const double* src2, size_t step2,
   1068                     double* dst, size_t step, Size sz, void* )
   1069 {
   1070 #if ARITHM_USE_IPP == 1
   1071     CV_IPP_CHECK()
   1072     {
   1073         double* s1 = (double*)src1;
   1074         double* s2 = (double*)src2;
   1075         double* d  = dst;
   1076         fixSteps(sz, sizeof(dst[0]), step1, step2, step);
   1077         int i = 0;
   1078         for(; i < sz.height; i++)
   1079         {
   1080             if (0 > ippsMaxEvery_64f(s1, s2, d, sz.width))
   1081                 break;
   1082             s1 = (double*)((uchar*)s1 + step1);
   1083             s2 = (double*)((uchar*)s2 + step2);
   1084             d  = (double*)((uchar*)d + step);
   1085         }
   1086         if (i == sz.height)
   1087         {
   1088             CV_IMPL_ADD(CV_IMPL_IPP);
   1089             return;
   1090         }
   1091         setIppErrorStatus();
   1092     }
   1093 #endif
   1094     vBinOp64<double, OpMax<double>, IF_SIMD(VMax<double>)>(src1, step1, src2, step2, dst, step, sz);
   1095 }
   1096 
   1097 static void min8u( const uchar* src1, size_t step1,
   1098                    const uchar* src2, size_t step2,
   1099                    uchar* dst, size_t step, Size sz, void* )
   1100 {
   1101 #if (ARITHM_USE_IPP == 1)
   1102     CV_IPP_CHECK()
   1103     {
   1104         uchar* s1 = (uchar*)src1;
   1105         uchar* s2 = (uchar*)src2;
   1106         uchar* d  = dst;
   1107         fixSteps(sz, sizeof(dst[0]), step1, step2, step);
   1108         int i = 0;
   1109         for(; i < sz.height; i++)
   1110         {
   1111             if (0 > ippsMinEvery_8u(s1, s2, d, sz.width))
   1112                 break;
   1113             s1 += step1;
   1114             s2 += step2;
   1115             d  += step;
   1116         }
   1117         if (i == sz.height)
   1118         {
   1119             CV_IMPL_ADD(CV_IMPL_IPP);
   1120             return;
   1121         }
   1122         setIppErrorStatus();
   1123     }
   1124 #endif
   1125     vBinOp<uchar, OpMin<uchar>, IF_SIMD(VMin<uchar>)>(src1, step1, src2, step2, dst, step, sz);
   1126 }
   1127 
   1128 static void min8s( const schar* src1, size_t step1,
   1129                    const schar* src2, size_t step2,
   1130                    schar* dst, size_t step, Size sz, void* )
   1131 {
   1132     vBinOp<schar, OpMin<schar>, IF_SIMD(VMin<schar>)>(src1, step1, src2, step2, dst, step, sz);
   1133 }
   1134 
   1135 static void min16u( const ushort* src1, size_t step1,
   1136                     const ushort* src2, size_t step2,
   1137                     ushort* dst, size_t step, Size sz, void* )
   1138 {
   1139 #if (ARITHM_USE_IPP == 1)
   1140     CV_IPP_CHECK()
   1141     {
   1142         ushort* s1 = (ushort*)src1;
   1143         ushort* s2 = (ushort*)src2;
   1144         ushort* d  = dst;
   1145         fixSteps(sz, sizeof(dst[0]), step1, step2, step);
   1146         int i = 0;
   1147         for(; i < sz.height; i++)
   1148         {
   1149             if (0 > ippsMinEvery_16u(s1, s2, d, sz.width))
   1150                 break;
   1151             s1 = (ushort*)((uchar*)s1 + step1);
   1152             s2 = (ushort*)((uchar*)s2 + step2);
   1153             d  = (ushort*)((uchar*)d + step);
   1154         }
   1155         if (i == sz.height)
   1156         {
   1157             CV_IMPL_ADD(CV_IMPL_IPP);
   1158             return;
   1159         }
   1160         setIppErrorStatus();
   1161     }
   1162 #endif
   1163     vBinOp<ushort, OpMin<ushort>, IF_SIMD(VMin<ushort>)>(src1, step1, src2, step2, dst, step, sz);
   1164 }
   1165 
   1166 static void min16s( const short* src1, size_t step1,
   1167                     const short* src2, size_t step2,
   1168                     short* dst, size_t step, Size sz, void* )
   1169 {
   1170     vBinOp<short, OpMin<short>, IF_SIMD(VMin<short>)>(src1, step1, src2, step2, dst, step, sz);
   1171 }
   1172 
   1173 static void min32s( const int* src1, size_t step1,
   1174                     const int* src2, size_t step2,
   1175                     int* dst, size_t step, Size sz, void* )
   1176 {
   1177     vBinOp32<int, OpMin<int>, IF_SIMD(VMin<int>)>(src1, step1, src2, step2, dst, step, sz);
   1178 }
   1179 
   1180 static void min32f( const float* src1, size_t step1,
   1181                     const float* src2, size_t step2,
   1182                     float* dst, size_t step, Size sz, void* )
   1183 {
   1184 #if (ARITHM_USE_IPP == 1)
   1185     CV_IPP_CHECK()
   1186     {
   1187         float* s1 = (float*)src1;
   1188         float* s2 = (float*)src2;
   1189         float* d  = dst;
   1190         fixSteps(sz, sizeof(dst[0]), step1, step2, step);
   1191         int i = 0;
   1192         for(; i < sz.height; i++)
   1193         {
   1194             if (0 > ippsMinEvery_32f(s1, s2, d, sz.width))
   1195                 break;
   1196             s1 = (float*)((uchar*)s1 + step1);
   1197             s2 = (float*)((uchar*)s2 + step2);
   1198             d  = (float*)((uchar*)d + step);
   1199         }
   1200         if (i == sz.height)
   1201         {
   1202             CV_IMPL_ADD(CV_IMPL_IPP);
   1203             return;
   1204         }
   1205         setIppErrorStatus();
   1206     }
   1207 #endif
   1208     vBinOp32<float, OpMin<float>, IF_SIMD(VMin<float>)>(src1, step1, src2, step2, dst, step, sz);
   1209 }
   1210 
   1211 static void min64f( const double* src1, size_t step1,
   1212                     const double* src2, size_t step2,
   1213                     double* dst, size_t step, Size sz, void* )
   1214 {
   1215 #if ARITHM_USE_IPP == 1
   1216     CV_IPP_CHECK()
   1217     {
   1218         double* s1 = (double*)src1;
   1219         double* s2 = (double*)src2;
   1220         double* d  = dst;
   1221         fixSteps(sz, sizeof(dst[0]), step1, step2, step);
   1222         int i = 0;
   1223         for(; i < sz.height; i++)
   1224         {
   1225             if (0 > ippsMinEvery_64f(s1, s2, d, sz.width))
   1226                 break;
   1227             s1 = (double*)((uchar*)s1 + step1);
   1228             s2 = (double*)((uchar*)s2 + step2);
   1229             d  = (double*)((uchar*)d + step);
   1230         }
   1231         if (i == sz.height)
   1232         {
   1233             CV_IMPL_ADD(CV_IMPL_IPP);
   1234             return;
   1235         }
   1236         setIppErrorStatus();
   1237     }
   1238 #endif
   1239     vBinOp64<double, OpMin<double>, IF_SIMD(VMin<double>)>(src1, step1, src2, step2, dst, step, sz);
   1240 }
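/* Dispatch pattern shared by the IPP-enabled kernels above: the IPP row
   primitive (ippsMinEvery_*) is tried row by row; if every row succeeds the
   function returns, otherwise setIppErrorStatus() records the failure and the
   whole image is processed again by the generic vBinOp/vBinOp32/vBinOp64
   path (SIMD where available, scalar otherwise). */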
   1241 
   1242 static void absdiff8u( const uchar* src1, size_t step1,
   1243                        const uchar* src2, size_t step2,
   1244                        uchar* dst, size_t step, Size sz, void* )
   1245 {
   1246 #if (ARITHM_USE_IPP == 1)
   1247     CV_IPP_CHECK()
   1248     {
   1249         fixSteps(sz, sizeof(dst[0]), step1, step2, step);
   1250         if (0 <= ippiAbsDiff_8u_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz)))
   1251         {
   1252             CV_IMPL_ADD(CV_IMPL_IPP);
   1253             return;
   1254         }
   1255         setIppErrorStatus();
   1256     }
   1257 #endif
   1258     vBinOp<uchar, OpAbsDiff<uchar>, IF_SIMD(VAbsDiff<uchar>)>(src1, step1, src2, step2, dst, step, sz);
   1259 }
   1260 
   1261 static void absdiff8s( const schar* src1, size_t step1,
   1262                        const schar* src2, size_t step2,
   1263                        schar* dst, size_t step, Size sz, void* )
   1264 {
   1265     vBinOp<schar, OpAbsDiff<schar>, IF_SIMD(VAbsDiff<schar>)>(src1, step1, src2, step2, dst, step, sz);
   1266 }
   1267 
   1268 static void absdiff16u( const ushort* src1, size_t step1,
   1269                         const ushort* src2, size_t step2,
   1270                         ushort* dst, size_t step, Size sz, void* )
   1271 {
   1272 #if (ARITHM_USE_IPP == 1)
   1273     CV_IPP_CHECK()
   1274     {
   1275         fixSteps(sz, sizeof(dst[0]), step1, step2, step);
   1276         if (0 <= ippiAbsDiff_16u_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz)))
   1277         {
   1278             CV_IMPL_ADD(CV_IMPL_IPP);
   1279             return;
   1280         }
   1281         setIppErrorStatus();
   1282     }
   1283 #endif
   1284     vBinOp<ushort, OpAbsDiff<ushort>, IF_SIMD(VAbsDiff<ushort>)>(src1, step1, src2, step2, dst, step, sz);
   1285 }
   1286 
   1287 static void absdiff16s( const short* src1, size_t step1,
   1288                         const short* src2, size_t step2,
   1289                         short* dst, size_t step, Size sz, void* )
   1290 {
   1291     vBinOp<short, OpAbsDiff<short>, IF_SIMD(VAbsDiff<short>)>(src1, step1, src2, step2, dst, step, sz);
   1292 }
   1293 
   1294 static void absdiff32s( const int* src1, size_t step1,
   1295                         const int* src2, size_t step2,
   1296                         int* dst, size_t step, Size sz, void* )
   1297 {
   1298     vBinOp32<int, OpAbsDiff<int>, IF_SIMD(VAbsDiff<int>)>(src1, step1, src2, step2, dst, step, sz);
   1299 }
   1300 
   1301 static void absdiff32f( const float* src1, size_t step1,
   1302                         const float* src2, size_t step2,
   1303                         float* dst, size_t step, Size sz, void* )
   1304 {
   1305 #if (ARITHM_USE_IPP == 1)
   1306     CV_IPP_CHECK()
   1307     {
   1308         fixSteps(sz, sizeof(dst[0]), step1, step2, step);
   1309         if (0 <= ippiAbsDiff_32f_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz)))
   1310         {
   1311             CV_IMPL_ADD(CV_IMPL_IPP);
   1312             return;
   1313         }
   1314         setIppErrorStatus();
   1315     }
   1316 #endif
   1317     vBinOp32<float, OpAbsDiff<float>, IF_SIMD(VAbsDiff<float>)>(src1, step1, src2, step2, dst, step, sz);
   1318 }
   1319 
   1320 static void absdiff64f( const double* src1, size_t step1,
   1321                         const double* src2, size_t step2,
   1322                         double* dst, size_t step, Size sz, void* )
   1323 {
   1324     vBinOp64<double, OpAbsDiff<double>, IF_SIMD(VAbsDiff<double>)>(src1, step1, src2, step2, dst, step, sz);
   1325 }
   1326 
   1327 
   1328 static void and8u( const uchar* src1, size_t step1,
   1329                    const uchar* src2, size_t step2,
   1330                    uchar* dst, size_t step, Size sz, void* )
   1331 {
   1332 #if (ARITHM_USE_IPP == 1)
   1333     CV_IPP_CHECK()
   1334     {
   1335         fixSteps(sz, sizeof(dst[0]), step1, step2, step);
   1336         if (0 <= ippiAnd_8u_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz)))
   1337         {
   1338             CV_IMPL_ADD(CV_IMPL_IPP);
   1339             return;
   1340         }
   1341         setIppErrorStatus();
   1342     }
   1343 #endif
   1344     vBinOp<uchar, OpAnd<uchar>, IF_SIMD(VAnd<uchar>)>(src1, step1, src2, step2, dst, step, sz);
   1345 }
   1346 
   1347 static void or8u( const uchar* src1, size_t step1,
   1348                   const uchar* src2, size_t step2,
   1349                   uchar* dst, size_t step, Size sz, void* )
   1350 {
   1351 #if (ARITHM_USE_IPP == 1)
   1352     CV_IPP_CHECK()
   1353     {
   1354         fixSteps(sz, sizeof(dst[0]), step1, step2, step);
   1355         if (0 <= ippiOr_8u_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz)))
   1356         {
   1357             CV_IMPL_ADD(CV_IMPL_IPP);
   1358             return;
   1359         }
   1360         setIppErrorStatus();
   1361     }
   1362 #endif
   1363     vBinOp<uchar, OpOr<uchar>, IF_SIMD(VOr<uchar>)>(src1, step1, src2, step2, dst, step, sz);
   1364 }
   1365 
   1366 static void xor8u( const uchar* src1, size_t step1,
   1367                    const uchar* src2, size_t step2,
   1368                    uchar* dst, size_t step, Size sz, void* )
   1369 {
   1370 #if (ARITHM_USE_IPP == 1)
   1371     CV_IPP_CHECK()
   1372     {
   1373         fixSteps(sz, sizeof(dst[0]), step1, step2, step);
   1374         if (0 <= ippiXor_8u_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz)))
   1375         {
   1376             CV_IMPL_ADD(CV_IMPL_IPP);
   1377             return;
   1378         }
   1379         setIppErrorStatus();
   1380     }
   1381 #endif
   1382     vBinOp<uchar, OpXor<uchar>, IF_SIMD(VXor<uchar>)>(src1, step1, src2, step2, dst, step, sz);
   1383 }
   1384 
   1385 static void not8u( const uchar* src1, size_t step1,
   1386                    const uchar* src2, size_t step2,
   1387                    uchar* dst, size_t step, Size sz, void* )
   1388 {
   1389 #if (ARITHM_USE_IPP == 1)
   1390     CV_IPP_CHECK()
   1391     {
   1392         fixSteps(sz, sizeof(dst[0]), step1, step2, step); (void)src2;
   1393         if (0 <= ippiNot_8u_C1R(src1, (int)step1, dst, (int)step, ippiSize(sz)))
   1394         {
   1395             CV_IMPL_ADD(CV_IMPL_IPP);
   1396             return;
   1397         }
   1398         setIppErrorStatus();
   1399     }
   1400 #endif
   1401     vBinOp<uchar, OpNot<uchar>, IF_SIMD(VNot<uchar>)>(src1, step1, src2, step2, dst, step, sz);
   1402 }
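/* All of the low-level kernels above have the BinaryFunc shape: two stepped
   source planes, one stepped destination plane, a Size in elements, steps in
   bytes. A scalar reference for and8u, shown only as a sketch of the
   semantics (the real kernels go through the IPP/SSE/NEON paths):

       for( int y = 0; y < sz.height; y++ )
       {
           const uchar* s1 = src1 + y*step1;
           const uchar* s2 = src2 + y*step2;
           uchar* d = dst + y*step;
           for( int x = 0; x < sz.width; x++ )
               d[x] = (uchar)(s1[x] & s2[x]);
       }
*/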
   1403 
   1404 /****************************************************************************************\
   1405 *                                   logical operations                                   *
   1406 \****************************************************************************************/
   1407 
   1408 void convertAndUnrollScalar( const Mat& sc, int buftype, uchar* scbuf, size_t blocksize )
   1409 {
   1410     int scn = (int)sc.total(), cn = CV_MAT_CN(buftype);
   1411     size_t esz = CV_ELEM_SIZE(buftype);
   1412     getConvertFunc(sc.depth(), buftype)(sc.ptr(), 1, 0, 1, scbuf, 1, Size(std::min(cn, scn), 1), 0);
   1413     // unroll the scalar
   1414     if( scn < cn )
   1415     {
   1416         CV_Assert( scn == 1 );
   1417         size_t esz1 = CV_ELEM_SIZE1(buftype);
   1418         for( size_t i = esz1; i < esz; i++ )
   1419             scbuf[i] = scbuf[i - esz1];
   1420     }
   1421     for( size_t i = esz; i < blocksize*esz; i++ )
   1422         scbuf[i] = scbuf[i - esz];
   1423 }
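/* Illustrative sketch of the buffer layout produced by this internal helper
   (standalone example, not code from this file): a 1x1 CV_64F scalar with
   value 5, unrolled for buftype CV_8UC3 and blocksize 4, fills the buffer
   with 5 5 5 | 5 5 5 | 5 5 5 | 5 5 5 -- the single channel is first
   replicated across the channels of one element, then the element is
   replicated blocksize times.

       cv::Mat sc = (cv::Mat_<double>(1, 1) << 5.0);
       uchar buf[3*4];
       cv::convertAndUnrollScalar(sc, CV_8UC3, buf, 4);
*/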
   1424 
   1425 
   1426 enum { OCL_OP_ADD=0, OCL_OP_SUB=1, OCL_OP_RSUB=2, OCL_OP_ABSDIFF=3, OCL_OP_MUL=4,
   1427        OCL_OP_MUL_SCALE=5, OCL_OP_DIV_SCALE=6, OCL_OP_RECIP_SCALE=7, OCL_OP_ADDW=8,
   1428        OCL_OP_AND=9, OCL_OP_OR=10, OCL_OP_XOR=11, OCL_OP_NOT=12, OCL_OP_MIN=13, OCL_OP_MAX=14,
   1429        OCL_OP_RDIV_SCALE=15 };
   1430 
   1431 #ifdef HAVE_OPENCL
   1432 
   1433 static const char* oclop2str[] = { "OP_ADD", "OP_SUB", "OP_RSUB", "OP_ABSDIFF",
   1434     "OP_MUL", "OP_MUL_SCALE", "OP_DIV_SCALE", "OP_RECIP_SCALE",
   1435     "OP_ADDW", "OP_AND", "OP_OR", "OP_XOR", "OP_NOT", "OP_MIN", "OP_MAX", "OP_RDIV_SCALE", 0 };
   1436 
   1437 static bool ocl_binary_op(InputArray _src1, InputArray _src2, OutputArray _dst,
   1438                           InputArray _mask, bool bitwise, int oclop, bool haveScalar )
   1439 {
   1440     bool haveMask = !_mask.empty();
   1441     int srctype = _src1.type();
   1442     int srcdepth = CV_MAT_DEPTH(srctype);
   1443     int cn = CV_MAT_CN(srctype);
   1444 
   1445     const ocl::Device d = ocl::Device::getDefault();
   1446     bool doubleSupport = d.doubleFPConfig() > 0;
   1447     if( oclop < 0 || ((haveMask || haveScalar) && cn > 4) ||
   1448             (!doubleSupport && srcdepth == CV_64F && !bitwise))
   1449         return false;
   1450 
   1451     char opts[1024];
   1452     int kercn = haveMask || haveScalar ? cn : ocl::predictOptimalVectorWidth(_src1, _src2, _dst);
   1453     int scalarcn = kercn == 3 ? 4 : kercn;
   1454     int rowsPerWI = d.isIntel() ? 4 : 1;
   1455 
   1456     sprintf(opts, "-D %s%s -D %s -D dstT=%s%s -D dstT_C1=%s -D workST=%s -D cn=%d -D rowsPerWI=%d",
   1457             haveMask ? "MASK_" : "", haveScalar ? "UNARY_OP" : "BINARY_OP", oclop2str[oclop],
   1458             bitwise ? ocl::memopTypeToStr(CV_MAKETYPE(srcdepth, kercn)) :
   1459                 ocl::typeToStr(CV_MAKETYPE(srcdepth, kercn)), doubleSupport ? " -D DOUBLE_SUPPORT" : "",
   1460             bitwise ? ocl::memopTypeToStr(CV_MAKETYPE(srcdepth, 1)) :
   1461                 ocl::typeToStr(CV_MAKETYPE(srcdepth, 1)),
   1462             bitwise ? ocl::memopTypeToStr(CV_MAKETYPE(srcdepth, scalarcn)) :
   1463                 ocl::typeToStr(CV_MAKETYPE(srcdepth, scalarcn)),
   1464             kercn, rowsPerWI);
   1465 
   1466     ocl::Kernel k("KF", ocl::core::arithm_oclsrc, opts);
   1467     if (k.empty())
   1468         return false;
   1469 
   1470     UMat src1 = _src1.getUMat(), src2;
   1471     UMat dst = _dst.getUMat(), mask = _mask.getUMat();
   1472 
   1473     ocl::KernelArg src1arg = ocl::KernelArg::ReadOnlyNoSize(src1, cn, kercn);
   1474     ocl::KernelArg dstarg = haveMask ? ocl::KernelArg::ReadWrite(dst, cn, kercn) :
   1475                                        ocl::KernelArg::WriteOnly(dst, cn, kercn);
   1476     ocl::KernelArg maskarg = ocl::KernelArg::ReadOnlyNoSize(mask, 1);
   1477 
   1478     if( haveScalar )
   1479     {
   1480         size_t esz = CV_ELEM_SIZE1(srctype)*scalarcn;
   1481         double buf[4] = {0,0,0,0};
   1482 
   1483         if( oclop != OCL_OP_NOT )
   1484         {
   1485             Mat src2sc = _src2.getMat();
   1486             convertAndUnrollScalar(src2sc, srctype, (uchar*)buf, 1);
   1487         }
   1488 
   1489         ocl::KernelArg scalararg = ocl::KernelArg(0, 0, 0, 0, buf, esz);
   1490 
   1491         if( !haveMask )
   1492             k.args(src1arg, dstarg, scalararg);
   1493         else
   1494             k.args(src1arg, maskarg, dstarg, scalararg);
   1495     }
   1496     else
   1497     {
   1498         src2 = _src2.getUMat();
   1499         ocl::KernelArg src2arg = ocl::KernelArg::ReadOnlyNoSize(src2, cn, kercn);
   1500 
   1501         if( !haveMask )
   1502             k.args(src1arg, src2arg, dstarg);
   1503         else
   1504             k.args(src1arg, src2arg, maskarg, dstarg);
   1505     }
   1506 
   1507     size_t globalsize[] = { src1.cols * cn / kercn, (src1.rows + rowsPerWI - 1) / rowsPerWI };
   1508     return k.run(2, globalsize, 0, false);
   1509 }
   1510 
   1511 #endif
   1512 
   1513 static void binary_op( InputArray _src1, InputArray _src2, OutputArray _dst,
   1514                        InputArray _mask, const BinaryFunc* tab,
   1515                        bool bitwise, int oclop )
   1516 {
   1517     const _InputArray *psrc1 = &_src1, *psrc2 = &_src2;
   1518     int kind1 = psrc1->kind(), kind2 = psrc2->kind();
   1519     int type1 = psrc1->type(), depth1 = CV_MAT_DEPTH(type1), cn = CV_MAT_CN(type1);
   1520     int type2 = psrc2->type(), depth2 = CV_MAT_DEPTH(type2), cn2 = CV_MAT_CN(type2);
   1521     int dims1 = psrc1->dims(), dims2 = psrc2->dims();
   1522     Size sz1 = dims1 <= 2 ? psrc1->size() : Size();
   1523     Size sz2 = dims2 <= 2 ? psrc2->size() : Size();
   1524 #ifdef HAVE_OPENCL
   1525     bool use_opencl = (kind1 == _InputArray::UMAT || kind2 == _InputArray::UMAT) &&
   1526             dims1 <= 2 && dims2 <= 2;
   1527 #endif
   1528     bool haveMask = !_mask.empty(), haveScalar = false;
   1529     BinaryFunc func;
   1530 
   1531     if( dims1 <= 2 && dims2 <= 2 && kind1 == kind2 && sz1 == sz2 && type1 == type2 && !haveMask )
   1532     {
   1533         _dst.create(sz1, type1);
   1534         CV_OCL_RUN(use_opencl,
   1535                    ocl_binary_op(*psrc1, *psrc2, _dst, _mask, bitwise, oclop, false))
   1536 
   1537         if( bitwise )
   1538         {
   1539             func = *tab;
   1540             cn = (int)CV_ELEM_SIZE(type1);
   1541         }
   1542         else
   1543             func = tab[depth1];
   1544 
   1545         Mat src1 = psrc1->getMat(), src2 = psrc2->getMat(), dst = _dst.getMat();
   1546         Size sz = getContinuousSize(src1, src2, dst);
   1547         size_t len = sz.width*(size_t)cn;
   1548         if( len == (size_t)(int)len )
   1549         {
   1550             sz.width = (int)len;
   1551             func(src1.ptr(), src1.step, src2.ptr(), src2.step, dst.ptr(), dst.step, sz, 0);
   1552             return;
   1553         }
   1554     }
   1555 
   1556     if( oclop == OCL_OP_NOT )
   1557         haveScalar = true;
   1558     else if( (kind1 == _InputArray::MATX) + (kind2 == _InputArray::MATX) == 1 ||
   1559         !psrc1->sameSize(*psrc2) || type1 != type2 )
   1560     {
   1561         if( checkScalar(*psrc1, type2, kind1, kind2) )
   1562         {
   1563             // src1 is a scalar; swap it with src2
   1564             swap(psrc1, psrc2);
   1565             swap(type1, type2);
   1566             swap(depth1, depth2);
   1567             swap(cn, cn2);
   1568             swap(sz1, sz2);
   1569         }
   1570         else if( !checkScalar(*psrc2, type1, kind2, kind1) )
   1571             CV_Error( CV_StsUnmatchedSizes,
   1572                       "The operation is neither 'array op array' (where arrays have the same size and type), "
   1573                       "nor 'array op scalar', nor 'scalar op array'" );
   1574         haveScalar = true;
   1575     }
   1576     else
   1577     {
   1578         CV_Assert( psrc1->sameSize(*psrc2) && type1 == type2 );
   1579     }
   1580 
   1581     size_t esz = CV_ELEM_SIZE(type1);
   1582     size_t blocksize0 = (BLOCK_SIZE + esz-1)/esz;
   1583     BinaryFunc copymask = 0;
   1584     bool reallocate = false;
   1585 
   1586     if( haveMask )
   1587     {
   1588         int mtype = _mask.type();
   1589         CV_Assert( (mtype == CV_8U || mtype == CV_8S) && _mask.sameSize(*psrc1));
   1590         copymask = getCopyMaskFunc(esz);
   1591         reallocate = !_dst.sameSize(*psrc1) || _dst.type() != type1;
   1592     }
   1593 
   1594     AutoBuffer<uchar> _buf;
   1595     uchar *scbuf = 0, *maskbuf = 0;
   1596 
   1597     _dst.createSameSize(*psrc1, type1);
   1598     // if this is a mask operation and dst has been reallocated,
   1599     // we have to clear the destination
   1600     if( haveMask && reallocate )
   1601         _dst.setTo(0.);
   1602 
   1603     CV_OCL_RUN(use_opencl,
   1604                ocl_binary_op(*psrc1, *psrc2, _dst, _mask, bitwise, oclop, haveScalar))
   1605 
   1606 
   1607     Mat src1 = psrc1->getMat(), src2 = psrc2->getMat();
   1608     Mat dst = _dst.getMat(), mask = _mask.getMat();
   1609 
   1610     if( bitwise )
   1611     {
   1612         func = *tab;
   1613         cn = (int)esz;
   1614     }
   1615     else
   1616         func = tab[depth1];
   1617 
   1618     if( !haveScalar )
   1619     {
   1620         const Mat* arrays[] = { &src1, &src2, &dst, &mask, 0 };
   1621         uchar* ptrs[4];
   1622 
   1623         NAryMatIterator it(arrays, ptrs);
   1624         size_t total = it.size, blocksize = total;
   1625 
   1626         if( blocksize*cn > INT_MAX )
   1627             blocksize = INT_MAX/cn;
   1628 
   1629         if( haveMask )
   1630         {
   1631             blocksize = std::min(blocksize, blocksize0);
   1632             _buf.allocate(blocksize*esz);
   1633             maskbuf = _buf;
   1634         }
   1635 
   1636         for( size_t i = 0; i < it.nplanes; i++, ++it )
   1637         {
   1638             for( size_t j = 0; j < total; j += blocksize )
   1639             {
   1640                 int bsz = (int)MIN(total - j, blocksize);
   1641 
   1642                 func( ptrs[0], 0, ptrs[1], 0, haveMask ? maskbuf : ptrs[2], 0, Size(bsz*cn, 1), 0 );
   1643                 if( haveMask )
   1644                 {
   1645                     copymask( maskbuf, 0, ptrs[3], 0, ptrs[2], 0, Size(bsz, 1), &esz );
   1646                     ptrs[3] += bsz;
   1647                 }
   1648 
   1649                 bsz *= (int)esz;
   1650                 ptrs[0] += bsz; ptrs[1] += bsz; ptrs[2] += bsz;
   1651             }
   1652         }
   1653     }
   1654     else
   1655     {
   1656         const Mat* arrays[] = { &src1, &dst, &mask, 0 };
   1657         uchar* ptrs[3];
   1658 
   1659         NAryMatIterator it(arrays, ptrs);
   1660         size_t total = it.size, blocksize = std::min(total, blocksize0);
   1661 
   1662         _buf.allocate(blocksize*(haveMask ? 2 : 1)*esz + 32);
   1663         scbuf = _buf;
   1664         maskbuf = alignPtr(scbuf + blocksize*esz, 16);
   1665 
   1666         convertAndUnrollScalar( src2, src1.type(), scbuf, blocksize);
   1667 
   1668         for( size_t i = 0; i < it.nplanes; i++, ++it )
   1669         {
   1670             for( size_t j = 0; j < total; j += blocksize )
   1671             {
   1672                 int bsz = (int)MIN(total - j, blocksize);
   1673 
   1674                 func( ptrs[0], 0, scbuf, 0, haveMask ? maskbuf : ptrs[1], 0, Size(bsz*cn, 1), 0 );
   1675                 if( haveMask )
   1676                 {
   1677                     copymask( maskbuf, 0, ptrs[2], 0, ptrs[1], 0, Size(bsz, 1), &esz );
   1678                     ptrs[2] += bsz;
   1679                 }
   1680 
   1681                 bsz *= (int)esz;
   1682                 ptrs[0] += bsz; ptrs[1] += bsz;
   1683             }
   1684         }
   1685     }
   1686 }
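/* Sketch of the masked store inside binary_op above: the kernel writes a full
   block into maskbuf, and copymask then copies only the elements whose mask
   byte is non-zero into the destination (element size esz), roughly:

       for( int x = 0; x < bsz; x++ )
           if( maskptr[x] )
               memcpy(dstptr + x*esz, maskbuf + x*esz, esz);
*/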
   1687 
   1688 static BinaryFunc* getMaxTab()
   1689 {
   1690     static BinaryFunc maxTab[] =
   1691     {
   1692         (BinaryFunc)GET_OPTIMIZED(max8u), (BinaryFunc)GET_OPTIMIZED(max8s),
   1693         (BinaryFunc)GET_OPTIMIZED(max16u), (BinaryFunc)GET_OPTIMIZED(max16s),
   1694         (BinaryFunc)GET_OPTIMIZED(max32s),
   1695         (BinaryFunc)GET_OPTIMIZED(max32f), (BinaryFunc)max64f,
   1696         0
   1697     };
   1698 
   1699     return maxTab;
   1700 }
   1701 
   1702 static BinaryFunc* getMinTab()
   1703 {
   1704     static BinaryFunc minTab[] =
   1705     {
   1706         (BinaryFunc)GET_OPTIMIZED(min8u), (BinaryFunc)GET_OPTIMIZED(min8s),
   1707         (BinaryFunc)GET_OPTIMIZED(min16u), (BinaryFunc)GET_OPTIMIZED(min16s),
   1708         (BinaryFunc)GET_OPTIMIZED(min32s),
   1709         (BinaryFunc)GET_OPTIMIZED(min32f), (BinaryFunc)min64f,
   1710         0
   1711     };
   1712 
   1713     return minTab;
   1714 }
   1715 
   1716 }
   1717 
   1718 void cv::bitwise_and(InputArray a, InputArray b, OutputArray c, InputArray mask)
   1719 {
   1720     BinaryFunc f = (BinaryFunc)GET_OPTIMIZED(and8u);
   1721     binary_op(a, b, c, mask, &f, true, OCL_OP_AND);
   1722 }
   1723 
   1724 void cv::bitwise_or(InputArray a, InputArray b, OutputArray c, InputArray mask)
   1725 {
   1726     BinaryFunc f = (BinaryFunc)GET_OPTIMIZED(or8u);
   1727     binary_op(a, b, c, mask, &f, true, OCL_OP_OR);
   1728 }
   1729 
   1730 void cv::bitwise_xor(InputArray a, InputArray b, OutputArray c, InputArray mask)
   1731 {
   1732     BinaryFunc f = (BinaryFunc)GET_OPTIMIZED(xor8u);
   1733     binary_op(a, b, c, mask, &f, true, OCL_OP_XOR);
   1734 }
   1735 
   1736 void cv::bitwise_not(InputArray a, OutputArray c, InputArray mask)
   1737 {
   1738     BinaryFunc f = (BinaryFunc)GET_OPTIMIZED(not8u);
   1739     binary_op(a, a, c, mask, &f, true, OCL_OP_NOT);
   1740 }
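/* Usage sketch for the bitwise wrappers above (standalone example; assumes
   the caller includes <opencv2/core.hpp>):

       cv::Mat a(4, 4, CV_8UC1, cv::Scalar(0xF0));
       cv::Mat b(4, 4, CV_8UC1, cv::Scalar(0x3C));
       cv::Mat mask(4, 4, CV_8UC1, cv::Scalar(255));
       cv::Mat c;
       cv::bitwise_and(a, b, c, mask);   // c == 0x30 wherever mask != 0
       cv::bitwise_not(a, c);            // c == 0x0F, mask omitted
*/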
   1741 
   1742 void cv::max( InputArray src1, InputArray src2, OutputArray dst )
   1743 {
   1744     binary_op(src1, src2, dst, noArray(), getMaxTab(), false, OCL_OP_MAX );
   1745 }
   1746 
   1747 void cv::min( InputArray src1, InputArray src2, OutputArray dst )
   1748 {
   1749     binary_op(src1, src2, dst, noArray(), getMinTab(), false, OCL_OP_MIN );
   1750 }
   1751 
   1752 void cv::max(const Mat& src1, const Mat& src2, Mat& dst)
   1753 {
   1754     OutputArray _dst(dst);
   1755     binary_op(src1, src2, _dst, noArray(), getMaxTab(), false, OCL_OP_MAX );
   1756 }
   1757 
   1758 void cv::min(const Mat& src1, const Mat& src2, Mat& dst)
   1759 {
   1760     OutputArray _dst(dst);
   1761     binary_op(src1, src2, _dst, noArray(), getMinTab(), false, OCL_OP_MIN );
   1762 }
   1763 
   1764 void cv::max(const UMat& src1, const UMat& src2, UMat& dst)
   1765 {
   1766     OutputArray _dst(dst);
   1767     binary_op(src1, src2, _dst, noArray(), getMaxTab(), false, OCL_OP_MAX );
   1768 }
   1769 
   1770 void cv::min(const UMat& src1, const UMat& src2, UMat& dst)
   1771 {
   1772     OutputArray _dst(dst);
   1773     binary_op(src1, src2, _dst, noArray(), getMinTab(), false, OCL_OP_MIN );
   1774 }
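/* Usage sketch for the element-wise min/max wrappers above (standalone
   example):

       cv::Mat a = (cv::Mat_<float>(1, 3) << 1.f, 5.f, 3.f);
       cv::Mat b = (cv::Mat_<float>(1, 3) << 4.f, 2.f, 3.f);
       cv::Mat lo, hi;
       cv::min(a, b, lo);   // lo = [1, 2, 3]
       cv::max(a, b, hi);   // hi = [4, 5, 3]
*/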
   1775 
   1776 
   1777 /****************************************************************************************\
   1778 *                                      add/subtract                                      *
   1779 \****************************************************************************************/
   1780 
   1781 namespace cv
   1782 {
   1783 
   1784 static int actualScalarDepth(const double* data, int len)
   1785 {
   1786     int i = 0, minval = INT_MAX, maxval = INT_MIN;
   1787     for(; i < len; ++i)
   1788     {
   1789         int ival = cvRound(data[i]);
   1790         if( ival != data[i] )
   1791             break;
   1792         minval = MIN(minval, ival);
   1793         maxval = MAX(maxval, ival);
   1794     }
   1795     return i < len ? CV_64F :
   1796         minval >= 0 && maxval <= (int)UCHAR_MAX ? CV_8U :
   1797         minval >= (int)SCHAR_MIN && maxval <= (int)SCHAR_MAX ? CV_8S :
   1798         minval >= 0 && maxval <= (int)USHRT_MAX ? CV_16U :
   1799         minval >= (int)SHRT_MIN && maxval <= (int)SHRT_MAX ? CV_16S :
   1800         CV_32S;
   1801 }
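/* actualScalarDepth picks the narrowest integer depth that represents every
   scalar channel exactly, and CV_64F otherwise. Following the logic above:
   {255} -> CV_8U, {-1} -> CV_8S, {40000} -> CV_16U, {-1000} -> CV_16S,
   {100000} -> CV_32S, {0.5} -> CV_64F. */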
   1802 
   1803 #ifdef HAVE_OPENCL
   1804 
   1805 static bool ocl_arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
   1806                           InputArray _mask, int wtype,
   1807                           void* usrdata, int oclop,
   1808                           bool haveScalar )
   1809 {
   1810     const ocl::Device d = ocl::Device::getDefault();
   1811     bool doubleSupport = d.doubleFPConfig() > 0;
   1812     int type1 = _src1.type(), depth1 = CV_MAT_DEPTH(type1), cn = CV_MAT_CN(type1);
   1813     bool haveMask = !_mask.empty();
   1814 
   1815     if ( (haveMask || haveScalar) && cn > 4 )
   1816         return false;
   1817 
   1818     int dtype = _dst.type(), ddepth = CV_MAT_DEPTH(dtype), wdepth = std::max(CV_32S, CV_MAT_DEPTH(wtype));
   1819     if (!doubleSupport)
   1820         wdepth = std::min(wdepth, CV_32F);
   1821 
   1822     wtype = CV_MAKETYPE(wdepth, cn);
   1823     int type2 = haveScalar ? wtype : _src2.type(), depth2 = CV_MAT_DEPTH(type2);
   1824     if (!doubleSupport && (depth2 == CV_64F || depth1 == CV_64F))
   1825         return false;
   1826 
   1827     int kercn = haveMask || haveScalar ? cn : ocl::predictOptimalVectorWidth(_src1, _src2, _dst);
   1828     int scalarcn = kercn == 3 ? 4 : kercn, rowsPerWI = d.isIntel() ? 4 : 1;
   1829 
   1830     char cvtstr[4][32], opts[1024];
   1831     sprintf(opts, "-D %s%s -D %s -D srcT1=%s -D srcT1_C1=%s -D srcT2=%s -D srcT2_C1=%s "
   1832             "-D dstT=%s -D dstT_C1=%s -D workT=%s -D workST=%s -D scaleT=%s -D wdepth=%d -D convertToWT1=%s "
   1833             "-D convertToWT2=%s -D convertToDT=%s%s -D cn=%d -D rowsPerWI=%d -D convertFromU=%s",
   1834             (haveMask ? "MASK_" : ""), (haveScalar ? "UNARY_OP" : "BINARY_OP"),
   1835             oclop2str[oclop], ocl::typeToStr(CV_MAKETYPE(depth1, kercn)),
   1836             ocl::typeToStr(depth1), ocl::typeToStr(CV_MAKETYPE(depth2, kercn)),
   1837             ocl::typeToStr(depth2), ocl::typeToStr(CV_MAKETYPE(ddepth, kercn)),
   1838             ocl::typeToStr(ddepth), ocl::typeToStr(CV_MAKETYPE(wdepth, kercn)),
   1839             ocl::typeToStr(CV_MAKETYPE(wdepth, scalarcn)),
   1840             ocl::typeToStr(wdepth), wdepth,
   1841             ocl::convertTypeStr(depth1, wdepth, kercn, cvtstr[0]),
   1842             ocl::convertTypeStr(depth2, wdepth, kercn, cvtstr[1]),
   1843             ocl::convertTypeStr(wdepth, ddepth, kercn, cvtstr[2]),
   1844             doubleSupport ? " -D DOUBLE_SUPPORT" : "", kercn, rowsPerWI,
   1845             oclop == OCL_OP_ABSDIFF && wdepth == CV_32S && ddepth == wdepth ?
   1846             ocl::convertTypeStr(CV_8U, ddepth, kercn, cvtstr[3]) : "noconvert");
   1847 
   1848     size_t usrdata_esz = CV_ELEM_SIZE(wdepth);
   1849     const uchar* usrdata_p = (const uchar*)usrdata;
   1850     const double* usrdata_d = (const double*)usrdata;
   1851     float usrdata_f[3];
   1852     int i, n = oclop == OCL_OP_MUL_SCALE || oclop == OCL_OP_DIV_SCALE ||
   1853         oclop == OCL_OP_RDIV_SCALE || oclop == OCL_OP_RECIP_SCALE ? 1 : oclop == OCL_OP_ADDW ? 3 : 0;
   1854     if( n > 0 && wdepth == CV_32F )
   1855     {
   1856         for( i = 0; i < n; i++ )
   1857             usrdata_f[i] = (float)usrdata_d[i];
   1858         usrdata_p = (const uchar*)usrdata_f;
   1859     }
   1860 
   1861     ocl::Kernel k("KF", ocl::core::arithm_oclsrc, opts);
   1862     if (k.empty())
   1863         return false;
   1864 
   1865     UMat src1 = _src1.getUMat(), src2;
   1866     UMat dst = _dst.getUMat(), mask = _mask.getUMat();
   1867 
   1868     ocl::KernelArg src1arg = ocl::KernelArg::ReadOnlyNoSize(src1, cn, kercn);
   1869     ocl::KernelArg dstarg = haveMask ? ocl::KernelArg::ReadWrite(dst, cn, kercn) :
   1870                                        ocl::KernelArg::WriteOnly(dst, cn, kercn);
   1871     ocl::KernelArg maskarg = ocl::KernelArg::ReadOnlyNoSize(mask, 1);
   1872 
   1873     if( haveScalar )
   1874     {
   1875         size_t esz = CV_ELEM_SIZE1(wtype)*scalarcn;
   1876         double buf[4]={0,0,0,0};
   1877         Mat src2sc = _src2.getMat();
   1878 
   1879         if( !src2sc.empty() )
   1880             convertAndUnrollScalar(src2sc, wtype, (uchar*)buf, 1);
   1881         ocl::KernelArg scalararg = ocl::KernelArg(0, 0, 0, 0, buf, esz);
   1882 
   1883         if( !haveMask )
   1884         {
   1885             if(n == 0)
   1886                 k.args(src1arg, dstarg, scalararg);
   1887             else if(n == 1)
   1888                 k.args(src1arg, dstarg, scalararg,
   1889                        ocl::KernelArg(0, 0, 0, 0, usrdata_p, usrdata_esz));
   1890             else
   1891                 CV_Error(Error::StsNotImplemented, "unsupported number of extra parameters");
   1892         }
   1893         else
   1894             k.args(src1arg, maskarg, dstarg, scalararg);
   1895     }
   1896     else
   1897     {
   1898         src2 = _src2.getUMat();
   1899         ocl::KernelArg src2arg = ocl::KernelArg::ReadOnlyNoSize(src2, cn, kercn);
   1900 
   1901         if( !haveMask )
   1902         {
   1903             if (n == 0)
   1904                 k.args(src1arg, src2arg, dstarg);
   1905             else if (n == 1)
   1906                 k.args(src1arg, src2arg, dstarg,
   1907                        ocl::KernelArg(0, 0, 0, 0, usrdata_p, usrdata_esz));
   1908             else if (n == 3)
   1909                 k.args(src1arg, src2arg, dstarg,
   1910                        ocl::KernelArg(0, 0, 0, 0, usrdata_p, usrdata_esz),
   1911                        ocl::KernelArg(0, 0, 0, 0, usrdata_p + usrdata_esz, usrdata_esz),
   1912                        ocl::KernelArg(0, 0, 0, 0, usrdata_p + usrdata_esz*2, usrdata_esz));
   1913             else
   1914                 CV_Error(Error::StsNotImplemented, "unsupported number of extra parameters");
   1915         }
   1916         else
   1917             k.args(src1arg, src2arg, maskarg, dstarg);
   1918     }
   1919 
   1920     size_t globalsize[] = { src1.cols * cn / kercn, (src1.rows + rowsPerWI - 1) / rowsPerWI };
   1921     return k.run(2, globalsize, NULL, false);
   1922 }
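/* Note on the extra kernel arguments above: n == 1 passes a single scale
   factor (the *_SCALE operations) and n == 3 passes the three addWeighted
   coefficients (alpha, beta, gamma); when the work depth is CV_32F they are
   narrowed from double to float via usrdata_f before being bound. */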
   1923 
   1924 #endif
   1925 
   1926 static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
   1927                       InputArray _mask, int dtype, BinaryFunc* tab, bool muldiv=false,
   1928                       void* usrdata=0, int oclop=-1 )
   1929 {
   1930     const _InputArray *psrc1 = &_src1, *psrc2 = &_src2;
   1931     int kind1 = psrc1->kind(), kind2 = psrc2->kind();
   1932     bool haveMask = !_mask.empty();
   1933     bool reallocate = false;
   1934     int type1 = psrc1->type(), depth1 = CV_MAT_DEPTH(type1), cn = CV_MAT_CN(type1);
   1935     int type2 = psrc2->type(), depth2 = CV_MAT_DEPTH(type2), cn2 = CV_MAT_CN(type2);
   1936     int wtype, dims1 = psrc1->dims(), dims2 = psrc2->dims();
   1937     Size sz1 = dims1 <= 2 ? psrc1->size() : Size();
   1938     Size sz2 = dims2 <= 2 ? psrc2->size() : Size();
   1939 #ifdef HAVE_OPENCL
   1940     bool use_opencl = OCL_PERFORMANCE_CHECK(_dst.isUMat()) && dims1 <= 2 && dims2 <= 2;
   1941 #endif
   1942     bool src1Scalar = checkScalar(*psrc1, type2, kind1, kind2);
   1943     bool src2Scalar = checkScalar(*psrc2, type1, kind2, kind1);
   1944 
   1945     if( (kind1 == kind2 || cn == 1) && sz1 == sz2 && dims1 <= 2 && dims2 <= 2 && type1 == type2 &&
   1946         !haveMask && ((!_dst.fixedType() && (dtype < 0 || CV_MAT_DEPTH(dtype) == depth1)) ||
   1947                        (_dst.fixedType() && _dst.type() == type1)) &&
   1948         ((src1Scalar && src2Scalar) || (!src1Scalar && !src2Scalar)) )
   1949     {
   1950         _dst.createSameSize(*psrc1, type1);
   1951         CV_OCL_RUN(use_opencl,
   1952             ocl_arithm_op(*psrc1, *psrc2, _dst, _mask,
   1953                           (!usrdata ? type1 : std::max(depth1, CV_32F)),
   1954                           usrdata, oclop, false))
   1955 
   1956         Mat src1 = psrc1->getMat(), src2 = psrc2->getMat(), dst = _dst.getMat();
   1957         Size sz = getContinuousSize(src1, src2, dst, src1.channels());
   1958         tab[depth1](src1.ptr(), src1.step, src2.ptr(), src2.step, dst.ptr(), dst.step, sz, usrdata);
   1959         return;
   1960     }
   1961 
   1962     bool haveScalar = false, swapped12 = false;
   1963 
   1964     if( dims1 != dims2 || sz1 != sz2 || cn != cn2 ||
   1965         (kind1 == _InputArray::MATX && (sz1 == Size(1,4) || sz1 == Size(1,1))) ||
   1966         (kind2 == _InputArray::MATX && (sz2 == Size(1,4) || sz2 == Size(1,1))) )
   1967     {
   1968         if( checkScalar(*psrc1, type2, kind1, kind2) )
   1969         {
   1970             // src1 is a scalar; swap it with src2
   1971             swap(psrc1, psrc2);
   1972             swap(sz1, sz2);
   1973             swap(type1, type2);
   1974             swap(depth1, depth2);
   1975             swap(cn, cn2);
   1976             swap(dims1, dims2);
   1977             swapped12 = true;
   1978             if( oclop == OCL_OP_SUB )
   1979                 oclop = OCL_OP_RSUB;
   1980             if ( oclop == OCL_OP_DIV_SCALE )
   1981                 oclop = OCL_OP_RDIV_SCALE;
   1982         }
   1983         else if( !checkScalar(*psrc2, type1, kind2, kind1) )
   1984             CV_Error( CV_StsUnmatchedSizes,
   1985                      "The operation is neither 'array op array' "
   1986                      "(where arrays have the same size and the same number of channels), "
   1987                      "nor 'array op scalar', nor 'scalar op array'" );
   1988         haveScalar = true;
   1989         CV_Assert(type2 == CV_64F && (sz2.height == 1 || sz2.height == 4));
   1990 
   1991         if (!muldiv)
   1992         {
   1993             Mat sc = psrc2->getMat();
   1994             depth2 = actualScalarDepth(sc.ptr<double>(), cn);
   1995             if( depth2 == CV_64F && (depth1 < CV_32S || depth1 == CV_32F) )
   1996                 depth2 = CV_32F;
   1997         }
   1998         else
   1999             depth2 = CV_64F;
   2000     }
   2001 
   2002     if( dtype < 0 )
   2003     {
   2004         if( _dst.fixedType() )
   2005             dtype = _dst.type();
   2006         else
   2007         {
   2008             if( !haveScalar && type1 != type2 )
   2009                 CV_Error(CV_StsBadArg,
   2010                      "When the input arrays in add/subtract/multiply/divide functions have different types, "
   2011                      "the output array type must be explicitly specified");
   2012             dtype = type1;
   2013         }
   2014     }
   2015     dtype = CV_MAT_DEPTH(dtype);
   2016 
   2017     if( depth1 == depth2 && dtype == depth1 )
   2018         wtype = dtype;
   2019     else if( !muldiv )
   2020     {
   2021         wtype = depth1 <= CV_8S && depth2 <= CV_8S ? CV_16S :
   2022                 depth1 <= CV_32S && depth2 <= CV_32S ? CV_32S : std::max(depth1, depth2);
   2023         wtype = std::max(wtype, dtype);
   2024 
   2025         // when the result of addition should be converted to an integer type,
   2026         // and just one of the input arrays is floating-point, it makes sense to convert that input to integer type before the operation,
   2027         // instead of converting the other input to floating-point and then converting the operation result back to integers.
   2028         if( dtype < CV_32F && (depth1 < CV_32F || depth2 < CV_32F) )
   2029             wtype = CV_32S;
   2030     }
   2031     else
   2032     {
   2033         wtype = std::max(depth1, std::max(depth2, CV_32F));
   2034         wtype = std::max(wtype, dtype);
   2035     }
   2036 
   2037     dtype = CV_MAKETYPE(dtype, cn);
   2038     wtype = CV_MAKETYPE(wtype, cn);
   2039 
   2040     if( haveMask )
   2041     {
   2042         int mtype = _mask.type();
   2043         CV_Assert( (mtype == CV_8UC1 || mtype == CV_8SC1) && _mask.sameSize(*psrc1) );
   2044         reallocate = !_dst.sameSize(*psrc1) || _dst.type() != dtype;
   2045     }
   2046 
   2047     _dst.createSameSize(*psrc1, dtype);
   2048     if( reallocate )
   2049         _dst.setTo(0.);
   2050 
   2051     CV_OCL_RUN(use_opencl,
   2052                ocl_arithm_op(*psrc1, *psrc2, _dst, _mask, wtype,
   2053                usrdata, oclop, haveScalar))
   2054 
   2055     BinaryFunc cvtsrc1 = type1 == wtype ? 0 : getConvertFunc(type1, wtype);
   2056     BinaryFunc cvtsrc2 = type2 == type1 ? cvtsrc1 : type2 == wtype ? 0 : getConvertFunc(type2, wtype);
   2057     BinaryFunc cvtdst = dtype == wtype ? 0 : getConvertFunc(wtype, dtype);
   2058 
   2059     size_t esz1 = CV_ELEM_SIZE(type1), esz2 = CV_ELEM_SIZE(type2);
   2060     size_t dsz = CV_ELEM_SIZE(dtype), wsz = CV_ELEM_SIZE(wtype);
   2061     size_t blocksize0 = (size_t)(BLOCK_SIZE + wsz-1)/wsz;
   2062     BinaryFunc copymask = getCopyMaskFunc(dsz);
   2063     Mat src1 = psrc1->getMat(), src2 = psrc2->getMat(), dst = _dst.getMat(), mask = _mask.getMat();
   2064 
   2065     AutoBuffer<uchar> _buf;
   2066     uchar *buf, *maskbuf = 0, *buf1 = 0, *buf2 = 0, *wbuf = 0;
   2067     size_t bufesz = (cvtsrc1 ? wsz : 0) +
   2068                     (cvtsrc2 || haveScalar ? wsz : 0) +
   2069                     (cvtdst ? wsz : 0) +
   2070                     (haveMask ? dsz : 0);
   2071     BinaryFunc func = tab[CV_MAT_DEPTH(wtype)];
   2072 
   2073     if( !haveScalar )
   2074     {
   2075         const Mat* arrays[] = { &src1, &src2, &dst, &mask, 0 };
   2076         uchar* ptrs[4];
   2077 
   2078         NAryMatIterator it(arrays, ptrs);
   2079         size_t total = it.size, blocksize = total;
   2080 
   2081         if( haveMask || cvtsrc1 || cvtsrc2 || cvtdst )
   2082             blocksize = std::min(blocksize, blocksize0);
   2083 
   2084         _buf.allocate(bufesz*blocksize + 64);
   2085         buf = _buf;
   2086         if( cvtsrc1 )
   2087             buf1 = buf, buf = alignPtr(buf + blocksize*wsz, 16);
   2088         if( cvtsrc2 )
   2089             buf2 = buf, buf = alignPtr(buf + blocksize*wsz, 16);
   2090         wbuf = maskbuf = buf;
   2091         if( cvtdst )
   2092             buf = alignPtr(buf + blocksize*wsz, 16);
   2093         if( haveMask )
   2094             maskbuf = buf;
   2095 
   2096         for( size_t i = 0; i < it.nplanes; i++, ++it )
   2097         {
   2098             for( size_t j = 0; j < total; j += blocksize )
   2099             {
   2100                 int bsz = (int)MIN(total - j, blocksize);
   2101                 Size bszn(bsz*cn, 1);
   2102                 const uchar *sptr1 = ptrs[0], *sptr2 = ptrs[1];
   2103                 uchar* dptr = ptrs[2];
   2104                 if( cvtsrc1 )
   2105                 {
   2106                     cvtsrc1( sptr1, 1, 0, 1, buf1, 1, bszn, 0 );
   2107                     sptr1 = buf1;
   2108                 }
   2109                 if( ptrs[0] == ptrs[1] )
   2110                     sptr2 = sptr1;
   2111                 else if( cvtsrc2 )
   2112                 {
   2113                     cvtsrc2( sptr2, 1, 0, 1, buf2, 1, bszn, 0 );
   2114                     sptr2 = buf2;
   2115                 }
   2116 
   2117                 if( !haveMask && !cvtdst )
   2118                     func( sptr1, 1, sptr2, 1, dptr, 1, bszn, usrdata );
   2119                 else
   2120                 {
   2121                     func( sptr1, 1, sptr2, 1, wbuf, 0, bszn, usrdata );
   2122                     if( !haveMask )
   2123                         cvtdst( wbuf, 1, 0, 1, dptr, 1, bszn, 0 );
   2124                     else if( !cvtdst )
   2125                     {
   2126                         copymask( wbuf, 1, ptrs[3], 1, dptr, 1, Size(bsz, 1), &dsz );
   2127                         ptrs[3] += bsz;
   2128                     }
   2129                     else
   2130                     {
   2131                         cvtdst( wbuf, 1, 0, 1, maskbuf, 1, bszn, 0 );
   2132                         copymask( maskbuf, 1, ptrs[3], 1, dptr, 1, Size(bsz, 1), &dsz );
   2133                         ptrs[3] += bsz;
   2134                     }
   2135                 }
   2136                 ptrs[0] += bsz*esz1; ptrs[1] += bsz*esz2; ptrs[2] += bsz*dsz;
   2137             }
   2138         }
   2139     }
   2140     else
   2141     {
   2142         const Mat* arrays[] = { &src1, &dst, &mask, 0 };
   2143         uchar* ptrs[3];
   2144 
   2145         NAryMatIterator it(arrays, ptrs);
   2146         size_t total = it.size, blocksize = std::min(total, blocksize0);
   2147 
   2148         _buf.allocate(bufesz*blocksize + 64);
   2149         buf = _buf;
   2150         if( cvtsrc1 )
   2151             buf1 = buf, buf = alignPtr(buf + blocksize*wsz, 16);
   2152         buf2 = buf; buf = alignPtr(buf + blocksize*wsz, 16);
   2153         wbuf = maskbuf = buf;
   2154         if( cvtdst )
   2155             buf = alignPtr(buf + blocksize*wsz, 16);
   2156         if( haveMask )
   2157             maskbuf = buf;
   2158 
   2159         convertAndUnrollScalar( src2, wtype, buf2, blocksize);
   2160 
   2161         for( size_t i = 0; i < it.nplanes; i++, ++it )
   2162         {
   2163             for( size_t j = 0; j < total; j += blocksize )
   2164             {
   2165                 int bsz = (int)MIN(total - j, blocksize);
   2166                 Size bszn(bsz*cn, 1);
   2167                 const uchar *sptr1 = ptrs[0];
   2168                 const uchar* sptr2 = buf2;
   2169                 uchar* dptr = ptrs[1];
   2170 
   2171                 if( cvtsrc1 )
   2172                 {
   2173                     cvtsrc1( sptr1, 1, 0, 1, buf1, 1, bszn, 0 );
   2174                     sptr1 = buf1;
   2175                 }
   2176 
   2177                 if( swapped12 )
   2178                     std::swap(sptr1, sptr2);
   2179 
   2180                 if( !haveMask && !cvtdst )
   2181                     func( sptr1, 1, sptr2, 1, dptr, 1, bszn, usrdata );
   2182                 else
   2183                 {
   2184                     func( sptr1, 1, sptr2, 1, wbuf, 1, bszn, usrdata );
   2185                     if( !haveMask )
   2186                         cvtdst( wbuf, 1, 0, 1, dptr, 1, bszn, 0 );
   2187                     else if( !cvtdst )
   2188                     {
   2189                         copymask( wbuf, 1, ptrs[2], 1, dptr, 1, Size(bsz, 1), &dsz );
   2190                         ptrs[2] += bsz;
   2191                     }
   2192                     else
   2193                     {
   2194                         cvtdst( wbuf, 1, 0, 1, maskbuf, 1, bszn, 0 );
   2195                         copymask( maskbuf, 1, ptrs[2], 1, dptr, 1, Size(bsz, 1), &dsz );
   2196                         ptrs[2] += bsz;
   2197                     }
   2198                 }
   2199                 ptrs[0] += bsz*esz1; ptrs[1] += bsz*dsz;
   2200             }
   2201         }
   2202     }
   2203 }
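/* Working-type selection in arithm_op above, in brief: equal input depths
   with a matching dtype are processed at that depth directly; otherwise
   add/subtract widen to CV_16S (both inputs 8-bit), CV_32S (both inputs
   integer), or the wider input depth, never below dtype, and an integer
   dtype with at least one integer input forces CV_32S; multiply/divide
   always work in at least CV_32F. */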
   2204 
   2205 static BinaryFunc* getAddTab()
   2206 {
   2207     static BinaryFunc addTab[] =
   2208     {
   2209         (BinaryFunc)GET_OPTIMIZED(add8u), (BinaryFunc)GET_OPTIMIZED(add8s),
   2210         (BinaryFunc)GET_OPTIMIZED(add16u), (BinaryFunc)GET_OPTIMIZED(add16s),
   2211         (BinaryFunc)GET_OPTIMIZED(add32s),
   2212         (BinaryFunc)GET_OPTIMIZED(add32f), (BinaryFunc)add64f,
   2213         0
   2214     };
   2215 
   2216     return addTab;
   2217 }
   2218 
   2219 static BinaryFunc* getSubTab()
   2220 {
   2221     static BinaryFunc subTab[] =
   2222     {
   2223         (BinaryFunc)GET_OPTIMIZED(sub8u), (BinaryFunc)GET_OPTIMIZED(sub8s),
   2224         (BinaryFunc)GET_OPTIMIZED(sub16u), (BinaryFunc)GET_OPTIMIZED(sub16s),
   2225         (BinaryFunc)GET_OPTIMIZED(sub32s),
   2226         (BinaryFunc)GET_OPTIMIZED(sub32f), (BinaryFunc)sub64f,
   2227         0
   2228     };
   2229 
   2230     return subTab;
   2231 }
   2232 
   2233 static BinaryFunc* getAbsDiffTab()
   2234 {
   2235     static BinaryFunc absDiffTab[] =
   2236     {
   2237         (BinaryFunc)GET_OPTIMIZED(absdiff8u), (BinaryFunc)GET_OPTIMIZED(absdiff8s),
   2238         (BinaryFunc)GET_OPTIMIZED(absdiff16u), (BinaryFunc)GET_OPTIMIZED(absdiff16s),
   2239         (BinaryFunc)GET_OPTIMIZED(absdiff32s),
   2240         (BinaryFunc)GET_OPTIMIZED(absdiff32f), (BinaryFunc)absdiff64f,
   2241         0
   2242     };
   2243 
   2244     return absDiffTab;
   2245 }
   2246 
   2247 }
   2248 
   2249 void cv::add( InputArray src1, InputArray src2, OutputArray dst,
   2250           InputArray mask, int dtype )
   2251 {
   2252     arithm_op(src1, src2, dst, mask, dtype, getAddTab(), false, 0, OCL_OP_ADD );
   2253 }
   2254 
   2255 void cv::subtract( InputArray _src1, InputArray _src2, OutputArray _dst,
   2256                InputArray mask, int dtype )
   2257 {
   2258 #ifdef HAVE_TEGRA_OPTIMIZATION
   2259     if (tegra::useTegra())
   2260     {
   2261         int kind1 = _src1.kind(), kind2 = _src2.kind();
   2262         Mat src1 = _src1.getMat(), src2 = _src2.getMat();
   2263         bool src1Scalar = checkScalar(src1, _src2.type(), kind1, kind2);
   2264         bool src2Scalar = checkScalar(src2, _src1.type(), kind2, kind1);
   2265 
   2266         if (!src1Scalar && !src2Scalar &&
   2267             src1.depth() == CV_8U && src2.type() == src1.type() &&
   2268             src1.dims == 2 && src2.size() == src1.size() &&
   2269             mask.empty())
   2270         {
   2271             if (dtype < 0)
   2272             {
   2273                 if (_dst.fixedType())
   2274                 {
   2275                     dtype = _dst.depth();
   2276                 }
   2277                 else
   2278                 {
   2279                     dtype = src1.depth();
   2280                 }
   2281             }
   2282 
   2283             dtype = CV_MAT_DEPTH(dtype);
   2284 
   2285             if (!_dst.fixedType() || dtype == _dst.depth())
   2286             {
   2287                 _dst.create(src1.size(), CV_MAKE_TYPE(dtype, src1.channels()));
   2288 
   2289                 if (dtype == CV_16S)
   2290                 {
   2291                     Mat dst = _dst.getMat();
   2292                     if(tegra::subtract_8u8u16s(src1, src2, dst))
   2293                         return;
   2294                 }
   2295                 else if (dtype == CV_32F)
   2296                 {
   2297                     Mat dst = _dst.getMat();
   2298                     if(tegra::subtract_8u8u32f(src1, src2, dst))
   2299                         return;
   2300                 }
   2301                 else if (dtype == CV_8S)
   2302                 {
   2303                     Mat dst = _dst.getMat();
   2304                     if(tegra::subtract_8u8u8s(src1, src2, dst))
   2305                         return;
   2306                 }
   2307             }
   2308         }
   2309     }
   2310 #endif
   2311     arithm_op(_src1, _src2, _dst, mask, dtype, getSubTab(), false, 0, OCL_OP_SUB );
   2312 }
   2313 
   2314 void cv::absdiff( InputArray src1, InputArray src2, OutputArray dst )
   2315 {
   2316     arithm_op(src1, src2, dst, noArray(), -1, getAbsDiffTab(), false, 0, OCL_OP_ABSDIFF);
   2317 }
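/* Usage sketch for the wrappers above (standalone example):

       cv::Mat a(1, 3, CV_8UC1, cv::Scalar(200)), b(1, 3, CV_8UC1, cv::Scalar(100));
       cv::Mat sum8u, sum16s, diff;
       cv::add(a, b, sum8u);                          // saturates to 255 in CV_8U
       cv::add(a, b, sum16s, cv::noArray(), CV_16S);  // exact 300 in CV_16S
       cv::subtract(b, a, diff);                      // 100 - 200 saturates to 0
       cv::absdiff(a, b, diff);                       // |200 - 100| = 100
*/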
   2318 
   2319 /****************************************************************************************\
   2320 *                                    multiply/divide                                     *
   2321 \****************************************************************************************/
   2322 
   2323 namespace cv
   2324 {
   2325 
   2326 template <typename T, typename WT>
   2327 struct Mul_SIMD
   2328 {
   2329     int operator() (const T *, const T *, T *, int, WT) const
   2330     {
   2331         return 0;
   2332     }
   2333 };
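/* The generic functor above reports 0 processed elements, so the scalar loop
   that invokes it handles the whole row; the specializations below return how
   many leading elements they vectorized and the caller finishes the tail.
   Per element the computation is, roughly (assuming the usual saturate_cast
   rounding/clamping semantics):

       for( ; x < width; x++ )
           dst[x] = saturate_cast<T>(scale * (WT)src1[x] * (WT)src2[x]);
*/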
   2334 
   2335 #if CV_NEON
   2336 
   2337 template <>
   2338 struct Mul_SIMD<uchar, float>
   2339 {
   2340     int operator() (const uchar * src1, const uchar * src2, uchar * dst, int width, float scale) const
   2341     {
   2342         int x = 0;
   2343 
   2344         if( scale == 1.0f )
   2345             for ( ; x <= width - 8; x += 8)
   2346             {
   2347                 uint16x8_t v_src1 = vmovl_u8(vld1_u8(src1 + x));
   2348                 uint16x8_t v_src2 = vmovl_u8(vld1_u8(src2 + x));
   2349 
   2350                 float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))),
   2351                                                vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2))));
   2352                 float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))),
   2353                                                vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2))));
   2354 
   2355                 uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
   2356                                                 vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
   2357                 vst1_u8(dst + x, vqmovn_u16(v_dst));
   2358             }
   2359         else
   2360         {
   2361             float32x4_t v_scale = vdupq_n_f32(scale);
   2362             for ( ; x <= width - 8; x += 8)
   2363             {
   2364                 uint16x8_t v_src1 = vmovl_u8(vld1_u8(src1 + x));
   2365                 uint16x8_t v_src2 = vmovl_u8(vld1_u8(src2 + x));
   2366 
   2367                 float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))),
   2368                                                vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2))));
   2369                 v_dst1 = vmulq_f32(v_dst1, v_scale);
   2370                 float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))),
   2371                                                vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2))));
   2372                 v_dst2 = vmulq_f32(v_dst2, v_scale);
   2373 
   2374                 uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
   2375                                                 vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
   2376                 vst1_u8(dst + x, vqmovn_u16(v_dst));
   2377             }
   2378         }
   2379 
   2380         return x;
   2381     }
   2382 };
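/* The NEON pattern above: widen 8 uchar values to u16, split into two f32
   halves, multiply (and optionally scale) in f32, round back to u32, then
   narrow with saturation to u16 and finally to u8. Per element this is
   essentially the scalar form

       dst[x] = saturate_cast<uchar>(scale * (float)src1[x] * (float)src2[x]);

   and the following specializations apply the same structure to schar,
   ushort, short and float. */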
   2383 
   2384 template <>
   2385 struct Mul_SIMD<schar, float>
   2386 {
   2387     int operator() (const schar * src1, const schar * src2, schar * dst, int width, float scale) const
   2388     {
   2389         int x = 0;
   2390 
   2391         if( scale == 1.0f )
   2392             for ( ; x <= width - 8; x += 8)
   2393             {
   2394                 int16x8_t v_src1 = vmovl_s8(vld1_s8(src1 + x));
   2395                 int16x8_t v_src2 = vmovl_s8(vld1_s8(src2 + x));
   2396 
   2397                 float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))),
   2398                                                vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2))));
   2399                 float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))),
   2400                                                vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2))));
   2401 
   2402                 int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
   2403                                                vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
   2404                 vst1_s8(dst + x, vqmovn_s16(v_dst));
   2405             }
   2406         else
   2407         {
   2408             float32x4_t v_scale = vdupq_n_f32(scale);
   2409             for ( ; x <= width - 8; x += 8)
   2410             {
   2411                 int16x8_t v_src1 = vmovl_s8(vld1_s8(src1 + x));
   2412                 int16x8_t v_src2 = vmovl_s8(vld1_s8(src2 + x));
   2413 
   2414                 float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))),
   2415                                                vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2))));
   2416                 v_dst1 = vmulq_f32(v_dst1, v_scale);
   2417                 float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))),
   2418                                                vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2))));
   2419                 v_dst2 = vmulq_f32(v_dst2, v_scale);
   2420 
   2421                 int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
   2422                                                vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
   2423                 vst1_s8(dst + x, vqmovn_s16(v_dst));
   2424             }
   2425         }
   2426 
   2427         return x;
   2428     }
   2429 };
   2430 
   2431 template <>
   2432 struct Mul_SIMD<ushort, float>
   2433 {
   2434     int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, float scale) const
   2435     {
   2436         int x = 0;
   2437 
   2438         if( scale == 1.0f )
   2439             for ( ; x <= width - 8; x += 8)
   2440             {
   2441                 uint16x8_t v_src1 = vld1q_u16(src1 + x), v_src2 = vld1q_u16(src2 + x);
   2442 
   2443                 float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))),
   2444                                                vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2))));
   2445                 float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))),
   2446                                                vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2))));
   2447 
   2448                 uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
   2449                                                 vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
   2450                 vst1q_u16(dst + x, v_dst);
   2451             }
   2452         else
   2453         {
   2454             float32x4_t v_scale = vdupq_n_f32(scale);
   2455             for ( ; x <= width - 8; x += 8)
   2456             {
   2457                 uint16x8_t v_src1 = vld1q_u16(src1 + x), v_src2 = vld1q_u16(src2 + x);
   2458 
   2459                 float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))),
   2460                                                vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2))));
   2461                 v_dst1 = vmulq_f32(v_dst1, v_scale);
   2462                 float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))),
   2463                                                vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2))));
   2464                 v_dst2 = vmulq_f32(v_dst2, v_scale);
   2465 
   2466                 uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
   2467                                                 vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
   2468                 vst1q_u16(dst + x, v_dst);
   2469             }
   2470         }
   2471 
   2472         return x;
   2473     }
   2474 };
   2475 
   2476 template <>
   2477 struct Mul_SIMD<short, float>
   2478 {
   2479     int operator() (const short * src1, const short * src2, short * dst, int width, float scale) const
   2480     {
   2481         int x = 0;
   2482 
   2483         if( scale == 1.0f )
   2484             for ( ; x <= width - 8; x += 8)
   2485             {
   2486                 int16x8_t v_src1 = vld1q_s16(src1 + x), v_src2 = vld1q_s16(src2 + x);
   2487 
   2488                 float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))),
   2489                                                vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2))));
   2490                 float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))),
   2491                                                vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2))));
   2492 
   2493                 int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
   2494                                                vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
   2495                 vst1q_s16(dst + x, v_dst);
   2496             }
   2497         else
   2498         {
   2499             float32x4_t v_scale = vdupq_n_f32(scale);
   2500             for ( ; x <= width - 8; x += 8)
   2501             {
   2502                 int16x8_t v_src1 = vld1q_s16(src1 + x), v_src2 = vld1q_s16(src2 + x);
   2503 
   2504                 float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))),
   2505                                                vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2))));
   2506                 v_dst1 = vmulq_f32(v_dst1, v_scale);
   2507                 float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))),
   2508                                                vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2))));
   2509                 v_dst2 = vmulq_f32(v_dst2, v_scale);
   2510 
   2511                 int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
   2512                                                vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
   2513                 vst1q_s16(dst + x, v_dst);
   2514             }
   2515         }
   2516 
   2517         return x;
   2518     }
   2519 };
   2520 
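// float x float: plain vector multiply; a second multiply by the broadcast
// scale is added only when scale != 1.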
   2521 template <>
   2522 struct Mul_SIMD<float, float>
   2523 {
   2524     int operator() (const float * src1, const float * src2, float * dst, int width, float scale) const
   2525     {
   2526         int x = 0;
   2527 
   2528         if( scale == 1.0f )
   2529             for ( ; x <= width - 8; x += 8)
   2530             {
   2531                 float32x4_t v_dst1 = vmulq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x));
   2532                 float32x4_t v_dst2 = vmulq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4));
   2533                 vst1q_f32(dst + x, v_dst1);
   2534                 vst1q_f32(dst + x + 4, v_dst2);
   2535             }
   2536         else
   2537         {
   2538             float32x4_t v_scale = vdupq_n_f32(scale);
   2539             for ( ; x <= width - 8; x += 8)
   2540             {
   2541                 float32x4_t v_dst1 = vmulq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x));
   2542                 v_dst1 = vmulq_f32(v_dst1, v_scale);
   2543 
   2544                 float32x4_t v_dst2 = vmulq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4));
   2545                 v_dst2 = vmulq_f32(v_dst2, v_scale);
   2546 
   2547                 vst1q_f32(dst + x, v_dst1);
   2548                 vst1q_f32(dst + x + 4, v_dst2);
   2549             }
   2550         }
   2551 
   2552         return x;
   2553     }
   2554 };
   2555 
   2556 #elif CV_SSE2
   2557 
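// The ushort specialization below uses _mm_packus_epi32, an SSE4.1 instruction,
// hence the separate CV_SSE4_1 guard and the runtime checkHardwareSupport() test.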
   2558 #if CV_SSE4_1
   2559 
   2560 template <>
   2561 struct Mul_SIMD<ushort, float>
   2562 {
   2563     Mul_SIMD()
   2564     {
   2565         haveSSE = checkHardwareSupport(CV_CPU_SSE4_1);
   2566     }
   2567 
   2568     int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, float scale) const
   2569     {
   2570         int x = 0;
   2571 
   2572         if (!haveSSE)
   2573             return x;
   2574 
   2575         __m128i v_zero = _mm_setzero_si128();
   2576 
   2577         if( scale != 1.0f )
   2578         {
   2579             __m128 v_scale = _mm_set1_ps(scale);
   2580             for ( ; x <= width - 8; x += 8)
   2581             {
   2582                 __m128i v_src1 = _mm_loadu_si128((__m128i const *)(src1 + x));
   2583                 __m128i v_src2 = _mm_loadu_si128((__m128i const *)(src2 + x));
   2584 
   2585                 __m128 v_dst1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src1, v_zero)),
   2586                                            _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src2, v_zero)));
   2587                 v_dst1 = _mm_mul_ps(v_dst1, v_scale);
   2588 
   2589                 __m128 v_dst2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src1, v_zero)),
   2590                                            _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src2, v_zero)));
   2591                 v_dst2 = _mm_mul_ps(v_dst2, v_scale);
   2592 
   2593                 __m128i v_dsti = _mm_packus_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2));
   2594                 _mm_storeu_si128((__m128i *)(dst + x), v_dsti);
   2595             }
   2596         }
   2597 
   2598         return x;
   2599     }
   2600 
   2601     bool haveSSE;
   2602 };
   2603 
   2604 #endif
   2605 
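// SSE2: signed 8-bit multiply through float32. Unpacking the source bytes into
// the high half of each 16-bit lane and then shifting right arithmetically
// (_mm_srai_epi16 / _mm_srai_epi32) sign-extends 8->16 and 16->32 bit lanes
// using only SSE2 instructions.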
   2606 template <>
   2607 struct Mul_SIMD<schar, float>
   2608 {
   2609     Mul_SIMD()
   2610     {
   2611         haveSSE = checkHardwareSupport(CV_CPU_SSE2);
   2612     }
   2613 
   2614     int operator() (const schar * src1, const schar * src2, schar * dst, int width, float scale) const
   2615     {
   2616         int x = 0;
   2617 
   2618         if (!haveSSE)
   2619             return x;
   2620 
   2621         __m128i v_zero = _mm_setzero_si128();
   2622 
   2623         if( scale == 1.0f )
   2624             for ( ; x <= width - 8; x += 8)
   2625             {
   2626                 __m128i v_src1 = _mm_loadl_epi64((__m128i const *)(src1 + x));
   2627                 __m128i v_src2 = _mm_loadl_epi64((__m128i const *)(src2 + x));
   2628 
   2629                 v_src1 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src1), 8);
   2630                 v_src2 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src2), 8);
   2631 
   2632                 __m128 v_dst1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16)),
   2633                                            _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16)));
   2634 
   2635                 __m128 v_dst2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16)),
   2636                                            _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16)));
   2637 
   2638                 __m128i v_dsti = _mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2));
   2639                 _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dsti, v_zero));
   2640             }
   2641         else
   2642         {
   2643             __m128 v_scale = _mm_set1_ps(scale);
   2644             for ( ; x <= width - 8; x += 8)
   2645             {
   2646                 __m128i v_src1 = _mm_loadl_epi64((__m128i const *)(src1 + x));
   2647                 __m128i v_src2 = _mm_loadl_epi64((__m128i const *)(src2 + x));
   2648 
   2649                 v_src1 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src1), 8);
   2650                 v_src2 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src2), 8);
   2651 
   2652                 __m128 v_dst1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16)),
   2653                                            _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16)));
   2654                 v_dst1 = _mm_mul_ps(v_dst1, v_scale);
   2655 
   2656                 __m128 v_dst2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16)),
   2657                                            _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16)));
   2658                 v_dst2 = _mm_mul_ps(v_dst2, v_scale);
   2659 
   2660                 __m128i v_dsti = _mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2));
   2661                 _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dsti, v_zero));
   2662             }
   2663         }
   2664 
   2665         return x;
   2666     }
   2667 
   2668     bool haveSSE;
   2669 };
   2670 
   2671 template <>
   2672 struct Mul_SIMD<short, float>
   2673 {
   2674     Mul_SIMD()
   2675     {
   2676         haveSSE = checkHardwareSupport(CV_CPU_SSE2);
   2677     }
   2678 
   2679     int operator() (const short * src1, const short * src2, short * dst, int width, float scale) const
   2680     {
   2681         int x = 0;
   2682 
   2683         if (!haveSSE)
   2684             return x;
   2685 
   2686         __m128i v_zero = _mm_setzero_si128();
   2687 
   2688         if( scale != 1.0f )
   2689         {
   2690             __m128 v_scale = _mm_set1_ps(scale);
   2691             for ( ; x <= width - 8; x += 8)
   2692             {
   2693                 __m128i v_src1 = _mm_loadu_si128((__m128i const *)(src1 + x));
   2694                 __m128i v_src2 = _mm_loadu_si128((__m128i const *)(src2 + x));
   2695 
   2696                 __m128 v_dst1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16)),
   2697                                            _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16)));
   2698                 v_dst1 = _mm_mul_ps(v_dst1, v_scale);
   2699 
   2700                 __m128 v_dst2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16)),
   2701                                            _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16)));
   2702                 v_dst2 = _mm_mul_ps(v_dst2, v_scale);
   2703 
   2704                 __m128i v_dsti = _mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2));
   2705                 _mm_storeu_si128((__m128i *)(dst + x), v_dsti);
   2706             }
   2707         }
   2708 
   2709         return x;
   2710     }
   2711 
   2712     bool haveSSE;
   2713 };
   2714 
   2715 #endif
   2716 
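// Generic element-wise multiply: the Mul_SIMD functor vectorizes a prefix of
// each row and returns how many elements it handled; the remaining tail is
// computed with saturate_cast in plain C++ (optionally unrolled by 4).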
   2717 template<typename T, typename WT> static void
   2718 mul_( const T* src1, size_t step1, const T* src2, size_t step2,
   2719       T* dst, size_t step, Size size, WT scale )
   2720 {
   2721     step1 /= sizeof(src1[0]);
   2722     step2 /= sizeof(src2[0]);
   2723     step /= sizeof(dst[0]);
   2724 
   2725     Mul_SIMD<T, WT> vop;
   2726 
   2727     if( scale == (WT)1. )
   2728     {
   2729         for( ; size.height--; src1 += step1, src2 += step2, dst += step )
   2730         {
   2731             int i = vop(src1, src2, dst, size.width, scale);
   2732             #if CV_ENABLE_UNROLLED
   2733             for(; i <= size.width - 4; i += 4 )
   2734             {
   2735                 T t0;
   2736                 T t1;
   2737                 t0 = saturate_cast<T>(src1[i  ] * src2[i  ]);
   2738                 t1 = saturate_cast<T>(src1[i+1] * src2[i+1]);
   2739                 dst[i  ] = t0;
   2740                 dst[i+1] = t1;
   2741 
   2742                 t0 = saturate_cast<T>(src1[i+2] * src2[i+2]);
   2743                 t1 = saturate_cast<T>(src1[i+3] * src2[i+3]);
   2744                 dst[i+2] = t0;
   2745                 dst[i+3] = t1;
   2746             }
   2747             #endif
   2748             for( ; i < size.width; i++ )
   2749                 dst[i] = saturate_cast<T>(src1[i] * src2[i]);
   2750         }
   2751     }
   2752     else
   2753     {
   2754         for( ; size.height--; src1 += step1, src2 += step2, dst += step )
   2755         {
   2756             int i = vop(src1, src2, dst, size.width, scale);
   2757             #if CV_ENABLE_UNROLLED
   2758             for(; i <= size.width - 4; i += 4 )
   2759             {
   2760                 T t0 = saturate_cast<T>(scale*(WT)src1[i]*src2[i]);
   2761                 T t1 = saturate_cast<T>(scale*(WT)src1[i+1]*src2[i+1]);
   2762                 dst[i] = t0; dst[i+1] = t1;
   2763 
   2764                 t0 = saturate_cast<T>(scale*(WT)src1[i+2]*src2[i+2]);
   2765                 t1 = saturate_cast<T>(scale*(WT)src1[i+3]*src2[i+3]);
   2766                 dst[i+2] = t0; dst[i+3] = t1;
   2767             }
   2768             #endif
   2769             for( ; i < size.width; i++ )
   2770                 dst[i] = saturate_cast<T>(scale*(WT)src1[i]*src2[i]);
   2771         }
   2772     }
   2773 }
   2774 
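// Div_SIMD / Recip_SIMD default to no-ops that report 0 processed elements.
// The CV_SIMD128 specializations below use the universal intrinsics
// (v_load / v_expand / v_cvt_f32 / v_round / v_pack...) and force a zero
// result wherever the divisor is zero, matching the scalar code paths.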
   2775 template <typename T>
   2776 struct Div_SIMD
   2777 {
   2778     int operator() (const T *, const T *, T *, int, double) const
   2779     {
   2780         return 0;
   2781     }
   2782 };
   2783 
   2784 template <typename T>
   2785 struct Recip_SIMD
   2786 {
   2787     int operator() (const T *, T *, int, double) const
   2788     {
   2789         return 0;
   2790     }
   2791 };
   2792 
   2793 
   2794 #if CV_SIMD128
   2795 
   2796 template <>
   2797 struct Div_SIMD<uchar>
   2798 {
   2799     bool haveSIMD;
   2800     Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
   2801 
   2802     int operator() (const uchar * src1, const uchar * src2, uchar * dst, int width, double scale) const
   2803     {
   2804         int x = 0;
   2805 
   2806         if (!haveSIMD)
   2807             return x;
   2808 
   2809         v_float32x4 v_scale = v_setall_f32((float)scale);
   2810         v_uint16x8 v_zero = v_setzero_u16();
   2811 
   2812         for ( ; x <= width - 8; x += 8)
   2813         {
   2814             v_uint16x8 v_src1 = v_load_expand(src1 + x);
   2815             v_uint16x8 v_src2 = v_load_expand(src2 + x);
   2816 
   2817             v_uint32x4 t0, t1, t2, t3;
   2818             v_expand(v_src1, t0, t1);
   2819             v_expand(v_src2, t2, t3);
   2820 
   2821             v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0));
   2822             v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1));
   2823 
   2824             v_float32x4 f2 = v_cvt_f32(v_reinterpret_as_s32(t2));
   2825             v_float32x4 f3 = v_cvt_f32(v_reinterpret_as_s32(t3));
   2826 
   2827             f0 = f0 * v_scale / f2;
   2828             f1 = f1 * v_scale / f3;
   2829 
   2830             v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
   2831             v_uint16x8 res = v_pack_u(i0, i1);
   2832 
   2833             res = v_select(v_src2 == v_zero, v_zero, res);
   2834             v_pack_store(dst + x, res);
   2835         }
   2836 
   2837         return x;
   2838     }
   2839 };
   2840 
   2841 
   2842 template <>
   2843 struct Div_SIMD<schar>
   2844 {
   2845     bool haveSIMD;
   2846     Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
   2847 
   2848     int operator() (const schar * src1, const schar * src2, schar * dst, int width, double scale) const
   2849     {
   2850         int x = 0;
   2851 
   2852         if (!haveSIMD)
   2853             return x;
   2854 
   2855         v_float32x4 v_scale = v_setall_f32((float)scale);
   2856         v_int16x8 v_zero = v_setzero_s16();
   2857 
   2858         for ( ; x <= width - 8; x += 8)
   2859         {
   2860             v_int16x8 v_src1 = v_load_expand(src1 + x);
   2861             v_int16x8 v_src2 = v_load_expand(src2 + x);
   2862 
   2863             v_int32x4 t0, t1, t2, t3;
   2864             v_expand(v_src1, t0, t1);
   2865             v_expand(v_src2, t2, t3);
   2866 
   2867             v_float32x4 f0 = v_cvt_f32(t0);
   2868             v_float32x4 f1 = v_cvt_f32(t1);
   2869 
   2870             v_float32x4 f2 = v_cvt_f32(t2);
   2871             v_float32x4 f3 = v_cvt_f32(t3);
   2872 
   2873             f0 = f0 * v_scale / f2;
   2874             f1 = f1 * v_scale / f3;
   2875 
   2876             v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
   2877             v_int16x8 res = v_pack(i0, i1);
   2878 
   2879             res = v_select(v_src2 == v_zero, v_zero, res);
   2880             v_pack_store(dst + x, res);
   2881         }
   2882 
   2883         return x;
   2884     }
   2885 };
   2886 
   2887 
   2888 template <>
   2889 struct Div_SIMD<ushort>
   2890 {
   2891     bool haveSIMD;
   2892     Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
   2893 
   2894     int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, double scale) const
   2895     {
   2896         int x = 0;
   2897 
   2898         if (!haveSIMD)
   2899             return x;
   2900 
   2901         v_float32x4 v_scale = v_setall_f32((float)scale);
   2902         v_uint16x8 v_zero = v_setzero_u16();
   2903 
   2904         for ( ; x <= width - 8; x += 8)
   2905         {
   2906             v_uint16x8 v_src1 = v_load(src1 + x);
   2907             v_uint16x8 v_src2 = v_load(src2 + x);
   2908 
   2909             v_uint32x4 t0, t1, t2, t3;
   2910             v_expand(v_src1, t0, t1);
   2911             v_expand(v_src2, t2, t3);
   2912 
   2913             v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0));
   2914             v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1));
   2915 
   2916             v_float32x4 f2 = v_cvt_f32(v_reinterpret_as_s32(t2));
   2917             v_float32x4 f3 = v_cvt_f32(v_reinterpret_as_s32(t3));
   2918 
   2919             f0 = f0 * v_scale / f2;
   2920             f1 = f1 * v_scale / f3;
   2921 
   2922             v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
   2923             v_uint16x8 res = v_pack_u(i0, i1);
   2924 
   2925             res = v_select(v_src2 == v_zero, v_zero, res);
   2926             v_store(dst + x, res);
   2927         }
   2928 
   2929         return x;
   2930     }
   2931 };
   2932 
   2933 template <>
   2934 struct Div_SIMD<short>
   2935 {
   2936     bool haveSIMD;
   2937     Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
   2938 
   2939     int operator() (const short * src1, const short * src2, short * dst, int width, double scale) const
   2940     {
   2941         int x = 0;
   2942 
   2943         if (!haveSIMD)
   2944             return x;
   2945 
   2946         v_float32x4 v_scale = v_setall_f32((float)scale);
   2947         v_int16x8 v_zero = v_setzero_s16();
   2948 
   2949         for ( ; x <= width - 8; x += 8)
   2950         {
   2951             v_int16x8 v_src1 = v_load(src1 + x);
   2952             v_int16x8 v_src2 = v_load(src2 + x);
   2953 
   2954             v_int32x4 t0, t1, t2, t3;
   2955             v_expand(v_src1, t0, t1);
   2956             v_expand(v_src2, t2, t3);
   2957 
   2958             v_float32x4 f0 = v_cvt_f32(t0);
   2959             v_float32x4 f1 = v_cvt_f32(t1);
   2960 
   2961             v_float32x4 f2 = v_cvt_f32(t2);
   2962             v_float32x4 f3 = v_cvt_f32(t3);
   2963 
   2964             f0 = f0 * v_scale / f2;
   2965             f1 = f1 * v_scale / f3;
   2966 
   2967             v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
   2968             v_int16x8 res = v_pack(i0, i1);
   2969 
   2970             res = v_select(v_src2 == v_zero, v_zero, res);
   2971             v_store(dst + x, res);
   2972         }
   2973 
   2974         return x;
   2975     }
   2976 };
   2977 
   2978 template <>
   2979 struct Div_SIMD<int>
   2980 {
   2981     bool haveSIMD;
   2982     Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
   2983 
   2984     int operator() (const int * src1, const int * src2, int * dst, int width, double scale) const
   2985     {
   2986         int x = 0;
   2987 
   2988         if (!haveSIMD)
   2989             return x;
   2990 
   2991         v_float32x4 v_scale = v_setall_f32((float)scale);
   2992         v_int32x4 v_zero = v_setzero_s32();
   2993 
   2994         for ( ; x <= width - 8; x += 8)
   2995         {
   2996             v_int32x4 t0 = v_load(src1 + x);
   2997             v_int32x4 t1 = v_load(src1 + x + 4);
   2998             v_int32x4 t2 = v_load(src2 + x);
   2999             v_int32x4 t3 = v_load(src2 + x + 4);
   3000 
   3001             v_float32x4 f0 = v_cvt_f32(t0);
   3002             v_float32x4 f1 = v_cvt_f32(t1);
   3003             v_float32x4 f2 = v_cvt_f32(t2);
   3004             v_float32x4 f3 = v_cvt_f32(t3);
   3005 
   3006             f0 = f0 * v_scale / f2;
   3007             f1 = f1 * v_scale / f3;
   3008 
   3009             v_int32x4 res0 = v_round(f0), res1 = v_round(f1);
   3010 
   3011             res0 = v_select(t2 == v_zero, v_zero, res0);
   3012             res1 = v_select(t3 == v_zero, v_zero, res1);
   3013             v_store(dst + x, res0);
   3014             v_store(dst + x + 4, res1);
   3015         }
   3016 
   3017         return x;
   3018     }
   3019 };
   3020 
   3021 
   3022 template <>
   3023 struct Div_SIMD<float>
   3024 {
   3025     bool haveSIMD;
   3026     Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
   3027 
   3028     int operator() (const float * src1, const float * src2, float * dst, int width, double scale) const
   3029     {
   3030         int x = 0;
   3031 
   3032         if (!haveSIMD)
   3033             return x;
   3034 
   3035         v_float32x4 v_scale = v_setall_f32((float)scale);
   3036         v_float32x4 v_zero = v_setzero_f32();
   3037 
   3038         for ( ; x <= width - 8; x += 8)
   3039         {
   3040             v_float32x4 f0 = v_load(src1 + x);
   3041             v_float32x4 f1 = v_load(src1 + x + 4);
   3042             v_float32x4 f2 = v_load(src2 + x);
   3043             v_float32x4 f3 = v_load(src2 + x + 4);
   3044 
   3045             v_float32x4 res0 = f0 * v_scale / f2;
   3046             v_float32x4 res1 = f1 * v_scale / f3;
   3047 
   3048             res0 = v_select(f2 == v_zero, v_zero, res0);
   3049             res1 = v_select(f3 == v_zero, v_zero, res1);
   3050 
   3051             v_store(dst + x, res0);
   3052             v_store(dst + x + 4, res1);
   3053         }
   3054 
   3055         return x;
   3056     }
   3057 };
   3058 
   3059 
   3060 ///////////////////////// RECIPROCAL //////////////////////
   3061 
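// Vectorized dst = scale / src2 (no first operand); zero divisors yield zero.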
   3062 template <>
   3063 struct Recip_SIMD<uchar>
   3064 {
   3065     bool haveSIMD;
   3066     Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
   3067 
   3068     int operator() (const uchar * src2, uchar * dst, int width, double scale) const
   3069     {
   3070         int x = 0;
   3071 
   3072         if (!haveSIMD)
   3073             return x;
   3074 
   3075         v_float32x4 v_scale = v_setall_f32((float)scale);
   3076         v_uint16x8 v_zero = v_setzero_u16();
   3077 
   3078         for ( ; x <= width - 8; x += 8)
   3079         {
   3080             v_uint16x8 v_src2 = v_load_expand(src2 + x);
   3081 
   3082             v_uint32x4 t0, t1;
   3083             v_expand(v_src2, t0, t1);
   3084 
   3085             v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0));
   3086             v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1));
   3087 
   3088             f0 = v_scale / f0;
   3089             f1 = v_scale / f1;
   3090 
   3091             v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
   3092             v_uint16x8 res = v_pack_u(i0, i1);
   3093 
   3094             res = v_select(v_src2 == v_zero, v_zero, res);
   3095             v_pack_store(dst + x, res);
   3096         }
   3097 
   3098         return x;
   3099     }
   3100 };
   3101 
   3102 
   3103 template <>
   3104 struct Recip_SIMD<schar>
   3105 {
   3106     bool haveSIMD;
   3107     Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
   3108 
   3109     int operator() (const schar * src2, schar * dst, int width, double scale) const
   3110     {
   3111         int x = 0;
   3112 
   3113         if (!haveSIMD)
   3114             return x;
   3115 
   3116         v_float32x4 v_scale = v_setall_f32((float)scale);
   3117         v_int16x8 v_zero = v_setzero_s16();
   3118 
   3119         for ( ; x <= width - 8; x += 8)
   3120         {
   3121             v_int16x8 v_src2 = v_load_expand(src2 + x);
   3122 
   3123             v_int32x4 t0, t1;
   3124             v_expand(v_src2, t0, t1);
   3125 
   3126             v_float32x4 f0 = v_cvt_f32(t0);
   3127             v_float32x4 f1 = v_cvt_f32(t1);
   3128 
   3129             f0 = v_scale / f0;
   3130             f1 = v_scale / f1;
   3131 
   3132             v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
   3133             v_int16x8 res = v_pack(i0, i1);
   3134 
   3135             res = v_select(v_src2 == v_zero, v_zero, res);
   3136             v_pack_store(dst + x, res);
   3137         }
   3138 
   3139         return x;
   3140     }
   3141 };
   3142 
   3143 
   3144 template <>
   3145 struct Recip_SIMD<ushort>
   3146 {
   3147     bool haveSIMD;
   3148     Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
   3149 
   3150     int operator() (const ushort * src2, ushort * dst, int width, double scale) const
   3151     {
   3152         int x = 0;
   3153 
   3154         if (!haveSIMD)
   3155             return x;
   3156 
   3157         v_float32x4 v_scale = v_setall_f32((float)scale);
   3158         v_uint16x8 v_zero = v_setzero_u16();
   3159 
   3160         for ( ; x <= width - 8; x += 8)
   3161         {
   3162             v_uint16x8 v_src2 = v_load(src2 + x);
   3163 
   3164             v_uint32x4 t0, t1;
   3165             v_expand(v_src2, t0, t1);
   3166 
   3167             v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0));
   3168             v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1));
   3169 
   3170             f0 = v_scale / f0;
   3171             f1 = v_scale / f1;
   3172 
   3173             v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
   3174             v_uint16x8 res = v_pack_u(i0, i1);
   3175 
   3176             res = v_select(v_src2 == v_zero, v_zero, res);
   3177             v_store(dst + x, res);
   3178         }
   3179 
   3180         return x;
   3181     }
   3182 };
   3183 
   3184 template <>
   3185 struct Recip_SIMD<short>
   3186 {
   3187     bool haveSIMD;
   3188     Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
   3189 
   3190     int operator() (const short * src2, short * dst, int width, double scale) const
   3191     {
   3192         int x = 0;
   3193 
   3194         if (!haveSIMD)
   3195             return x;
   3196 
   3197         v_float32x4 v_scale = v_setall_f32((float)scale);
   3198         v_int16x8 v_zero = v_setzero_s16();
   3199 
   3200         for ( ; x <= width - 8; x += 8)
   3201         {
   3202             v_int16x8 v_src2 = v_load(src2 + x);
   3203 
   3204             v_int32x4 t0, t1;
   3205             v_expand(v_src2, t0, t1);
   3206 
   3207             v_float32x4 f0 = v_cvt_f32(t0);
   3208             v_float32x4 f1 = v_cvt_f32(t1);
   3209 
   3210             f0 = v_scale / f0;
   3211             f1 = v_scale / f1;
   3212 
   3213             v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
   3214             v_int16x8 res = v_pack(i0, i1);
   3215 
   3216             res = v_select(v_src2 == v_zero, v_zero, res);
   3217             v_store(dst + x, res);
   3218         }
   3219 
   3220         return x;
   3221     }
   3222 };
   3223 
   3224 template <>
   3225 struct Recip_SIMD<int>
   3226 {
   3227     bool haveSIMD;
   3228     Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
   3229 
   3230     int operator() (const int * src2, int * dst, int width, double scale) const
   3231     {
   3232         int x = 0;
   3233 
   3234         if (!haveSIMD)
   3235             return x;
   3236 
   3237         v_float32x4 v_scale = v_setall_f32((float)scale);
   3238         v_int32x4 v_zero = v_setzero_s32();
   3239 
   3240         for ( ; x <= width - 8; x += 8)
   3241         {
   3242             v_int32x4 t0 = v_load(src2 + x);
   3243             v_int32x4 t1 = v_load(src2 + x + 4);
   3244 
   3245             v_float32x4 f0 = v_cvt_f32(t0);
   3246             v_float32x4 f1 = v_cvt_f32(t1);
   3247 
   3248             f0 = v_scale / f0;
   3249             f1 = v_scale / f1;
   3250 
   3251             v_int32x4 res0 = v_round(f0), res1 = v_round(f1);
   3252 
   3253             res0 = v_select(t0 == v_zero, v_zero, res0);
   3254             res1 = v_select(t1 == v_zero, v_zero, res1);
   3255             v_store(dst + x, res0);
   3256             v_store(dst + x + 4, res1);
   3257         }
   3258 
   3259         return x;
   3260     }
   3261 };
   3262 
   3263 
   3264 template <>
   3265 struct Recip_SIMD<float>
   3266 {
   3267     bool haveSIMD;
   3268     Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
   3269 
   3270     int operator() (const float * src2, float * dst, int width, double scale) const
   3271     {
   3272         int x = 0;
   3273 
   3274         if (!haveSIMD)
   3275             return x;
   3276 
   3277         v_float32x4 v_scale = v_setall_f32((float)scale);
   3278         v_float32x4 v_zero = v_setzero_f32();
   3279 
   3280         for ( ; x <= width - 8; x += 8)
   3281         {
   3282             v_float32x4 f0 = v_load(src2 + x);
   3283             v_float32x4 f1 = v_load(src2 + x + 4);
   3284 
   3285             v_float32x4 res0 = v_scale / f0;
   3286             v_float32x4 res1 = v_scale / f1;
   3287 
   3288             res0 = v_select(f0 == v_zero, v_zero, res0);
   3289             res1 = v_select(f1 == v_zero, v_zero, res1);
   3290 
   3291             v_store(dst + x, res0);
   3292             v_store(dst + x + 4, res1);
   3293         }
   3294 
   3295         return x;
   3296     }
   3297 };
   3298 
   3299 #if CV_SIMD128_64F
   3300 
   3301 template <>
   3302 struct Div_SIMD<double>
   3303 {
   3304     bool haveSIMD;
   3305     Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
   3306 
   3307     int operator() (const double * src1, const double * src2, double * dst, int width, double scale) const
   3308     {
   3309         int x = 0;
   3310 
   3311         if (!haveSIMD)
   3312             return x;
   3313 
   3314         v_float64x2 v_scale = v_setall_f64(scale);
   3315         v_float64x2 v_zero = v_setzero_f64();
   3316 
   3317         for ( ; x <= width - 4; x += 4)
   3318         {
   3319             v_float64x2 f0 = v_load(src1 + x);
   3320             v_float64x2 f1 = v_load(src1 + x + 2);
   3321             v_float64x2 f2 = v_load(src2 + x);
   3322             v_float64x2 f3 = v_load(src2 + x + 2);
   3323 
   3324             v_float64x2 res0 = f0 * v_scale / f2;
   3325             v_float64x2 res1 = f1 * v_scale / f3;
   3326 
    3327             res0 = v_select(f2 == v_zero, v_zero, res0);
    3328             res1 = v_select(f3 == v_zero, v_zero, res1);
   3329 
   3330             v_store(dst + x, res0);
   3331             v_store(dst + x + 2, res1);
   3332         }
   3333 
   3334         return x;
   3335     }
   3336 };
   3337 
   3338 template <>
   3339 struct Recip_SIMD<double>
   3340 {
   3341     bool haveSIMD;
   3342     Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
   3343 
   3344     int operator() (const double * src2, double * dst, int width, double scale) const
   3345     {
   3346         int x = 0;
   3347 
   3348         if (!haveSIMD)
   3349             return x;
   3350 
   3351         v_float64x2 v_scale = v_setall_f64(scale);
   3352         v_float64x2 v_zero = v_setzero_f64();
   3353 
   3354         for ( ; x <= width - 4; x += 4)
   3355         {
   3356             v_float64x2 f0 = v_load(src2 + x);
   3357             v_float64x2 f1 = v_load(src2 + x + 2);
   3358 
   3359             v_float64x2 res0 = v_scale / f0;
   3360             v_float64x2 res1 = v_scale / f1;
   3361 
   3362             res0 = v_select(f0 == v_zero, v_zero, res0);
   3363             res1 = v_select(f1 == v_zero, v_zero, res1);
   3364 
   3365             v_store(dst + x, res0);
   3366             v_store(dst + x + 2, res1);
   3367         }
   3368 
   3369         return x;
   3370     }
   3371 };
   3372 
   3373 #endif
   3374 
   3375 #endif
   3376 
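// div_i / div_f: element-wise division with an extra scale factor,
// dst = saturate_cast<T>(src1 * scale / src2), with the convention that a
// zero divisor produces 0. The SIMD functor consumes the leading elements of
// each row; the integer variant converts the scale to float for the scalar tail.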
   3377 template<typename T> static void
   3378 div_i( const T* src1, size_t step1, const T* src2, size_t step2,
   3379       T* dst, size_t step, Size size, double scale )
   3380 {
   3381     step1 /= sizeof(src1[0]);
   3382     step2 /= sizeof(src2[0]);
   3383     step /= sizeof(dst[0]);
   3384 
   3385     Div_SIMD<T> vop;
   3386     float scale_f = (float)scale;
   3387 
   3388     for( ; size.height--; src1 += step1, src2 += step2, dst += step )
   3389     {
   3390         int i = vop(src1, src2, dst, size.width, scale);
   3391         for( ; i < size.width; i++ )
   3392         {
   3393             T num = src1[i], denom = src2[i];
   3394             dst[i] = denom != 0 ? saturate_cast<T>(num*scale_f/denom) : (T)0;
   3395         }
   3396     }
   3397 }
   3398 
   3399 template<typename T> static void
   3400 div_f( const T* src1, size_t step1, const T* src2, size_t step2,
   3401       T* dst, size_t step, Size size, double scale )
   3402 {
   3403     T scale_f = (T)scale;
   3404     step1 /= sizeof(src1[0]);
   3405     step2 /= sizeof(src2[0]);
   3406     step /= sizeof(dst[0]);
   3407 
   3408     Div_SIMD<T> vop;
   3409 
   3410     for( ; size.height--; src1 += step1, src2 += step2, dst += step )
   3411     {
   3412         int i = vop(src1, src2, dst, size.width, scale);
   3413         for( ; i < size.width; i++ )
   3414         {
   3415             T num = src1[i], denom = src2[i];
   3416             dst[i] = denom != 0 ? saturate_cast<T>(num*scale_f/denom) : (T)0;
   3417         }
   3418     }
   3419 }
   3420 
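// recip_i / recip_f: dst = saturate_cast<T>(scale / src2). The first source
// pointer and step are unused but kept so these functions match the common
// BinaryFunc signature used by the dispatch tables below.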
   3421 template<typename T> static void
   3422 recip_i( const T*, size_t, const T* src2, size_t step2,
   3423          T* dst, size_t step, Size size, double scale )
   3424 {
   3425     step2 /= sizeof(src2[0]);
   3426     step /= sizeof(dst[0]);
   3427 
   3428     Recip_SIMD<T> vop;
   3429     float scale_f = (float)scale;
   3430 
   3431     for( ; size.height--; src2 += step2, dst += step )
   3432     {
   3433         int i = vop(src2, dst, size.width, scale);
   3434         for( ; i < size.width; i++ )
   3435         {
   3436             T denom = src2[i];
   3437             dst[i] = denom != 0 ? saturate_cast<T>(scale_f/denom) : (T)0;
   3438         }
   3439     }
   3440 }
   3441 
   3442 template<typename T> static void
   3443 recip_f( const T*, size_t, const T* src2, size_t step2,
   3444          T* dst, size_t step, Size size, double scale )
   3445 {
   3446     T scale_f = (T)scale;
   3447     step2 /= sizeof(src2[0]);
   3448     step /= sizeof(dst[0]);
   3449 
   3450     Recip_SIMD<T> vop;
   3451 
   3452     for( ; size.height--; src2 += step2, dst += step )
   3453     {
   3454         int i = vop(src2, dst, size.width, scale);
   3455         for( ; i < size.width; i++ )
   3456         {
   3457             T denom = src2[i];
   3458             dst[i] = denom != 0 ? saturate_cast<T>(scale_f/denom) : (T)0;
   3459         }
   3460     }
   3461 }
   3462 
   3463 
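// Depth-specific wrappers adapting mul_/div_/recip_ to the BinaryFunc
// signature; the scale arrives as a double behind an opaque void* pointer.
// For 8u/16u/16s/32f multiplication with scale == 1 an IPP fast path is
// attempted first, and the generic kernel runs only if IPP declines.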
   3464 static void mul8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2,
   3465                    uchar* dst, size_t step, Size sz, void* scale)
   3466 {
   3467     float fscale = (float)*(const double*)scale;
   3468 #if defined HAVE_IPP
   3469     CV_IPP_CHECK()
   3470     {
   3471         if (std::fabs(fscale - 1) <= FLT_EPSILON)
   3472         {
   3473             if (ippiMul_8u_C1RSfs(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz), 0) >= 0)
   3474             {
   3475                 CV_IMPL_ADD(CV_IMPL_IPP);
   3476                 return;
   3477             }
   3478             setIppErrorStatus();
   3479         }
   3480     }
   3481 #endif
   3482     mul_(src1, step1, src2, step2, dst, step, sz, fscale);
   3483 }
   3484 
   3485 static void mul8s( const schar* src1, size_t step1, const schar* src2, size_t step2,
   3486                    schar* dst, size_t step, Size sz, void* scale)
   3487 {
   3488     mul_(src1, step1, src2, step2, dst, step, sz, (float)*(const double*)scale);
   3489 }
   3490 
   3491 static void mul16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2,
   3492                     ushort* dst, size_t step, Size sz, void* scale)
   3493 {
   3494     float fscale = (float)*(const double*)scale;
   3495 #if defined HAVE_IPP
   3496     CV_IPP_CHECK()
   3497     {
   3498         if (std::fabs(fscale - 1) <= FLT_EPSILON)
   3499         {
   3500             if (ippiMul_16u_C1RSfs(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz), 0) >= 0)
   3501             {
   3502                 CV_IMPL_ADD(CV_IMPL_IPP);
   3503                 return;
   3504             }
   3505             setIppErrorStatus();
   3506         }
   3507     }
   3508 #endif
   3509     mul_(src1, step1, src2, step2, dst, step, sz, fscale);
   3510 }
   3511 
   3512 static void mul16s( const short* src1, size_t step1, const short* src2, size_t step2,
   3513                     short* dst, size_t step, Size sz, void* scale)
   3514 {
   3515     float fscale = (float)*(const double*)scale;
   3516 #if defined HAVE_IPP
   3517     CV_IPP_CHECK()
   3518     {
   3519         if (std::fabs(fscale - 1) <= FLT_EPSILON)
   3520         {
   3521             if (ippiMul_16s_C1RSfs(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz), 0) >= 0)
   3522             {
   3523                 CV_IMPL_ADD(CV_IMPL_IPP);
   3524                 return;
   3525             }
   3526             setIppErrorStatus();
   3527         }
   3528     }
   3529 #endif
   3530     mul_(src1, step1, src2, step2, dst, step, sz, fscale);
   3531 }
   3532 
   3533 static void mul32s( const int* src1, size_t step1, const int* src2, size_t step2,
   3534                     int* dst, size_t step, Size sz, void* scale)
   3535 {
   3536     mul_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
   3537 }
   3538 
   3539 static void mul32f( const float* src1, size_t step1, const float* src2, size_t step2,
   3540                     float* dst, size_t step, Size sz, void* scale)
   3541 {
   3542     float fscale = (float)*(const double*)scale;
   3543 #if defined HAVE_IPP
   3544     CV_IPP_CHECK()
   3545     {
   3546         if (std::fabs(fscale - 1) <= FLT_EPSILON)
   3547         {
   3548             if (ippiMul_32f_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz)) >= 0)
   3549             {
   3550                 CV_IMPL_ADD(CV_IMPL_IPP);
   3551                 return;
   3552             }
   3553             setIppErrorStatus();
   3554         }
   3555     }
   3556 #endif
   3557     mul_(src1, step1, src2, step2, dst, step, sz, fscale);
   3558 }
   3559 
   3560 static void mul64f( const double* src1, size_t step1, const double* src2, size_t step2,
   3561                     double* dst, size_t step, Size sz, void* scale)
   3562 {
   3563     mul_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
   3564 }
   3565 
   3566 static void div8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2,
   3567                    uchar* dst, size_t step, Size sz, void* scale)
   3568 {
   3569     if( src1 )
   3570         div_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
   3571     else
   3572         recip_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
   3573 }
   3574 
   3575 static void div8s( const schar* src1, size_t step1, const schar* src2, size_t step2,
   3576                   schar* dst, size_t step, Size sz, void* scale)
   3577 {
   3578     div_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
   3579 }
   3580 
   3581 static void div16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2,
   3582                     ushort* dst, size_t step, Size sz, void* scale)
   3583 {
   3584     div_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
   3585 }
   3586 
   3587 static void div16s( const short* src1, size_t step1, const short* src2, size_t step2,
   3588                     short* dst, size_t step, Size sz, void* scale)
   3589 {
   3590     div_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
   3591 }
   3592 
   3593 static void div32s( const int* src1, size_t step1, const int* src2, size_t step2,
   3594                     int* dst, size_t step, Size sz, void* scale)
   3595 {
   3596     div_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
   3597 }
   3598 
   3599 static void div32f( const float* src1, size_t step1, const float* src2, size_t step2,
   3600                     float* dst, size_t step, Size sz, void* scale)
   3601 {
   3602     div_f(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
   3603 }
   3604 
   3605 static void div64f( const double* src1, size_t step1, const double* src2, size_t step2,
   3606                     double* dst, size_t step, Size sz, void* scale)
   3607 {
   3608     div_f(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
   3609 }
   3610 
   3611 static void recip8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2,
   3612                   uchar* dst, size_t step, Size sz, void* scale)
   3613 {
   3614     recip_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
   3615 }
   3616 
   3617 static void recip8s( const schar* src1, size_t step1, const schar* src2, size_t step2,
   3618                   schar* dst, size_t step, Size sz, void* scale)
   3619 {
   3620     recip_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
   3621 }
   3622 
   3623 static void recip16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2,
   3624                    ushort* dst, size_t step, Size sz, void* scale)
   3625 {
   3626     recip_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
   3627 }
   3628 
   3629 static void recip16s( const short* src1, size_t step1, const short* src2, size_t step2,
   3630                    short* dst, size_t step, Size sz, void* scale)
   3631 {
   3632     recip_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
   3633 }
   3634 
   3635 static void recip32s( const int* src1, size_t step1, const int* src2, size_t step2,
   3636                    int* dst, size_t step, Size sz, void* scale)
   3637 {
   3638     recip_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
   3639 }
   3640 
   3641 static void recip32f( const float* src1, size_t step1, const float* src2, size_t step2,
   3642                    float* dst, size_t step, Size sz, void* scale)
   3643 {
   3644     recip_f(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
   3645 }
   3646 
   3647 static void recip64f( const double* src1, size_t step1, const double* src2, size_t step2,
   3648                    double* dst, size_t step, Size sz, void* scale)
   3649 {
   3650     recip_f(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
   3651 }
   3652 
   3653 
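// Function tables indexed by matrix depth (CV_8U, CV_8S, CV_16U, CV_16S,
// CV_32S, CV_32F, CV_64F); the trailing 0 fills the unused eighth depth slot.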
   3654 static BinaryFunc* getMulTab()
   3655 {
   3656     static BinaryFunc mulTab[] =
   3657     {
   3658         (BinaryFunc)mul8u, (BinaryFunc)mul8s, (BinaryFunc)mul16u,
   3659         (BinaryFunc)mul16s, (BinaryFunc)mul32s, (BinaryFunc)mul32f,
   3660         (BinaryFunc)mul64f, 0
   3661     };
   3662 
   3663     return mulTab;
   3664 }
   3665 
   3666 static BinaryFunc* getDivTab()
   3667 {
   3668     static BinaryFunc divTab[] =
   3669     {
   3670         (BinaryFunc)div8u, (BinaryFunc)div8s, (BinaryFunc)div16u,
   3671         (BinaryFunc)div16s, (BinaryFunc)div32s, (BinaryFunc)div32f,
   3672         (BinaryFunc)div64f, 0
   3673     };
   3674 
   3675     return divTab;
   3676 }
   3677 
   3678 static BinaryFunc* getRecipTab()
   3679 {
   3680     static BinaryFunc recipTab[] =
   3681     {
   3682         (BinaryFunc)recip8u, (BinaryFunc)recip8s, (BinaryFunc)recip16u,
   3683         (BinaryFunc)recip16s, (BinaryFunc)recip32s, (BinaryFunc)recip32f,
   3684         (BinaryFunc)recip64f, 0
   3685     };
   3686 
   3687     return recipTab;
   3688 }
   3689 
   3690 }
   3691 
   3692 void cv::multiply(InputArray src1, InputArray src2,
   3693                   OutputArray dst, double scale, int dtype)
   3694 {
   3695     arithm_op(src1, src2, dst, noArray(), dtype, getMulTab(),
   3696               true, &scale, std::abs(scale - 1.0) < DBL_EPSILON ? OCL_OP_MUL : OCL_OP_MUL_SCALE);
   3697 }
   3698 
   3699 void cv::divide(InputArray src1, InputArray src2,
   3700                 OutputArray dst, double scale, int dtype)
   3701 {
   3702     arithm_op(src1, src2, dst, noArray(), dtype, getDivTab(), true, &scale, OCL_OP_DIV_SCALE);
   3703 }
   3704 
   3705 void cv::divide(double scale, InputArray src2,
   3706                 OutputArray dst, int dtype)
   3707 {
   3708     arithm_op(src2, src2, dst, noArray(), dtype, getRecipTab(), true, &scale, OCL_OP_RECIP_SCALE);
   3709 }
   3710 
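// A minimal usage sketch of the public entry points above (illustrative
// variable names only, not part of this file):
//
//     cv::Mat a, b, prod, quot, inv;        // a and b: same size and type
//     cv::multiply(a, b, prod, 1.0/255);    // prod = saturate(a .* b / 255)
//     cv::divide(a, b, quot, 2.0);          // quot = saturate(2 * a ./ b), 0 where b == 0
//     cv::divide(1.0, b, inv, CV_32F);      // inv  = 1 ./ b as float, 0 where b == 0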
   3711 /****************************************************************************************\
   3712 *                                      addWeighted                                       *
   3713 \****************************************************************************************/
   3714 
   3715 namespace cv
   3716 {
   3717 
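// AddWeighted_SIMD: default no-op returning 0 processed elements. The
// SSE2/SSE4.1 and NEON specializations below vectorize
// dst = src1*alpha + src2*beta + gamma, 8 elements per iteration.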
   3718 template <typename T, typename WT>
   3719 struct AddWeighted_SIMD
   3720 {
   3721     int operator() (const T *, const T *, T *, int, WT, WT, WT) const
   3722     {
   3723         return 0;
   3724     }
   3725 };
   3726 
   3727 #if CV_SSE2
   3728 
   3729 template <>
   3730 struct AddWeighted_SIMD<schar, float>
   3731 {
   3732     AddWeighted_SIMD()
   3733     {
   3734         haveSSE2 = checkHardwareSupport(CV_CPU_SSE2);
   3735     }
   3736 
   3737     int operator() (const schar * src1, const schar * src2, schar * dst, int width, float alpha, float beta, float gamma) const
   3738     {
   3739         int x = 0;
   3740 
   3741         if (!haveSSE2)
   3742             return x;
   3743 
   3744         __m128i v_zero = _mm_setzero_si128();
   3745         __m128 v_alpha = _mm_set1_ps(alpha), v_beta = _mm_set1_ps(beta),
   3746                v_gamma = _mm_set1_ps(gamma);
   3747 
   3748         for( ; x <= width - 8; x += 8 )
   3749         {
   3750             __m128i v_src1 = _mm_loadl_epi64((const __m128i *)(src1 + x));
   3751             __m128i v_src2 = _mm_loadl_epi64((const __m128i *)(src2 + x));
   3752 
   3753             __m128i v_src1_p = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src1), 8);
   3754             __m128i v_src2_p = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src2), 8);
   3755 
   3756             __m128 v_dstf0 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1_p), 16)), v_alpha);
   3757             v_dstf0 = _mm_add_ps(_mm_add_ps(v_dstf0, v_gamma),
   3758                                  _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2_p), 16)), v_beta));
   3759 
   3760             __m128 v_dstf1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1_p), 16)), v_alpha);
   3761             v_dstf1 = _mm_add_ps(_mm_add_ps(v_dstf1, v_gamma),
   3762                                  _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2_p), 16)), v_beta));
   3763 
   3764             __m128i v_dst16 = _mm_packs_epi32(_mm_cvtps_epi32(v_dstf0),
   3765                                               _mm_cvtps_epi32(v_dstf1));
   3766 
   3767             _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst16, v_zero));
   3768         }
   3769 
   3770         return x;
   3771     }
   3772 
   3773     bool haveSSE2;
   3774 };
   3775 
   3776 template <>
   3777 struct AddWeighted_SIMD<short, float>
   3778 {
   3779     AddWeighted_SIMD()
   3780     {
   3781         haveSSE2 = checkHardwareSupport(CV_CPU_SSE2);
   3782     }
   3783 
   3784     int operator() (const short * src1, const short * src2, short * dst, int width, float alpha, float beta, float gamma) const
   3785     {
   3786         int x = 0;
   3787 
   3788         if (!haveSSE2)
   3789             return x;
   3790 
   3791         __m128i v_zero = _mm_setzero_si128();
   3792         __m128 v_alpha = _mm_set1_ps(alpha), v_beta = _mm_set1_ps(beta),
   3793                v_gamma = _mm_set1_ps(gamma);
   3794 
   3795         for( ; x <= width - 8; x += 8 )
   3796         {
   3797             __m128i v_src1 = _mm_loadu_si128((const __m128i *)(src1 + x));
   3798             __m128i v_src2 = _mm_loadu_si128((const __m128i *)(src2 + x));
   3799 
   3800             __m128 v_dstf0 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16)), v_alpha);
   3801             v_dstf0 = _mm_add_ps(_mm_add_ps(v_dstf0, v_gamma),
   3802                                  _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16)), v_beta));
   3803 
   3804             __m128 v_dstf1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16)), v_alpha);
   3805             v_dstf1 = _mm_add_ps(_mm_add_ps(v_dstf1, v_gamma),
   3806                                  _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16)), v_beta));
   3807 
   3808             _mm_storeu_si128((__m128i *)(dst + x), _mm_packs_epi32(_mm_cvtps_epi32(v_dstf0),
   3809                                                                    _mm_cvtps_epi32(v_dstf1)));
   3810         }
   3811 
   3812         return x;
   3813     }
   3814 
   3815     bool haveSSE2;
   3816 };
   3817 
   3818 #if CV_SSE4_1
   3819 
   3820 template <>
   3821 struct AddWeighted_SIMD<ushort, float>
   3822 {
   3823     AddWeighted_SIMD()
   3824     {
   3825         haveSSE4_1 = checkHardwareSupport(CV_CPU_SSE4_1);
   3826     }
   3827 
   3828     int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, float alpha, float beta, float gamma) const
   3829     {
   3830         int x = 0;
   3831 
   3832         if (!haveSSE4_1)
   3833             return x;
   3834 
   3835         __m128i v_zero = _mm_setzero_si128();
   3836         __m128 v_alpha = _mm_set1_ps(alpha), v_beta = _mm_set1_ps(beta),
   3837                v_gamma = _mm_set1_ps(gamma);
   3838 
   3839         for( ; x <= width - 8; x += 8 )
   3840         {
   3841             __m128i v_src1 = _mm_loadu_si128((const __m128i *)(src1 + x));
   3842             __m128i v_src2 = _mm_loadu_si128((const __m128i *)(src2 + x));
   3843 
   3844             __m128 v_dstf0 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src1, v_zero)), v_alpha);
   3845             v_dstf0 = _mm_add_ps(_mm_add_ps(v_dstf0, v_gamma),
   3846                                  _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src2, v_zero)), v_beta));
   3847 
   3848             __m128 v_dstf1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src1, v_zero)), v_alpha);
   3849             v_dstf1 = _mm_add_ps(_mm_add_ps(v_dstf1, v_gamma),
   3850                                  _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src2, v_zero)), v_beta));
   3851 
   3852             _mm_storeu_si128((__m128i *)(dst + x), _mm_packus_epi32(_mm_cvtps_epi32(v_dstf0),
   3853                                                                     _mm_cvtps_epi32(v_dstf1)));
   3854         }
   3855 
   3856         return x;
   3857     }
   3858 
   3859     bool haveSSE4_1;
   3860 };
   3861 
   3862 #endif
   3863 
   3864 #elif CV_NEON
   3865 
   3866 template <>
   3867 struct AddWeighted_SIMD<schar, float>
   3868 {
   3869     int operator() (const schar * src1, const schar * src2, schar * dst, int width, float alpha, float beta, float gamma) const
   3870     {
   3871         int x = 0;
   3872 
   3873         float32x4_t g = vdupq_n_f32 (gamma);
   3874 
   3875         for( ; x <= width - 8; x += 8 )
   3876         {
   3877             int8x8_t in1 = vld1_s8(src1 + x);
   3878             int16x8_t in1_16 = vmovl_s8(in1);
   3879             float32x4_t in1_f_l = vcvtq_f32_s32(vmovl_s16(vget_low_s16(in1_16)));
   3880             float32x4_t in1_f_h = vcvtq_f32_s32(vmovl_s16(vget_high_s16(in1_16)));
   3881 
   3882             int8x8_t in2 = vld1_s8(src2+x);
   3883             int16x8_t in2_16 = vmovl_s8(in2);
   3884             float32x4_t in2_f_l = vcvtq_f32_s32(vmovl_s16(vget_low_s16(in2_16)));
   3885             float32x4_t in2_f_h = vcvtq_f32_s32(vmovl_s16(vget_high_s16(in2_16)));
   3886 
   3887             float32x4_t out_f_l = vaddq_f32(vmulq_n_f32(in1_f_l, alpha), vmulq_n_f32(in2_f_l, beta));
   3888             float32x4_t out_f_h = vaddq_f32(vmulq_n_f32(in1_f_h, alpha), vmulq_n_f32(in2_f_h, beta));
   3889             out_f_l = vaddq_f32(out_f_l, g);
   3890             out_f_h = vaddq_f32(out_f_h, g);
   3891 
   3892             int16x4_t out_16_l = vqmovn_s32(cv_vrndq_s32_f32(out_f_l));
   3893             int16x4_t out_16_h = vqmovn_s32(cv_vrndq_s32_f32(out_f_h));
   3894 
   3895             int16x8_t out_16 = vcombine_s16(out_16_l, out_16_h);
   3896             int8x8_t out = vqmovn_s16(out_16);
   3897 
   3898             vst1_s8(dst + x, out);
   3899         }
   3900 
   3901         return x;
   3902     }
   3903 };
   3904 
   3905 template <>
   3906 struct AddWeighted_SIMD<ushort, float>
   3907 {
   3908     int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, float alpha, float beta, float gamma) const
   3909     {
   3910         int x = 0;
   3911 
   3912         float32x4_t g = vdupq_n_f32(gamma);
   3913 
   3914         for( ; x <= width - 8; x += 8 )
   3915         {
   3916             uint16x8_t v_src1 = vld1q_u16(src1 + x), v_src2 = vld1q_u16(src2 + x);
   3917 
   3918             float32x4_t v_s1 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), alpha);
   3919             float32x4_t v_s2 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2))), beta);
   3920             uint16x4_t v_dst1 = vqmovn_u32(cv_vrndq_u32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g)));
   3921 
   3922             v_s1 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), alpha);
   3923             v_s2 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2))), beta);
   3924             uint16x4_t v_dst2 = vqmovn_u32(cv_vrndq_u32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g)));
   3925 
   3926             vst1q_u16(dst + x, vcombine_u16(v_dst1, v_dst2));
   3927         }
   3928 
   3929         return x;
   3930     }
   3931 };
   3932 
   3933 template <>
   3934 struct AddWeighted_SIMD<short, float>
   3935 {
   3936     int operator() (const short * src1, const short * src2, short * dst, int width, float alpha, float beta, float gamma) const
   3937     {
   3938         int x = 0;
   3939 
   3940         float32x4_t g = vdupq_n_f32(gamma);
   3941 
   3942         for( ; x <= width - 8; x += 8 )
   3943         {
   3944             int16x8_t v_src1 = vld1q_s16(src1 + x), v_src2 = vld1q_s16(src2 + x);
   3945 
   3946             float32x4_t v_s1 = vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))), alpha);
   3947             float32x4_t v_s2 = vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2))), beta);
   3948             int16x4_t v_dst1 = vqmovn_s32(cv_vrndq_s32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g)));
   3949 
   3950             v_s1 = vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))), alpha);
   3951             v_s2 = vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2))), beta);
   3952             int16x4_t v_dst2 = vqmovn_s32(cv_vrndq_s32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g)));
   3953 
   3954             vst1q_s16(dst + x, vcombine_s16(v_dst1, v_dst2));
   3955         }
   3956 
   3957         return x;
   3958     }
   3959 };
   3960 
   3961 #endif
   3962 
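// Generic weighted-sum kernel behind addWeighted: SIMD prefix via the functor
// above, then an optionally unrolled scalar tail using saturate_cast.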
   3963 template<typename T, typename WT> static void
   3964 addWeighted_( const T* src1, size_t step1, const T* src2, size_t step2,
   3965               T* dst, size_t step, Size size, void* _scalars )
   3966 {
   3967     const double* scalars = (const double*)_scalars;
   3968     WT alpha = (WT)scalars[0], beta = (WT)scalars[1], gamma = (WT)scalars[2];
   3969     step1 /= sizeof(src1[0]);
   3970     step2 /= sizeof(src2[0]);
   3971     step /= sizeof(dst[0]);
   3972 
   3973     AddWeighted_SIMD<T, WT> vop;
   3974 
   3975     for( ; size.height--; src1 += step1, src2 += step2, dst += step )
   3976     {
   3977         int x = vop(src1, src2, dst, size.width, alpha, beta, gamma);
   3978         #if CV_ENABLE_UNROLLED
   3979         for( ; x <= size.width - 4; x += 4 )
   3980         {
   3981             T t0 = saturate_cast<T>(src1[x]*alpha + src2[x]*beta + gamma);
   3982             T t1 = saturate_cast<T>(src1[x+1]*alpha + src2[x+1]*beta + gamma);
   3983             dst[x] = t0; dst[x+1] = t1;
   3984 
   3985             t0 = saturate_cast<T>(src1[x+2]*alpha + src2[x+2]*beta + gamma);
   3986             t1 = saturate_cast<T>(src1[x+3]*alpha + src2[x+3]*beta + gamma);
   3987             dst[x+2] = t0; dst[x+3] = t1;
   3988         }
   3989         #endif
   3990         for( ; x < size.width; x++ )
   3991             dst[x] = saturate_cast<T>(src1[x]*alpha + src2[x]*beta + gamma);
   3992     }
   3993 }
   3994 
   3995 
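         // 8-bit specialization of the weighted sum: pixels are widened to float, scaled by alpha and beta,
         // offset by gamma, then rounded and saturated back to uchar. The SSE2 and NEON paths do this
         // 8 pixels at a time; the scalar loops below handle the remainder of each row.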
   3996 static void
   3997 addWeighted8u( const uchar* src1, size_t step1,
   3998                const uchar* src2, size_t step2,
   3999                uchar* dst, size_t step, Size size,
   4000                void* _scalars )
   4001 {
   4002     const double* scalars = (const double*)_scalars;
   4003     float alpha = (float)scalars[0], beta = (float)scalars[1], gamma = (float)scalars[2];
   4004 
   4005     for( ; size.height--; src1 += step1, src2 += step2, dst += step )
   4006     {
   4007         int x = 0;
   4008 
   4009 #if CV_SSE2
   4010         if( USE_SSE2 )
   4011         {
   4012             __m128 a4 = _mm_set1_ps(alpha), b4 = _mm_set1_ps(beta), g4 = _mm_set1_ps(gamma);
   4013             __m128i z = _mm_setzero_si128();
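                     // z is used for zero-extension: unpacking u8 with zero yields u16, unpacking again
                     // yields s32, which _mm_cvtepi32_ps then converts to float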
   4014 
   4015             for( ; x <= size.width - 8; x += 8 )
   4016             {
   4017                 __m128i u = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(src1 + x)), z);
   4018                 __m128i v = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(src2 + x)), z);
   4019 
   4020                 __m128 u0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(u, z));
   4021                 __m128 u1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(u, z));
   4022                 __m128 v0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v, z));
   4023                 __m128 v1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v, z));
   4024 
   4025                 u0 = _mm_add_ps(_mm_mul_ps(u0, a4), _mm_mul_ps(v0, b4));
   4026                 u1 = _mm_add_ps(_mm_mul_ps(u1, a4), _mm_mul_ps(v1, b4));
   4027                 u0 = _mm_add_ps(u0, g4); u1 = _mm_add_ps(u1, g4);
   4028 
   4029                 u = _mm_packs_epi32(_mm_cvtps_epi32(u0), _mm_cvtps_epi32(u1));
   4030                 u = _mm_packus_epi16(u, u);
   4031 
   4032                 _mm_storel_epi64((__m128i*)(dst + x), u);
   4033             }
   4034         }
   4035 #elif CV_NEON
   4036         float32x4_t g = vdupq_n_f32 (gamma);
   4037 
   4038         for( ; x <= size.width - 8; x += 8 )
   4039         {
   4040             uint8x8_t in1 = vld1_u8(src1+x);
   4041             uint16x8_t in1_16 = vmovl_u8(in1);
   4042             float32x4_t in1_f_l = vcvtq_f32_u32(vmovl_u16(vget_low_u16(in1_16)));
   4043             float32x4_t in1_f_h = vcvtq_f32_u32(vmovl_u16(vget_high_u16(in1_16)));
   4044 
   4045             uint8x8_t in2 = vld1_u8(src2+x);
   4046             uint16x8_t in2_16 = vmovl_u8(in2);
   4047             float32x4_t in2_f_l = vcvtq_f32_u32(vmovl_u16(vget_low_u16(in2_16)));
   4048             float32x4_t in2_f_h = vcvtq_f32_u32(vmovl_u16(vget_high_u16(in2_16)));
   4049 
   4050             float32x4_t out_f_l = vaddq_f32(vmulq_n_f32(in1_f_l, alpha), vmulq_n_f32(in2_f_l, beta));
   4051             float32x4_t out_f_h = vaddq_f32(vmulq_n_f32(in1_f_h, alpha), vmulq_n_f32(in2_f_h, beta));
   4052             out_f_l = vaddq_f32(out_f_l, g);
   4053             out_f_h = vaddq_f32(out_f_h, g);
   4054 
   4055             uint16x4_t out_16_l = vqmovun_s32(cv_vrndq_s32_f32(out_f_l));
   4056             uint16x4_t out_16_h = vqmovun_s32(cv_vrndq_s32_f32(out_f_h));
   4057 
   4058             uint16x8_t out_16 = vcombine_u16(out_16_l, out_16_h);
   4059             uint8x8_t out = vqmovn_u16(out_16);
   4060 
   4061             vst1_u8(dst+x, out);
   4062         }
   4063 #endif
   4064         #if CV_ENABLE_UNROLLED
   4065         for( ; x <= size.width - 4; x += 4 )
   4066         {
   4067             float t0, t1;
   4068             t0 = CV_8TO32F(src1[x])*alpha + CV_8TO32F(src2[x])*beta + gamma;
   4069             t1 = CV_8TO32F(src1[x+1])*alpha + CV_8TO32F(src2[x+1])*beta + gamma;
   4070 
   4071             dst[x] = saturate_cast<uchar>(t0);
   4072             dst[x+1] = saturate_cast<uchar>(t1);
   4073 
   4074             t0 = CV_8TO32F(src1[x+2])*alpha + CV_8TO32F(src2[x+2])*beta + gamma;
   4075             t1 = CV_8TO32F(src1[x+3])*alpha + CV_8TO32F(src2[x+3])*beta + gamma;
   4076 
   4077             dst[x+2] = saturate_cast<uchar>(t0);
   4078             dst[x+3] = saturate_cast<uchar>(t1);
   4079         }
   4080         #endif
   4081 
   4082         for( ; x < size.width; x++ )
   4083         {
   4084             float t0 = CV_8TO32F(src1[x])*alpha + CV_8TO32F(src2[x])*beta + gamma;
   4085             dst[x] = saturate_cast<uchar>(t0);
   4086         }
   4087     }
   4088 }
   4089 
   4090 static void addWeighted8s( const schar* src1, size_t step1, const schar* src2, size_t step2,
   4091                            schar* dst, size_t step, Size sz, void* scalars )
   4092 {
   4093     addWeighted_<schar, float>(src1, step1, src2, step2, dst, step, sz, scalars);
   4094 }
   4095 
   4096 static void addWeighted16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2,
   4097                             ushort* dst, size_t step, Size sz, void* scalars )
   4098 {
   4099     addWeighted_<ushort, float>(src1, step1, src2, step2, dst, step, sz, scalars);
   4100 }
   4101 
   4102 static void addWeighted16s( const short* src1, size_t step1, const short* src2, size_t step2,
   4103                             short* dst, size_t step, Size sz, void* scalars )
   4104 {
   4105     addWeighted_<short, float>(src1, step1, src2, step2, dst, step, sz, scalars);
   4106 }
   4107 
   4108 static void addWeighted32s( const int* src1, size_t step1, const int* src2, size_t step2,
   4109                             int* dst, size_t step, Size sz, void* scalars )
   4110 {
   4111     addWeighted_<int, double>(src1, step1, src2, step2, dst, step, sz, scalars);
   4112 }
   4113 
   4114 static void addWeighted32f( const float* src1, size_t step1, const float* src2, size_t step2,
   4115                             float* dst, size_t step, Size sz, void* scalars )
   4116 {
   4117     addWeighted_<float, double>(src1, step1, src2, step2, dst, step, sz, scalars);
   4118 }
   4119 
   4120 static void addWeighted64f( const double* src1, size_t step1, const double* src2, size_t step2,
   4121                             double* dst, size_t step, Size sz, void* scalars )
   4122 {
   4123     addWeighted_<double, double>(src1, step1, src2, step2, dst, step, sz, scalars);
   4124 }
   4125 
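         // Per-depth dispatch table, indexed by CV_8U..CV_64F. Depths up to 16 bits use float intermediates,
         // while 32-bit integer and floating-point depths use double to preserve the precision of the weights.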
   4126 static BinaryFunc* getAddWeightedTab()
   4127 {
   4128     static BinaryFunc addWeightedTab[] =
   4129     {
   4130         (BinaryFunc)GET_OPTIMIZED(addWeighted8u), (BinaryFunc)GET_OPTIMIZED(addWeighted8s), (BinaryFunc)GET_OPTIMIZED(addWeighted16u),
   4131         (BinaryFunc)GET_OPTIMIZED(addWeighted16s), (BinaryFunc)GET_OPTIMIZED(addWeighted32s), (BinaryFunc)addWeighted32f,
   4132         (BinaryFunc)addWeighted64f, 0
   4133     };
   4134 
   4135     return addWeightedTab;
   4136 }
   4137 
   4138 }
   4139 
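         // Computes dst = src1*alpha + src2*beta + gamma with saturation. A minimal usage sketch
         // (a, b and d here are hypothetical cv::Mat images of the same size and type):
         //     cv::addWeighted(a, 0.7, b, 0.3, 0.0, d);   // d = 0.7*a + 0.3*b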
   4140 void cv::addWeighted( InputArray src1, double alpha, InputArray src2,
   4141                       double beta, double gamma, OutputArray dst, int dtype )
   4142 {
   4143     double scalars[] = {alpha, beta, gamma};
   4144     arithm_op(src1, src2, dst, noArray(), dtype, getAddWeightedTab(), true, scalars, OCL_OP_ADDW);
   4145 }
   4146 
   4147 
   4148 /****************************************************************************************\
   4149 *                                          compare                                       *
   4150 \****************************************************************************************/
   4151 
   4152 namespace cv
   4153 {
   4154 
   4155 template <typename T>
   4156 struct Cmp_SIMD
   4157 {
   4158     explicit Cmp_SIMD(int)
   4159     {
   4160     }
   4161 
   4162     int operator () (const T *, const T *, uchar *, int) const
   4163     {
   4164         return 0;
   4165     }
   4166 };
   4167 
   4168 #if CV_NEON
   4169 
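         // Only CMP_GT, CMP_LE, CMP_EQ and CMP_NE are vectorized here: cmp_() below canonicalizes
         // CMP_GE and CMP_LT by swapping the operands, so only these four codes ever reach the functor.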
   4170 template <>
   4171 struct Cmp_SIMD<schar>
   4172 {
   4173     explicit Cmp_SIMD(int code_) :
   4174         code(code_)
   4175     {
   4176         CV_Assert(code == CMP_GT || code == CMP_LE ||
   4177                   code == CMP_EQ || code == CMP_NE);
   4178 
   4179         v_mask = vdupq_n_u8(255);
   4180     }
   4181 
   4182     int operator () (const schar * src1, const schar * src2, uchar * dst, int width) const
   4183     {
   4184         int x = 0;
   4185 
   4186         if (code == CMP_GT)
   4187             for ( ; x <= width - 16; x += 16)
   4188                 vst1q_u8(dst + x, vcgtq_s8(vld1q_s8(src1 + x), vld1q_s8(src2 + x)));
   4189         else if (code == CMP_LE)
   4190             for ( ; x <= width - 16; x += 16)
   4191                 vst1q_u8(dst + x, vcleq_s8(vld1q_s8(src1 + x), vld1q_s8(src2 + x)));
   4192         else if (code == CMP_EQ)
   4193             for ( ; x <= width - 16; x += 16)
   4194                 vst1q_u8(dst + x, vceqq_s8(vld1q_s8(src1 + x), vld1q_s8(src2 + x)));
   4195         else if (code == CMP_NE)
   4196             for ( ; x <= width - 16; x += 16)
   4197                 vst1q_u8(dst + x, veorq_u8(vceqq_s8(vld1q_s8(src1 + x), vld1q_s8(src2 + x)), v_mask));
   4198 
   4199         return x;
   4200     }
   4201 
   4202     int code;
   4203     uint8x16_t v_mask;
   4204 };
   4205 
   4206 template <>
   4207 struct Cmp_SIMD<ushort>
   4208 {
   4209     explicit Cmp_SIMD(int code_) :
   4210         code(code_)
   4211     {
   4212         CV_Assert(code == CMP_GT || code == CMP_LE ||
   4213                   code == CMP_EQ || code == CMP_NE);
   4214 
   4215         v_mask = vdup_n_u8(255);
   4216     }
   4217 
   4218     int operator () (const ushort * src1, const ushort * src2, uchar * dst, int width) const
   4219     {
   4220         int x = 0;
   4221 
   4222         if (code == CMP_GT)
   4223             for ( ; x <= width - 8; x += 8)
   4224             {
   4225                 uint16x8_t v_dst = vcgtq_u16(vld1q_u16(src1 + x), vld1q_u16(src2 + x));
   4226                 vst1_u8(dst + x, vmovn_u16(v_dst));
   4227             }
   4228         else if (code == CMP_LE)
   4229             for ( ; x <= width - 8; x += 8)
   4230             {
   4231                 uint16x8_t v_dst = vcleq_u16(vld1q_u16(src1 + x), vld1q_u16(src2 + x));
   4232                 vst1_u8(dst + x, vmovn_u16(v_dst));
   4233             }
   4234         else if (code == CMP_EQ)
   4235             for ( ; x <= width - 8; x += 8)
   4236             {
   4237                 uint16x8_t v_dst = vceqq_u16(vld1q_u16(src1 + x), vld1q_u16(src2 + x));
   4238                 vst1_u8(dst + x, vmovn_u16(v_dst));
   4239             }
   4240         else if (code == CMP_NE)
   4241             for ( ; x <= width - 8; x += 8)
   4242             {
   4243                 uint16x8_t v_dst = vceqq_u16(vld1q_u16(src1 + x), vld1q_u16(src2 + x));
   4244                 vst1_u8(dst + x, veor_u8(vmovn_u16(v_dst), v_mask));
   4245             }
   4246 
   4247         return x;
   4248     }
   4249 
   4250     int code;
   4251     uint8x8_t v_mask;
   4252 };
   4253 
   4254 template <>
   4255 struct Cmp_SIMD<int>
   4256 {
   4257     explicit Cmp_SIMD(int code_) :
   4258         code(code_)
   4259     {
   4260         CV_Assert(code == CMP_GT || code == CMP_LE ||
   4261                   code == CMP_EQ || code == CMP_NE);
   4262 
   4263         v_mask = vdup_n_u8(255);
   4264     }
   4265 
   4266     int operator () (const int * src1, const int * src2, uchar * dst, int width) const
   4267     {
   4268         int x = 0;
   4269 
   4270         if (code == CMP_GT)
   4271             for ( ; x <= width - 8; x += 8)
   4272             {
   4273                 uint32x4_t v_dst1 = vcgtq_s32(vld1q_s32(src1 + x), vld1q_s32(src2 + x));
   4274                 uint32x4_t v_dst2 = vcgtq_s32(vld1q_s32(src1 + x + 4), vld1q_s32(src2 + x + 4));
   4275                 vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2))));
   4276             }
   4277         else if (code == CMP_LE)
   4278             for ( ; x <= width - 8; x += 8)
   4279             {
   4280                 uint32x4_t v_dst1 = vcleq_s32(vld1q_s32(src1 + x), vld1q_s32(src2 + x));
   4281                 uint32x4_t v_dst2 = vcleq_s32(vld1q_s32(src1 + x + 4), vld1q_s32(src2 + x + 4));
   4282                 vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2))));
   4283             }
   4284         else if (code == CMP_EQ)
   4285             for ( ; x <= width - 8; x += 8)
   4286             {
   4287                 uint32x4_t v_dst1 = vceqq_s32(vld1q_s32(src1 + x), vld1q_s32(src2 + x));
   4288                 uint32x4_t v_dst2 = vceqq_s32(vld1q_s32(src1 + x + 4), vld1q_s32(src2 + x + 4));
   4289                 vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2))));
   4290             }
   4291         else if (code == CMP_NE)
   4292             for ( ; x <= width - 8; x += 8)
   4293             {
   4294                 uint32x4_t v_dst1 = vceqq_s32(vld1q_s32(src1 + x), vld1q_s32(src2 + x));
   4295                 uint32x4_t v_dst2 = vceqq_s32(vld1q_s32(src1 + x + 4), vld1q_s32(src2 + x + 4));
   4296                 uint8x8_t v_dst = vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2)));
   4297                 vst1_u8(dst + x, veor_u8(v_dst, v_mask));
   4298             }
   4299 
   4300         return x;
   4301     }
   4302 
   4303     int code;
   4304     uint8x8_t v_mask;
   4305 };
   4306 
   4307 template <>
   4308 struct Cmp_SIMD<float>
   4309 {
   4310     explicit Cmp_SIMD(int code_) :
   4311         code(code_)
   4312     {
   4313         CV_Assert(code == CMP_GT || code == CMP_LE ||
   4314                   code == CMP_EQ || code == CMP_NE);
   4315 
   4316         v_mask = vdup_n_u8(255);
   4317     }
   4318 
   4319     int operator () (const float * src1, const float * src2, uchar * dst, int width) const
   4320     {
   4321         int x = 0;
   4322 
   4323         if (code == CMP_GT)
   4324             for ( ; x <= width - 8; x += 8)
   4325             {
   4326                 uint32x4_t v_dst1 = vcgtq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x));
   4327                 uint32x4_t v_dst2 = vcgtq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4));
   4328                 vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2))));
   4329             }
   4330         else if (code == CMP_LE)
   4331             for ( ; x <= width - 8; x += 8)
   4332             {
   4333                 uint32x4_t v_dst1 = vcleq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x));
   4334                 uint32x4_t v_dst2 = vcleq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4));
   4335                 vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2))));
   4336             }
   4337         else if (code == CMP_EQ)
   4338             for ( ; x <= width - 8; x += 8)
   4339             {
   4340                 uint32x4_t v_dst1 = vceqq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x));
   4341                 uint32x4_t v_dst2 = vceqq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4));
   4342                 vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2))));
   4343             }
   4344         else if (code == CMP_NE)
   4345             for ( ; x <= width - 8; x += 8)
   4346             {
   4347                 uint32x4_t v_dst1 = vceqq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x));
   4348                 uint32x4_t v_dst2 = vceqq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4));
   4349                 uint8x8_t v_dst = vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2)));
   4350                 vst1_u8(dst + x, veor_u8(v_dst, v_mask));
   4351             }
   4352 
   4353         return x;
   4354     }
   4355 
   4356     int code;
   4357     uint8x8_t v_mask;
   4358 };
   4359 
   4360 #elif CV_SSE2
   4361 
   4362 template <>
   4363 struct Cmp_SIMD<schar>
   4364 {
   4365     explicit Cmp_SIMD(int code_) :
   4366         code(code_)
   4367     {
   4368         CV_Assert(code == CMP_GT || code == CMP_LE ||
   4369                   code == CMP_EQ || code == CMP_NE);
   4370 
   4371         haveSSE = checkHardwareSupport(CV_CPU_SSE2);
   4372 
   4373         v_mask = _mm_set1_epi8(-1);
   4374     }
   4375 
   4376     int operator () (const schar * src1, const schar * src2, uchar * dst, int width) const
   4377     {
   4378         int x = 0;
   4379 
   4380         if (!haveSSE)
   4381             return x;
   4382 
   4383         if (code == CMP_GT)
   4384             for ( ; x <= width - 16; x += 16)
   4385                 _mm_storeu_si128((__m128i *)(dst + x), _mm_cmpgt_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)),
   4386                                                                       _mm_loadu_si128((const __m128i *)(src2 + x))));
   4387         else if (code == CMP_LE)
   4388             for ( ; x <= width - 16; x += 16)
   4389             {
   4390                 __m128i v_gt = _mm_cmpgt_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)),
   4391                                               _mm_loadu_si128((const __m128i *)(src2 + x)));
   4392                 _mm_storeu_si128((__m128i *)(dst + x), _mm_xor_si128(v_mask, v_gt));
   4393             }
   4394         else if (code == CMP_EQ)
   4395             for ( ; x <= width - 16; x += 16)
   4396                 _mm_storeu_si128((__m128i *)(dst + x), _mm_cmpeq_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)),
   4397                                                                       _mm_loadu_si128((const __m128i *)(src2 + x))));
   4398         else if (code == CMP_NE)
   4399             for ( ; x <= width - 16; x += 16)
   4400             {
   4401                 __m128i v_eq = _mm_cmpeq_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)),
   4402                                               _mm_loadu_si128((const __m128i *)(src2 + x)));
   4403                 _mm_storeu_si128((__m128i *)(dst + x), _mm_xor_si128(v_mask, v_eq));
   4404             }
   4405 
   4406         return x;
   4407     }
   4408 
   4409     int code;
   4410     __m128i v_mask;
   4411     bool haveSSE;
   4412 };
   4413 
   4414 template <>
   4415 struct Cmp_SIMD<int>
   4416 {
   4417     explicit Cmp_SIMD(int code_) :
   4418         code(code_)
   4419     {
   4420         CV_Assert(code == CMP_GT || code == CMP_LE ||
   4421                   code == CMP_EQ || code == CMP_NE);
   4422 
   4423         haveSSE = checkHardwareSupport(CV_CPU_SSE2);
   4424 
   4425         v_mask = _mm_set1_epi32(0xffffffff);
   4426     }
   4427 
   4428     int operator () (const int * src1, const int * src2, uchar * dst, int width) const
   4429     {
   4430         int x = 0;
   4431 
   4432         if (!haveSSE)
   4433             return x;
   4434 
   4435         if (code == CMP_GT)
   4436             for ( ; x <= width - 8; x += 8)
   4437             {
   4438                 __m128i v_dst0 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)),
   4439                                                  _mm_loadu_si128((const __m128i *)(src2 + x)));
   4440                 __m128i v_dst1 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)),
   4441                                                  _mm_loadu_si128((const __m128i *)(src2 + x + 4)));
   4442 
   4443                 _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask));
   4444             }
   4445         else if (code == CMP_LE)
   4446             for ( ; x <= width - 8; x += 8)
   4447             {
   4448                 __m128i v_dst0 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)),
   4449                                                  _mm_loadu_si128((const __m128i *)(src2 + x)));
   4450                 __m128i v_dst1 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)),
   4451                                                  _mm_loadu_si128((const __m128i *)(src2 + x + 4)));
   4452 
   4453                 _mm_storel_epi64((__m128i *)(dst + x), _mm_xor_si128(_mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask), v_mask));
   4454             }
   4455         else if (code == CMP_EQ)
   4456             for ( ; x <= width - 8; x += 8)
   4457             {
   4458                 __m128i v_dst0 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)),
   4459                                                  _mm_loadu_si128((const __m128i *)(src2 + x)));
   4460                 __m128i v_dst1 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)),
   4461                                                  _mm_loadu_si128((const __m128i *)(src2 + x + 4)));
   4462 
   4463                 _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask));
   4464             }
   4465         else if (code == CMP_NE)
   4466             for ( ; x <= width - 8; x += 8)
   4467             {
   4468                 __m128i v_dst0 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)),
   4469                                                  _mm_loadu_si128((const __m128i *)(src2 + x)));
   4470                 __m128i v_dst1 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)),
   4471                                                  _mm_loadu_si128((const __m128i *)(src2 + x + 4)));
   4472 
   4473                 _mm_storel_epi64((__m128i *)(dst + x), _mm_xor_si128(v_mask, _mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask)));
   4474             }
   4475 
   4476         return x;
   4477     }
   4478 
   4479     int code;
   4480     __m128i v_mask;
   4481     bool haveSSE;
   4482 };
   4483 
   4484 #endif
   4485 
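         // Generic comparison kernel producing a 0/255 mask. CMP_GE and CMP_LT are rewritten as
         // CMP_LE/CMP_GT with swapped operands, leaving two code paths. In the scalar loops,
         // -(a > b) evaluates to 0 or -1 (0x00 or 0xFF), and XOR-ing with m (0 for GT/EQ, 255 for LE/NE)
         // flips the mask for the complementary operation.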
   4486 template<typename T> static void
   4487 cmp_(const T* src1, size_t step1, const T* src2, size_t step2,
   4488      uchar* dst, size_t step, Size size, int code)
   4489 {
   4490     step1 /= sizeof(src1[0]);
   4491     step2 /= sizeof(src2[0]);
   4492     if( code == CMP_GE || code == CMP_LT )
   4493     {
   4494         std::swap(src1, src2);
   4495         std::swap(step1, step2);
   4496         code = code == CMP_GE ? CMP_LE : CMP_GT;
   4497     }
   4498 
   4499     Cmp_SIMD<T> vop(code);
   4500 
   4501     if( code == CMP_GT || code == CMP_LE )
   4502     {
   4503         int m = code == CMP_GT ? 0 : 255;
   4504         for( ; size.height--; src1 += step1, src2 += step2, dst += step )
   4505         {
   4506             int x = vop(src1, src2, dst, size.width);
   4507             #if CV_ENABLE_UNROLLED
   4508             for( ; x <= size.width - 4; x += 4 )
   4509             {
   4510                 int t0, t1;
   4511                 t0 = -(src1[x] > src2[x]) ^ m;
   4512                 t1 = -(src1[x+1] > src2[x+1]) ^ m;
   4513                 dst[x] = (uchar)t0; dst[x+1] = (uchar)t1;
   4514                 t0 = -(src1[x+2] > src2[x+2]) ^ m;
   4515                 t1 = -(src1[x+3] > src2[x+3]) ^ m;
   4516                 dst[x+2] = (uchar)t0; dst[x+3] = (uchar)t1;
   4517             }
   4518             #endif
   4519             for( ; x < size.width; x++ )
   4520                 dst[x] = (uchar)(-(src1[x] > src2[x]) ^ m);
   4521         }
   4522     }
   4523     else if( code == CMP_EQ || code == CMP_NE )
   4524     {
   4525         int m = code == CMP_EQ ? 0 : 255;
   4526         for( ; size.height--; src1 += step1, src2 += step2, dst += step )
   4527         {
   4528             int x = 0;
   4529             #if CV_ENABLE_UNROLLED
   4530             for( ; x <= size.width - 4; x += 4 )
   4531             {
   4532                 int t0, t1;
   4533                 t0 = -(src1[x] == src2[x]) ^ m;
   4534                 t1 = -(src1[x+1] == src2[x+1]) ^ m;
   4535                 dst[x] = (uchar)t0; dst[x+1] = (uchar)t1;
   4536                 t0 = -(src1[x+2] == src2[x+2]) ^ m;
   4537                 t1 = -(src1[x+3] == src2[x+3]) ^ m;
   4538                 dst[x+2] = (uchar)t0; dst[x+3] = (uchar)t1;
   4539             }
   4540             #endif
   4541             for( ; x < size.width; x++ )
   4542                 dst[x] = (uchar)(-(src1[x] == src2[x]) ^ m);
   4543         }
   4544     }
   4545 }
   4546 
   4547 #if ARITHM_USE_IPP
   4548 inline static IppCmpOp convert_cmp(int _cmpop)
   4549 {
   4550     return _cmpop == CMP_EQ ? ippCmpEq :
   4551         _cmpop == CMP_GT ? ippCmpGreater :
   4552         _cmpop == CMP_GE ? ippCmpGreaterEq :
   4553         _cmpop == CMP_LT ? ippCmpLess :
   4554         _cmpop == CMP_LE ? ippCmpLessEq :
   4555         (IppCmpOp)-1;
   4556 }
   4557 #endif
   4558 
   4559 static void cmp8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2,
   4560                   uchar* dst, size_t step, Size size, void* _cmpop)
   4561 {
   4562 #if ARITHM_USE_IPP
   4563     CV_IPP_CHECK()
   4564     {
   4565         IppCmpOp op = convert_cmp(*(int *)_cmpop);
   4566         if( op  >= 0 )
   4567         {
   4568             fixSteps(size, sizeof(dst[0]), step1, step2, step);
   4569             if (0 <= ippiCompare_8u_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(size), op))
   4570             {
   4571                 CV_IMPL_ADD(CV_IMPL_IPP);
   4572                 return;
   4573             }
   4574             setIppErrorStatus();
   4575         }
   4576     }
   4577 #endif
    4578     // the generic cmp_(src1, step1, src2, step2, dst, step, size, *(int*)_cmpop) call is replaced below by hand-vectorized code
   4579     int code = *(int*)_cmpop;
   4580     step1 /= sizeof(src1[0]);
   4581     step2 /= sizeof(src2[0]);
   4582     if( code == CMP_GE || code == CMP_LT )
   4583     {
   4584         std::swap(src1, src2);
   4585         std::swap(step1, step2);
   4586         code = code == CMP_GE ? CMP_LE : CMP_GT;
   4587     }
   4588 
   4589     if( code == CMP_GT || code == CMP_LE )
   4590     {
   4591         int m = code == CMP_GT ? 0 : 255;
   4592         for( ; size.height--; src1 += step1, src2 += step2, dst += step )
   4593         {
    4594             int x = 0;
   4595             #if CV_SSE2
   4596             if( USE_SSE2 )
   4597             {
   4598                 __m128i m128 = code == CMP_GT ? _mm_setzero_si128() : _mm_set1_epi8 (-1);
   4599                 __m128i c128 = _mm_set1_epi8 (-128);
   4600                 for( ; x <= size.width - 16; x += 16 )
   4601                 {
   4602                     __m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x));
   4603                     __m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x));
    4604                     // SSE2 has no unsigned 8-bit compare, so bias both operands by 128 and compare as signed
   4605                     r00 = _mm_sub_epi8(r00,c128);
   4606                     r10 = _mm_sub_epi8(r10,c128);
   4607 
    4608                     r00 = _mm_xor_si128(_mm_cmpgt_epi8(r00, r10), m128);
    4609                     _mm_storeu_si128((__m128i*)(dst + x), r00);
   4610 
   4611                 }
   4612             }
   4613             #elif CV_NEON
   4614             uint8x16_t mask = code == CMP_GT ? vdupq_n_u8(0) : vdupq_n_u8(255);
   4615 
   4616             for( ; x <= size.width - 16; x += 16 )
   4617             {
   4618                 vst1q_u8(dst+x, veorq_u8(vcgtq_u8(vld1q_u8(src1+x), vld1q_u8(src2+x)), mask));
   4619             }
   4620 
   4621            #endif
   4622 
   4623             for( ; x < size.width; x++ ){
   4624                 dst[x] = (uchar)(-(src1[x] > src2[x]) ^ m);
   4625             }
   4626         }
   4627     }
   4628     else if( code == CMP_EQ || code == CMP_NE )
   4629     {
   4630         int m = code == CMP_EQ ? 0 : 255;
   4631         for( ; size.height--; src1 += step1, src2 += step2, dst += step )
   4632         {
   4633             int x = 0;
   4634             #if CV_SSE2
   4635             if( USE_SSE2 )
   4636             {
   4637                 __m128i m128 =  code == CMP_EQ ? _mm_setzero_si128() : _mm_set1_epi8 (-1);
   4638                 for( ; x <= size.width - 16; x += 16 )
   4639                 {
   4640                     __m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x));
   4641                     __m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x));
   4642                     r00 = _mm_xor_si128 ( _mm_cmpeq_epi8 (r00, r10), m128);
   4643                     _mm_storeu_si128((__m128i*)(dst + x), r00);
   4644                 }
   4645             }
   4646             #elif CV_NEON
   4647             uint8x16_t mask = code == CMP_EQ ? vdupq_n_u8(0) : vdupq_n_u8(255);
   4648 
   4649             for( ; x <= size.width - 16; x += 16 )
   4650             {
   4651                 vst1q_u8(dst+x, veorq_u8(vceqq_u8(vld1q_u8(src1+x), vld1q_u8(src2+x)), mask));
   4652             }
   4653            #endif
   4654            for( ; x < size.width; x++ )
   4655                 dst[x] = (uchar)(-(src1[x] == src2[x]) ^ m);
   4656         }
   4657     }
   4658 }
   4659 
   4660 static void cmp8s(const schar* src1, size_t step1, const schar* src2, size_t step2,
   4661                   uchar* dst, size_t step, Size size, void* _cmpop)
   4662 {
   4663     cmp_(src1, step1, src2, step2, dst, step, size, *(int*)_cmpop);
   4664 }
   4665 
   4666 static void cmp16u(const ushort* src1, size_t step1, const ushort* src2, size_t step2,
   4667                   uchar* dst, size_t step, Size size, void* _cmpop)
   4668 {
   4669 #if ARITHM_USE_IPP
   4670     CV_IPP_CHECK()
   4671     {
   4672         IppCmpOp op = convert_cmp(*(int *)_cmpop);
   4673         if( op  >= 0 )
   4674         {
   4675             fixSteps(size, sizeof(dst[0]), step1, step2, step);
   4676             if (0 <= ippiCompare_16u_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(size), op))
   4677             {
   4678                 CV_IMPL_ADD(CV_IMPL_IPP);
   4679                 return;
   4680             }
   4681             setIppErrorStatus();
   4682         }
   4683     }
   4684 #endif
   4685     cmp_(src1, step1, src2, step2, dst, step, size, *(int*)_cmpop);
   4686 }
   4687 
   4688 static void cmp16s(const short* src1, size_t step1, const short* src2, size_t step2,
   4689                   uchar* dst, size_t step, Size size, void* _cmpop)
   4690 {
   4691 #if ARITHM_USE_IPP
   4692     CV_IPP_CHECK()
   4693     {
   4694         IppCmpOp op = convert_cmp(*(int *)_cmpop);
    4695         if( op >= 0 )
   4696         {
   4697             fixSteps(size, sizeof(dst[0]), step1, step2, step);
   4698             if (0 <= ippiCompare_16s_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(size), op))
   4699             {
   4700                 CV_IMPL_ADD(CV_IMPL_IPP);
   4701                 return;
   4702             }
   4703             setIppErrorStatus();
   4704         }
   4705     }
   4706 #endif
    4707     // the generic cmp_(src1, step1, src2, step2, dst, step, size, *(int*)_cmpop) call is replaced below by hand-vectorized code
   4708 
   4709     int code = *(int*)_cmpop;
   4710     step1 /= sizeof(src1[0]);
   4711     step2 /= sizeof(src2[0]);
   4712     if( code == CMP_GE || code == CMP_LT )
   4713     {
   4714         std::swap(src1, src2);
   4715         std::swap(step1, step2);
   4716         code = code == CMP_GE ? CMP_LE : CMP_GT;
   4717     }
   4718 
   4719     if( code == CMP_GT || code == CMP_LE )
   4720     {
   4721         int m = code == CMP_GT ? 0 : 255;
   4722         for( ; size.height--; src1 += step1, src2 += step2, dst += step )
   4723         {
    4724             int x = 0;
    4725             #if CV_SSE2
    4726             if( USE_SSE2 )
   4727             {
   4728                 __m128i m128 =  code == CMP_GT ? _mm_setzero_si128() : _mm_set1_epi16 (-1);
   4729                 for( ; x <= size.width - 16; x += 16 )
   4730                 {
   4731                     __m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x));
   4732                     __m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x));
   4733                     r00 = _mm_xor_si128 ( _mm_cmpgt_epi16 (r00, r10), m128);
   4734                     __m128i r01 = _mm_loadu_si128((const __m128i*)(src1 + x + 8));
   4735                     __m128i r11 = _mm_loadu_si128((const __m128i*)(src2 + x + 8));
   4736                     r01 = _mm_xor_si128 ( _mm_cmpgt_epi16 (r01, r11), m128);
   4737                     r11 = _mm_packs_epi16(r00, r01);
   4738                     _mm_storeu_si128((__m128i*)(dst + x), r11);
   4739                 }
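                         // handle one remaining block of 8 shorts, if any, before the scalar tail loop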
   4740                 if( x <= size.width-8)
   4741                 {
   4742                     __m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x));
   4743                     __m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x));
   4744                     r00 = _mm_xor_si128 ( _mm_cmpgt_epi16 (r00, r10), m128);
   4745                     r10 = _mm_packs_epi16(r00, r00);
   4746                     _mm_storel_epi64((__m128i*)(dst + x), r10);
   4747 
   4748                     x += 8;
   4749                 }
   4750             }
   4751             #elif CV_NEON
   4752             uint8x16_t mask = code == CMP_GT ? vdupq_n_u8(0) : vdupq_n_u8(255);
   4753 
   4754             for( ; x <= size.width - 16; x += 16 )
   4755             {
   4756                 int16x8_t in1 = vld1q_s16(src1 + x);
   4757                 int16x8_t in2 = vld1q_s16(src2 + x);
   4758                 uint8x8_t t1 = vmovn_u16(vcgtq_s16(in1, in2));
   4759 
   4760                 in1 = vld1q_s16(src1 + x + 8);
   4761                 in2 = vld1q_s16(src2 + x + 8);
   4762                 uint8x8_t t2 = vmovn_u16(vcgtq_s16(in1, in2));
   4763 
   4764                 vst1q_u8(dst+x, veorq_u8(vcombine_u8(t1, t2), mask));
   4765             }
   4766             #endif
   4767 
   4768             for( ; x < size.width; x++ ){
   4769                  dst[x] = (uchar)(-(src1[x] > src2[x]) ^ m);
   4770             }
   4771         }
   4772     }
   4773     else if( code == CMP_EQ || code == CMP_NE )
   4774     {
   4775         int m = code == CMP_EQ ? 0 : 255;
   4776         for( ; size.height--; src1 += step1, src2 += step2, dst += step )
   4777         {
   4778             int x = 0;
   4779             #if CV_SSE2
   4780             if( USE_SSE2 )
   4781             {
   4782                 __m128i m128 =  code == CMP_EQ ? _mm_setzero_si128() : _mm_set1_epi16 (-1);
   4783                 for( ; x <= size.width - 16; x += 16 )
   4784                 {
   4785                     __m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x));
   4786                     __m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x));
   4787                     r00 = _mm_xor_si128 ( _mm_cmpeq_epi16 (r00, r10), m128);
   4788                     __m128i r01 = _mm_loadu_si128((const __m128i*)(src1 + x + 8));
   4789                     __m128i r11 = _mm_loadu_si128((const __m128i*)(src2 + x + 8));
   4790                     r01 = _mm_xor_si128 ( _mm_cmpeq_epi16 (r01, r11), m128);
   4791                     r11 = _mm_packs_epi16(r00, r01);
   4792                     _mm_storeu_si128((__m128i*)(dst + x), r11);
   4793                 }
   4794                 if( x <= size.width - 8)
   4795                 {
   4796                     __m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x));
   4797                     __m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x));
   4798                     r00 = _mm_xor_si128 ( _mm_cmpeq_epi16 (r00, r10), m128);
   4799                     r10 = _mm_packs_epi16(r00, r00);
   4800                     _mm_storel_epi64((__m128i*)(dst + x), r10);
   4801 
   4802                     x += 8;
   4803                 }
   4804             }
   4805             #elif CV_NEON
   4806             uint8x16_t mask = code == CMP_EQ ? vdupq_n_u8(0) : vdupq_n_u8(255);
   4807 
   4808             for( ; x <= size.width - 16; x += 16 )
   4809             {
   4810                 int16x8_t in1 = vld1q_s16(src1 + x);
   4811                 int16x8_t in2 = vld1q_s16(src2 + x);
   4812                 uint8x8_t t1 = vmovn_u16(vceqq_s16(in1, in2));
   4813 
   4814                 in1 = vld1q_s16(src1 + x + 8);
   4815                 in2 = vld1q_s16(src2 + x + 8);
   4816                 uint8x8_t t2 = vmovn_u16(vceqq_s16(in1, in2));
   4817 
   4818                 vst1q_u8(dst+x, veorq_u8(vcombine_u8(t1, t2), mask));
   4819             }
   4820             #endif
   4821             for( ; x < size.width; x++ )
   4822                 dst[x] = (uchar)(-(src1[x] == src2[x]) ^ m);
   4823         }
   4824     }
   4825 }
   4826 
   4827 static void cmp32s(const int* src1, size_t step1, const int* src2, size_t step2,
   4828                    uchar* dst, size_t step, Size size, void* _cmpop)
   4829 {
   4830     cmp_(src1, step1, src2, step2, dst, step, size, *(int*)_cmpop);
   4831 }
   4832 
   4833 static void cmp32f(const float* src1, size_t step1, const float* src2, size_t step2,
   4834                   uchar* dst, size_t step, Size size, void* _cmpop)
   4835 {
   4836 #if ARITHM_USE_IPP
   4837     CV_IPP_CHECK()
   4838     {
   4839         IppCmpOp op = convert_cmp(*(int *)_cmpop);
   4840         if( op  >= 0 )
   4841         {
   4842             fixSteps(size, sizeof(dst[0]), step1, step2, step);
   4843             if (0 <= ippiCompare_32f_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(size), op))
   4844             {
   4845                 CV_IMPL_ADD(CV_IMPL_IPP);
   4846                 return;
   4847             }
   4848             setIppErrorStatus();
   4849         }
   4850     }
   4851 #endif
   4852     cmp_(src1, step1, src2, step2, dst, step, size, *(int*)_cmpop);
   4853 }
   4854 
   4855 static void cmp64f(const double* src1, size_t step1, const double* src2, size_t step2,
   4856                   uchar* dst, size_t step, Size size, void* _cmpop)
   4857 {
   4858     cmp_(src1, step1, src2, step2, dst, step, size, *(int*)_cmpop);
   4859 }
   4860 
   4861 static BinaryFunc getCmpFunc(int depth)
   4862 {
   4863     static BinaryFunc cmpTab[] =
   4864     {
   4865         (BinaryFunc)GET_OPTIMIZED(cmp8u), (BinaryFunc)GET_OPTIMIZED(cmp8s),
   4866         (BinaryFunc)GET_OPTIMIZED(cmp16u), (BinaryFunc)GET_OPTIMIZED(cmp16s),
   4867         (BinaryFunc)GET_OPTIMIZED(cmp32s),
   4868         (BinaryFunc)GET_OPTIMIZED(cmp32f), (BinaryFunc)cmp64f,
   4869         0
   4870     };
   4871 
   4872     return cmpTab[depth];
   4873 }
   4874 
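         // Smallest/largest representable value for each depth (indexed CV_8U..CV_64F); compare() uses them
         // to short-circuit comparisons against a scalar that falls outside the range of the array's type.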
   4875 static double getMinVal(int depth)
   4876 {
   4877     static const double tab[] = {0, -128, 0, -32768, INT_MIN, -FLT_MAX, -DBL_MAX, 0};
   4878     return tab[depth];
   4879 }
   4880 
   4881 static double getMaxVal(int depth)
   4882 {
   4883     static const double tab[] = {255, 127, 65535, 32767, INT_MAX, FLT_MAX, DBL_MAX, 0};
   4884     return tab[depth];
   4885 }
   4886 
   4887 #ifdef HAVE_OPENCL
   4888 
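         // OpenCL path for compare(): builds the generic "KF" arithmetic kernel with OP_CMP and a textual
         // CMP_OPERATOR, producing a CV_8U mask. A scalar operand is clipped and rounded to the source depth
         // first, so an out-of-range scalar degenerates into a plain setTo() on the destination.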
   4889 static bool ocl_compare(InputArray _src1, InputArray _src2, OutputArray _dst, int op, bool haveScalar)
   4890 {
   4891     const ocl::Device& dev = ocl::Device::getDefault();
   4892     bool doubleSupport = dev.doubleFPConfig() > 0;
   4893     int type1 = _src1.type(), depth1 = CV_MAT_DEPTH(type1), cn = CV_MAT_CN(type1),
   4894             type2 = _src2.type(), depth2 = CV_MAT_DEPTH(type2);
   4895 
   4896     if (!doubleSupport && depth1 == CV_64F)
   4897         return false;
   4898 
   4899     if (!haveScalar && (!_src1.sameSize(_src2) || type1 != type2))
   4900             return false;
   4901 
   4902     int kercn = haveScalar ? cn : ocl::predictOptimalVectorWidth(_src1, _src2, _dst), rowsPerWI = dev.isIntel() ? 4 : 1;
   4903     // Workaround for bug with "?:" operator in AMD OpenCL compiler
   4904     if (depth1 >= CV_16U)
   4905         kercn = 1;
   4906 
   4907     int scalarcn = kercn == 3 ? 4 : kercn;
   4908     const char * const operationMap[] = { "==", ">", ">=", "<", "<=", "!=" };
   4909     char cvt[40];
   4910 
   4911     String opts = format("-D %s -D srcT1=%s -D dstT=%s -D workT=srcT1 -D cn=%d"
   4912                          " -D convertToDT=%s -D OP_CMP -D CMP_OPERATOR=%s -D srcT1_C1=%s"
   4913                          " -D srcT2_C1=%s -D dstT_C1=%s -D workST=%s -D rowsPerWI=%d%s",
   4914                          haveScalar ? "UNARY_OP" : "BINARY_OP",
   4915                          ocl::typeToStr(CV_MAKE_TYPE(depth1, kercn)),
   4916                          ocl::typeToStr(CV_8UC(kercn)), kercn,
   4917                          ocl::convertTypeStr(depth1, CV_8U, kercn, cvt),
   4918                          operationMap[op], ocl::typeToStr(depth1),
   4919                          ocl::typeToStr(depth1), ocl::typeToStr(CV_8U),
   4920                          ocl::typeToStr(CV_MAKE_TYPE(depth1, scalarcn)), rowsPerWI,
   4921                          doubleSupport ? " -D DOUBLE_SUPPORT" : "");
   4922 
   4923     ocl::Kernel k("KF", ocl::core::arithm_oclsrc, opts);
   4924     if (k.empty())
   4925         return false;
   4926 
   4927     UMat src1 = _src1.getUMat();
   4928     Size size = src1.size();
   4929     _dst.create(size, CV_8UC(cn));
   4930     UMat dst = _dst.getUMat();
   4931 
   4932     if (haveScalar)
   4933     {
   4934         size_t esz = CV_ELEM_SIZE1(type1) * scalarcn;
   4935         double buf[4] = { 0, 0, 0, 0 };
   4936         Mat src2 = _src2.getMat();
   4937 
   4938         if( depth1 > CV_32S )
   4939             convertAndUnrollScalar( src2, depth1, (uchar *)buf, kercn );
   4940         else
   4941         {
   4942             double fval = 0;
   4943             getConvertFunc(depth2, CV_64F)(src2.ptr(), 1, 0, 1, (uchar *)&fval, 1, Size(1, 1), 0);
   4944             if( fval < getMinVal(depth1) )
   4945                 return dst.setTo(Scalar::all(op == CMP_GT || op == CMP_GE || op == CMP_NE ? 255 : 0)), true;
   4946 
   4947             if( fval > getMaxVal(depth1) )
   4948                 return dst.setTo(Scalar::all(op == CMP_LT || op == CMP_LE || op == CMP_NE ? 255 : 0)), true;
   4949 
   4950             int ival = cvRound(fval);
   4951             if( fval != ival )
   4952             {
   4953                 if( op == CMP_LT || op == CMP_GE )
   4954                     ival = cvCeil(fval);
   4955                 else if( op == CMP_LE || op == CMP_GT )
   4956                     ival = cvFloor(fval);
   4957                 else
   4958                     return dst.setTo(Scalar::all(op == CMP_NE ? 255 : 0)), true;
   4959             }
   4960             convertAndUnrollScalar(Mat(1, 1, CV_32S, &ival), depth1, (uchar *)buf, kercn);
   4961         }
   4962 
   4963         ocl::KernelArg scalararg = ocl::KernelArg(0, 0, 0, 0, buf, esz);
   4964 
   4965         k.args(ocl::KernelArg::ReadOnlyNoSize(src1, cn, kercn),
   4966                ocl::KernelArg::WriteOnly(dst, cn, kercn), scalararg);
   4967     }
   4968     else
   4969     {
   4970         UMat src2 = _src2.getUMat();
   4971 
   4972         k.args(ocl::KernelArg::ReadOnlyNoSize(src1),
   4973                ocl::KernelArg::ReadOnlyNoSize(src2),
   4974                ocl::KernelArg::WriteOnly(dst, cn, kercn));
   4975     }
   4976 
   4977     size_t globalsize[2] = { dst.cols * cn / kercn, (dst.rows + rowsPerWI - 1) / rowsPerWI };
   4978     return k.run(2, globalsize, NULL, false);
   4979 }
   4980 
   4981 #endif
   4982 
   4983 }
   4984 
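         // Per-element comparison producing a CV_8U mask (255 where the relation holds, 0 otherwise).
         // A minimal usage sketch (a and b here are hypothetical cv::Mat inputs of the same size and type):
         //     Mat mask;
         //     cv::compare(a, b, mask, CMP_GT);   // mask = 255 wherever a > b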
   4985 void cv::compare(InputArray _src1, InputArray _src2, OutputArray _dst, int op)
   4986 {
   4987     CV_Assert( op == CMP_LT || op == CMP_LE || op == CMP_EQ ||
   4988                op == CMP_NE || op == CMP_GE || op == CMP_GT );
   4989 
   4990     bool haveScalar = false;
   4991 
   4992     if ((_src1.isMatx() + _src2.isMatx()) == 1
   4993             || !_src1.sameSize(_src2)
   4994             || _src1.type() != _src2.type())
   4995     {
   4996         if (checkScalar(_src1, _src2.type(), _src1.kind(), _src2.kind()))
   4997         {
   4998             op = op == CMP_LT ? CMP_GT : op == CMP_LE ? CMP_GE :
   4999                 op == CMP_GE ? CMP_LE : op == CMP_GT ? CMP_LT : op;
   5000             // src1 is a scalar; swap it with src2
   5001             compare(_src2, _src1, _dst, op);
   5002             return;
   5003         }
   5004         else if( !checkScalar(_src2, _src1.type(), _src2.kind(), _src1.kind()) )
   5005             CV_Error( CV_StsUnmatchedSizes,
   5006                      "The operation is neither 'array op array' (where arrays have the same size and the same type), "
   5007                      "nor 'array op scalar', nor 'scalar op array'" );
   5008         haveScalar = true;
   5009     }
   5010 
   5011     CV_OCL_RUN(_src1.dims() <= 2 && _src2.dims() <= 2 && OCL_PERFORMANCE_CHECK(_dst.isUMat()),
   5012                ocl_compare(_src1, _src2, _dst, op, haveScalar))
   5013 
   5014     int kind1 = _src1.kind(), kind2 = _src2.kind();
   5015     Mat src1 = _src1.getMat(), src2 = _src2.getMat();
   5016 
   5017     if( kind1 == kind2 && src1.dims <= 2 && src2.dims <= 2 && src1.size() == src2.size() && src1.type() == src2.type() )
   5018     {
   5019         int cn = src1.channels();
   5020         _dst.create(src1.size(), CV_8UC(cn));
   5021         Mat dst = _dst.getMat();
   5022         Size sz = getContinuousSize(src1, src2, dst, src1.channels());
   5023         getCmpFunc(src1.depth())(src1.ptr(), src1.step, src2.ptr(), src2.step, dst.ptr(), dst.step, sz, &op);
   5024         return;
   5025     }
   5026 
   5027     int cn = src1.channels(), depth1 = src1.depth(), depth2 = src2.depth();
   5028 
   5029     _dst.create(src1.dims, src1.size, CV_8UC(cn));
   5030     src1 = src1.reshape(1); src2 = src2.reshape(1);
   5031     Mat dst = _dst.getMat().reshape(1);
   5032 
   5033     size_t esz = src1.elemSize();
   5034     size_t blocksize0 = (size_t)(BLOCK_SIZE + esz-1)/esz;
   5035     BinaryFunc func = getCmpFunc(depth1);
   5036 
   5037     if( !haveScalar )
   5038     {
   5039         const Mat* arrays[] = { &src1, &src2, &dst, 0 };
   5040         uchar* ptrs[3];
   5041 
   5042         NAryMatIterator it(arrays, ptrs);
   5043         size_t total = it.size;
   5044 
   5045         for( size_t i = 0; i < it.nplanes; i++, ++it )
   5046             func( ptrs[0], 0, ptrs[1], 0, ptrs[2], 0, Size((int)total, 1), &op );
   5047     }
   5048     else
   5049     {
   5050         const Mat* arrays[] = { &src1, &dst, 0 };
   5051         uchar* ptrs[2];
   5052 
   5053         NAryMatIterator it(arrays, ptrs);
   5054         size_t total = it.size, blocksize = std::min(total, blocksize0);
   5055 
   5056         AutoBuffer<uchar> _buf(blocksize*esz);
   5057         uchar *buf = _buf;
   5058 
   5059         if( depth1 > CV_32S )
   5060             convertAndUnrollScalar( src2, depth1, buf, blocksize );
   5061         else
   5062         {
   5063             double fval=0;
   5064             getConvertFunc(depth2, CV_64F)(src2.ptr(), 1, 0, 1, (uchar*)&fval, 1, Size(1,1), 0);
   5065             if( fval < getMinVal(depth1) )
   5066             {
   5067                 dst = Scalar::all(op == CMP_GT || op == CMP_GE || op == CMP_NE ? 255 : 0);
   5068                 return;
   5069             }
   5070 
   5071             if( fval > getMaxVal(depth1) )
   5072             {
   5073                 dst = Scalar::all(op == CMP_LT || op == CMP_LE || op == CMP_NE ? 255 : 0);
   5074                 return;
   5075             }
   5076 
   5077             int ival = cvRound(fval);
   5078             if( fval != ival )
   5079             {
   5080                 if( op == CMP_LT || op == CMP_GE )
   5081                     ival = cvCeil(fval);
   5082                 else if( op == CMP_LE || op == CMP_GT )
   5083                     ival = cvFloor(fval);
   5084                 else
   5085                 {
   5086                     dst = Scalar::all(op == CMP_NE ? 255 : 0);
   5087                     return;
   5088                 }
   5089             }
   5090             convertAndUnrollScalar(Mat(1, 1, CV_32S, &ival), depth1, buf, blocksize);
   5091         }
   5092 
   5093         for( size_t i = 0; i < it.nplanes; i++, ++it )
   5094         {
   5095             for( size_t j = 0; j < total; j += blocksize )
   5096             {
   5097                 int bsz = (int)MIN(total - j, blocksize);
   5098                 func( ptrs[0], 0, buf, 0, ptrs[1], 0, Size(bsz, 1), &op);
   5099                 ptrs[0] += bsz*esz;
   5100                 ptrs[1] += bsz;
   5101             }
   5102         }
   5103     }
   5104 }
   5105 
   5106 /****************************************************************************************\
   5107 *                                        inRange                                         *
   5108 \****************************************************************************************/
   5109 
   5110 namespace cv
   5111 {
   5112 
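         // SIMD helper for inRange(): writes 255 where src2[x] <= src1[x] <= src3[x] and 0 elsewhere,
         // and returns how many elements were processed; this default version vectorizes nothing.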
   5113 template <typename T>
   5114 struct InRange_SIMD
   5115 {
   5116     int operator () (const T *, const T *, const T *, uchar *, int) const
   5117     {
   5118         return 0;
   5119     }
   5120 };
   5121 
   5122 #if CV_SSE2
   5123 
   5124 template <>
   5125 struct InRange_SIMD<uchar>
   5126 {
   5127     int operator () (const uchar * src1, const uchar * src2, const uchar * src3,
   5128                      uchar * dst, int len) const
   5129     {
   5130         int x = 0;
   5131 
   5132         if (USE_SSE2)
   5133         {
   5134             __m128i v_full = _mm_set1_epi8(-1), v_128 = _mm_set1_epi8(-128);
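                     // SSE2 has no unsigned 8-bit compare, so both operands are biased by 128 and compared
                     // as signed; the two out-of-range masks are OR-ed and inverted with andnot to get the result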
   5135 
   5136             for ( ; x <= len - 16; x += 16 )
   5137             {
   5138                 __m128i v_src = _mm_add_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)), v_128);
   5139                 __m128i v_mask1 = _mm_cmpgt_epi8(_mm_add_epi8(_mm_loadu_si128((const __m128i *)(src2 + x)), v_128), v_src);
   5140                 __m128i v_mask2 = _mm_cmpgt_epi8(v_src, _mm_add_epi8(_mm_loadu_si128((const __m128i *)(src3 + x)), v_128));
   5141                 _mm_storeu_si128((__m128i *)(dst + x), _mm_andnot_si128(_mm_or_si128(v_mask1, v_mask2), v_full));
   5142             }
   5143         }
   5144 
   5145         return x;
   5146     }
   5147 };
   5148 
   5149 template <>
   5150 struct InRange_SIMD<schar>
   5151 {
   5152     int operator () (const schar * src1, const schar * src2, const schar * src3,
   5153                      uchar * dst, int len) const
   5154     {
   5155         int x = 0;
   5156 
   5157         if (USE_SSE2)
   5158         {
   5159             __m128i v_full = _mm_set1_epi8(-1);
   5160 
   5161             for ( ; x <= len - 16; x += 16 )
   5162             {
   5163                 __m128i v_src = _mm_loadu_si128((const __m128i *)(src1 + x));
   5164                 __m128i v_mask1 = _mm_cmpgt_epi8(_mm_loadu_si128((const __m128i *)(src2 + x)), v_src);
   5165                 __m128i v_mask2 = _mm_cmpgt_epi8(v_src, _mm_loadu_si128((const __m128i *)(src3 + x)));
   5166                 _mm_storeu_si128((__m128i *)(dst + x), _mm_andnot_si128(_mm_or_si128(v_mask1, v_mask2), v_full));
   5167             }
   5168         }
   5169 
   5170         return x;
   5171     }
   5172 };
   5173 
   5174 template <>
   5175 struct InRange_SIMD<ushort>
   5176 {
   5177     int operator () (const ushort * src1, const ushort * src2, const ushort * src3,
   5178                      uchar * dst, int len) const
   5179     {
   5180         int x = 0;
   5181 
   5182         if (USE_SSE2)
   5183         {
   5184             __m128i v_zero = _mm_setzero_si128(), v_full = _mm_set1_epi16(-1), v_32768 = _mm_set1_epi16(-32768);
   5185 
   5186             for ( ; x <= len - 8; x += 8 )
   5187             {
   5188                 __m128i v_src = _mm_add_epi16(_mm_loadu_si128((const __m128i *)(src1 + x)), v_32768);
   5189                 __m128i v_mask1 = _mm_cmpgt_epi16(_mm_add_epi16(_mm_loadu_si128((const __m128i *)(src2 + x)), v_32768), v_src);
   5190                 __m128i v_mask2 = _mm_cmpgt_epi16(v_src, _mm_add_epi16(_mm_loadu_si128((const __m128i *)(src3 + x)), v_32768));
   5191                 __m128i v_res = _mm_andnot_si128(_mm_or_si128(v_mask1, v_mask2), v_full);
   5192                 _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(_mm_srli_epi16(v_res, 8), v_zero));
   5193             }
   5194         }
   5195 
   5196         return x;
   5197     }
   5198 };
   5199 
   5200 template <>
   5201 struct InRange_SIMD<short>
   5202 {
   5203     int operator () (const short * src1, const short * src2, const short * src3,
   5204                      uchar * dst, int len) const
   5205     {
   5206         int x = 0;
   5207 
   5208         if (USE_SSE2)
   5209         {
   5210             __m128i v_zero = _mm_setzero_si128(), v_full = _mm_set1_epi16(-1);
   5211 
   5212             for ( ; x <= len - 8; x += 8 )
   5213             {
   5214                 __m128i v_src = _mm_loadu_si128((const __m128i *)(src1 + x));
   5215                 __m128i v_mask1 = _mm_cmpgt_epi16(_mm_loadu_si128((const __m128i *)(src2 + x)), v_src);
   5216                 __m128i v_mask2 = _mm_cmpgt_epi16(v_src, _mm_loadu_si128((const __m128i *)(src3 + x)));
   5217                 __m128i v_res = _mm_andnot_si128(_mm_or_si128(v_mask1, v_mask2), v_full);
   5218                 _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(_mm_srli_epi16(v_res, 8), v_zero));
   5219             }
   5220         }
   5221 
   5222         return x;
   5223     }
   5224 };
   5225 
   5226 template <>
   5227 struct InRange_SIMD<int>
   5228 {
   5229     int operator () (const int * src1, const int * src2, const int * src3,
   5230                      uchar * dst, int len) const
   5231     {
   5232         int x = 0;
   5233 
   5234         if (USE_SSE2)
   5235         {
   5236             __m128i v_zero = _mm_setzero_si128(), v_full = _mm_set1_epi32(-1);
   5237 
   5238             for ( ; x <= len - 8; x += 8 )
   5239             {
   5240                 __m128i v_src = _mm_loadu_si128((const __m128i *)(src1 + x));
   5241                 __m128i v_res1 = _mm_or_si128(_mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src2 + x)), v_src),
   5242                     _mm_cmpgt_epi32(v_src, _mm_loadu_si128((const __m128i *)(src3 + x))));
   5243 
   5244                 v_src = _mm_loadu_si128((const __m128i *)(src1 + x + 4));
   5245                 __m128i v_res2 = _mm_or_si128(_mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src2 + x + 4)), v_src),
   5246                     _mm_cmpgt_epi32(v_src, _mm_loadu_si128((const __m128i *)(src3 + x + 4))));
   5247 
   5248                 __m128i v_res = _mm_packs_epi32(_mm_srli_epi32(_mm_andnot_si128(v_res1, v_full), 16),
   5249                                                 _mm_srli_epi32(_mm_andnot_si128(v_res2, v_full), 16));
   5250                 _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_res, v_zero));
   5251             }
   5252         }
   5253 
   5254         return x;
   5255     }
   5256 };
   5257 
   5258 template <>
   5259 struct InRange_SIMD<float>
   5260 {
   5261     int operator () (const float * src1, const float * src2, const float * src3,
   5262                      uchar * dst, int len) const
   5263     {
   5264         int x = 0;
   5265 
   5266         if (USE_SSE2)
   5267         {
   5268             __m128i v_zero = _mm_setzero_si128();
   5269 
   5270             for ( ; x <= len - 8; x += 8 )
   5271             {
   5272                 __m128 v_src = _mm_loadu_ps(src1 + x);
   5273                 __m128 v_res1 = _mm_and_ps(_mm_cmple_ps(_mm_loadu_ps(src2 + x), v_src),
   5274                     _mm_cmple_ps(v_src, _mm_loadu_ps(src3 + x)));
   5275 
   5276                 v_src = _mm_loadu_ps(src1 + x + 4);
   5277                 __m128 v_res2 = _mm_and_ps(_mm_cmple_ps(_mm_loadu_ps(src2 + x + 4), v_src),
   5278                     _mm_cmple_ps(v_src, _mm_loadu_ps(src3 + x + 4)));
   5279 
   5280                 __m128i v_res1i = _mm_cvtps_epi32(v_res1), v_res2i = _mm_cvtps_epi32(v_res2);
   5281                 __m128i v_res = _mm_packs_epi32(_mm_srli_epi32(v_res1i, 16), _mm_srli_epi32(v_res2i, 16));
   5282                 _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_res, v_zero));
   5283             }
   5284         }
   5285 
   5286         return x;
   5287     }
   5288 };
   5289 
   5290 #elif CV_NEON
   5291 
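         // NEON specializations: each lane is compared against both bounds with the
         // vcgeq_* intrinsics, the two masks are AND-ed together, and for the 16- and
         // 32-bit element types the result is narrowed with vmovn_* before the 8-bit
         // mask (0xFF inside the range, 0x00 outside) is stored to dst.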
   5292 template <>
   5293 struct InRange_SIMD<uchar>
   5294 {
   5295     int operator () (const uchar * src1, const uchar * src2, const uchar * src3,
   5296                      uchar * dst, int len) const
   5297     {
   5298         int x = 0;
   5299 
   5300         for ( ; x <= len - 16; x += 16 )
   5301         {
   5302             uint8x16_t values = vld1q_u8(src1 + x);
   5303             uint8x16_t low = vld1q_u8(src2 + x);
   5304             uint8x16_t high = vld1q_u8(src3 + x);
   5305 
   5306             vst1q_u8(dst + x, vandq_u8(vcgeq_u8(values, low), vcgeq_u8(high, values)));
   5307         }
   5308         return x;
   5309     }
   5310 };
   5311 
   5312 template <>
   5313 struct InRange_SIMD<schar>
   5314 {
   5315     int operator () (const schar * src1, const schar * src2, const schar * src3,
   5316                      uchar * dst, int len) const
   5317     {
   5318         int x = 0;
   5319 
   5320         for ( ; x <= len - 16; x += 16 )
   5321         {
   5322             int8x16_t values = vld1q_s8(src1 + x);
   5323             int8x16_t low = vld1q_s8(src2 + x);
   5324             int8x16_t high = vld1q_s8(src3 + x);
   5325 
   5326             vst1q_u8(dst + x, vandq_u8(vcgeq_s8(values, low), vcgeq_s8(high, values)));
   5327         }
   5328         return x;
   5329     }
   5330 };
   5331 
   5332 template <>
   5333 struct InRange_SIMD<ushort>
   5334 {
   5335     int operator () (const ushort * src1, const ushort * src2, const ushort * src3,
   5336                      uchar * dst, int len) const
   5337     {
   5338         int x = 0;
   5339 
   5340         for ( ; x <= len - 16; x += 16 )
   5341         {
   5342             uint16x8_t values = vld1q_u16((const uint16_t*)(src1 + x));
   5343             uint16x8_t low = vld1q_u16((const uint16_t*)(src2 + x));
   5344             uint16x8_t high = vld1q_u16((const uint16_t*)(src3 + x));
   5345             uint8x8_t  r1 = vmovn_u16(vandq_u16(vcgeq_u16(values, low), vcgeq_u16(high, values)));
   5346 
   5347             values = vld1q_u16((const uint16_t*)(src1 + x + 8));
   5348             low = vld1q_u16((const uint16_t*)(src2 + x + 8));
   5349             high = vld1q_u16((const uint16_t*)(src3 + x + 8));
   5350             uint8x8_t  r2 = vmovn_u16(vandq_u16(vcgeq_u16(values, low), vcgeq_u16(high, values)));
   5351 
   5352             vst1q_u8(dst + x, vcombine_u8(r1, r2));
   5353         }
   5354         return x;
   5355     }
   5356 };
   5357 
   5358 template <>
   5359 struct InRange_SIMD<short>
   5360 {
   5361     int operator () (const short * src1, const short * src2, const short * src3,
   5362                      uchar * dst, int len) const
   5363     {
   5364         int x = 0;
   5365 
   5366         for ( ; x <= len - 16; x += 16 )
   5367         {
   5368             int16x8_t values = vld1q_s16((const int16_t*)(src1 + x));
   5369             int16x8_t low = vld1q_s16((const int16_t*)(src2 + x));
   5370             int16x8_t high = vld1q_s16((const int16_t*)(src3 + x));
   5371             uint8x8_t r1 = vmovn_u16(vandq_u16(vcgeq_s16(values, low), vcgeq_s16(high, values)));
   5372 
   5373             values = vld1q_s16((const int16_t*)(src1 + x + 8));
   5374             low = vld1q_s16((const int16_t*)(src2 + x + 8));
   5375             high = vld1q_s16((const int16_t*)(src3 + x + 8));
   5376             uint8x8_t r2 = vmovn_u16(vandq_u16(vcgeq_s16(values, low), vcgeq_s16(high, values)));
   5377 
   5378             vst1q_u8(dst + x, vcombine_u8(r1, r2));
   5379         }
   5380         return x;
   5381     }
   5382 };
   5383 
   5384 template <>
   5385 struct InRange_SIMD<int>
   5386 {
   5387     int operator () (const int * src1, const int * src2, const int * src3,
   5388                      uchar * dst, int len) const
   5389     {
   5390         int x = 0;
   5391 
   5392         for ( ; x <= len - 8; x += 8 )
   5393         {
   5394             int32x4_t values = vld1q_s32((const int32_t*)(src1 + x));
   5395             int32x4_t low = vld1q_s32((const int32_t*)(src2 + x));
   5396             int32x4_t high = vld1q_s32((const int32_t*)(src3 + x));
   5397 
   5398             uint16x4_t r1 = vmovn_u32(vandq_u32(vcgeq_s32(values, low), vcgeq_s32(high, values)));
   5399 
   5400             values = vld1q_s32((const int32_t*)(src1 + x + 4));
   5401             low = vld1q_s32((const int32_t*)(src2 + x + 4));
   5402             high = vld1q_s32((const int32_t*)(src3 + x + 4));
   5403 
   5404             uint16x4_t r2 = vmovn_u32(vandq_u32(vcgeq_s32(values, low), vcgeq_s32(high, values)));
   5405 
   5406             uint16x8_t res_16 = vcombine_u16(r1, r2);
   5407 
   5408             vst1_u8(dst + x, vmovn_u16(res_16));
   5409         }
   5410         return x;
   5411     }
   5412 };
   5413 
   5414 template <>
   5415 struct InRange_SIMD<float>
   5416 {
   5417     int operator () (const float * src1, const float * src2, const float * src3,
   5418                      uchar * dst, int len) const
   5419     {
   5420         int x = 0;
   5421 
   5422         for ( ; x <= len - 8; x += 8 )
   5423         {
   5424             float32x4_t values = vld1q_f32((const float32_t*)(src1 + x));
   5425             float32x4_t low = vld1q_f32((const float32_t*)(src2 + x));
   5426             float32x4_t high = vld1q_f32((const float32_t*)(src3 + x));
   5427 
   5428             uint16x4_t r1 = vmovn_u32(vandq_u32(vcgeq_f32(values, low), vcgeq_f32(high, values)));
   5429 
   5430             values = vld1q_f32((const float32_t*)(src1 + x + 4));
   5431             low = vld1q_f32((const float32_t*)(src2 + x + 4));
   5432             high = vld1q_f32((const float32_t*)(src3 + x + 4));
   5433 
   5434             uint16x4_t r2 = vmovn_u32(vandq_u32(vcgeq_f32(values, low), vcgeq_f32(high, values)));
   5435 
   5436             uint16x8_t res_16 = vcombine_u16(r1, r2);
   5437 
   5438             vst1_u8(dst + x, vmovn_u16(res_16));
   5439         }
   5440         return x;
   5441     }
   5442 };
   5443 
   5444 #endif
   5445 
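         // Generic per-row kernel: dst[x] = 0xFF if src2[x] <= src1[x] <= src3[x],
         // 0x00 otherwise. The SIMD functor above handles the bulk of each row and
         // returns how many elements it processed; the (optionally unrolled) scalar
         // loops below finish the remaining tail.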
   5446 template <typename T>
   5447 static void inRange_(const T* src1, size_t step1, const T* src2, size_t step2,
   5448          const T* src3, size_t step3, uchar* dst, size_t step,
   5449          Size size)
   5450 {
   5451     step1 /= sizeof(src1[0]);
   5452     step2 /= sizeof(src2[0]);
   5453     step3 /= sizeof(src3[0]);
   5454 
   5455     InRange_SIMD<T> vop;
   5456 
   5457     for( ; size.height--; src1 += step1, src2 += step2, src3 += step3, dst += step )
   5458     {
   5459         int x = vop(src1, src2, src3, dst, size.width);
   5460         #if CV_ENABLE_UNROLLED
   5461         for( ; x <= size.width - 4; x += 4 )
   5462         {
   5463             int t0, t1;
   5464             t0 = src2[x] <= src1[x] && src1[x] <= src3[x];
   5465             t1 = src2[x+1] <= src1[x+1] && src1[x+1] <= src3[x+1];
   5466             dst[x] = (uchar)-t0; dst[x+1] = (uchar)-t1;
   5467             t0 = src2[x+2] <= src1[x+2] && src1[x+2] <= src3[x+2];
   5468             t1 = src2[x+3] <= src1[x+3] && src1[x+3] <= src3[x+3];
   5469             dst[x+2] = (uchar)-t0; dst[x+3] = (uchar)-t1;
   5470         }
   5471         #endif
   5472         for( ; x < size.width; x++ )
   5473             dst[x] = (uchar)-(src2[x] <= src1[x] && src1[x] <= src3[x]);
   5474     }
   5475 }
   5476 
   5477 
   5478 static void inRange8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2,
   5479                       const uchar* src3, size_t step3, uchar* dst, size_t step, Size size)
   5480 {
   5481     inRange_(src1, step1, src2, step2, src3, step3, dst, step, size);
   5482 }
   5483 
   5484 static void inRange8s(const schar* src1, size_t step1, const schar* src2, size_t step2,
   5485                       const schar* src3, size_t step3, uchar* dst, size_t step, Size size)
   5486 {
   5487     inRange_(src1, step1, src2, step2, src3, step3, dst, step, size);
   5488 }
   5489 
   5490 static void inRange16u(const ushort* src1, size_t step1, const ushort* src2, size_t step2,
   5491                        const ushort* src3, size_t step3, uchar* dst, size_t step, Size size)
   5492 {
   5493     inRange_(src1, step1, src2, step2, src3, step3, dst, step, size);
   5494 }
   5495 
   5496 static void inRange16s(const short* src1, size_t step1, const short* src2, size_t step2,
   5497                        const short* src3, size_t step3, uchar* dst, size_t step, Size size)
   5498 {
   5499     inRange_(src1, step1, src2, step2, src3, step3, dst, step, size);
   5500 }
   5501 
   5502 static void inRange32s(const int* src1, size_t step1, const int* src2, size_t step2,
   5503                        const int* src3, size_t step3, uchar* dst, size_t step, Size size)
   5504 {
   5505     inRange_(src1, step1, src2, step2, src3, step3, dst, step, size);
   5506 }
   5507 
   5508 static void inRange32f(const float* src1, size_t step1, const float* src2, size_t step2,
   5509                        const float* src3, size_t step3, uchar* dst, size_t step, Size size)
   5510 {
   5511     inRange_(src1, step1, src2, step2, src3, step3, dst, step, size);
   5512 }
   5513 
   5514 static void inRange64f(const double* src1, size_t step1, const double* src2, size_t step2,
   5515                        const double* src3, size_t step3, uchar* dst, size_t step, Size size)
   5516 {
   5517     inRange_(src1, step1, src2, step2, src3, step3, dst, step, size);
   5518 }
   5519 
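         // Collapse an interleaved multi-channel mask into a single-channel one:
         // a destination pixel stays 255 only if the per-channel test produced 0xFF
         // for every one of its cn channels (bitwise AND over each channel group).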
   5520 static void inRangeReduce(const uchar* src, uchar* dst, size_t len, int cn)
   5521 {
   5522     int k = cn % 4 ? cn % 4 : 4;
   5523     size_t i, j;
   5524     if( k == 1 )
   5525         for( i = j = 0; i < len; i++, j += cn )
   5526             dst[i] = src[j];
   5527     else if( k == 2 )
   5528         for( i = j = 0; i < len; i++, j += cn )
   5529             dst[i] = src[j] & src[j+1];
   5530     else if( k == 3 )
   5531         for( i = j = 0; i < len; i++, j += cn )
   5532             dst[i] = src[j] & src[j+1] & src[j+2];
   5533     else
   5534         for( i = j = 0; i < len; i++, j += cn )
   5535             dst[i] = src[j] & src[j+1] & src[j+2] & src[j+3];
   5536 
   5537     for( ; k < cn; k += 4 )
   5538     {
   5539         for( i = 0, j = k; i < len; i++, j += cn )
   5540             dst[i] &= src[j] & src[j+1] & src[j+2] & src[j+3];
   5541     }
   5542 }
   5543 
   5544 typedef void (*InRangeFunc)( const uchar* src1, size_t step1, const uchar* src2, size_t step2,
   5545                              const uchar* src3, size_t step3, uchar* dst, size_t step, Size sz );
   5546 
   5547 static InRangeFunc getInRangeFunc(int depth)
   5548 {
   5549     static InRangeFunc inRangeTab[] =
   5550     {
   5551         (InRangeFunc)GET_OPTIMIZED(inRange8u), (InRangeFunc)GET_OPTIMIZED(inRange8s), (InRangeFunc)GET_OPTIMIZED(inRange16u),
   5552         (InRangeFunc)GET_OPTIMIZED(inRange16s), (InRangeFunc)GET_OPTIMIZED(inRange32s), (InRangeFunc)GET_OPTIMIZED(inRange32f),
   5553         (InRangeFunc)inRange64f, 0
   5554     };
   5555 
   5556     return inRangeTab[depth];
   5557 }
   5558 
   5559 #ifdef HAVE_OPENCL
   5560 
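         // OpenCL path. Both bounds must either match src in size/type or both be
         // scalars; mixing the two makes the function return false so the caller can
         // fall back to the CPU implementation. Scalar bounds are range-checked and
         // converted to the source type on the host before the "inrange" kernel
         // writes the CV_8UC1 mask.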
   5561 static bool ocl_inRange( InputArray _src, InputArray _lowerb,
   5562                          InputArray _upperb, OutputArray _dst )
   5563 {
   5564     const ocl::Device & d = ocl::Device::getDefault();
   5565     int skind = _src.kind(), lkind = _lowerb.kind(), ukind = _upperb.kind();
   5566     Size ssize = _src.size(), lsize = _lowerb.size(), usize = _upperb.size();
   5567     int stype = _src.type(), ltype = _lowerb.type(), utype = _upperb.type();
   5568     int sdepth = CV_MAT_DEPTH(stype), ldepth = CV_MAT_DEPTH(ltype), udepth = CV_MAT_DEPTH(utype);
   5569     int cn = CV_MAT_CN(stype), rowsPerWI = d.isIntel() ? 4 : 1;
   5570     bool lbScalar = false, ubScalar = false;
   5571 
   5572     if( (lkind == _InputArray::MATX && skind != _InputArray::MATX) ||
   5573         ssize != lsize || stype != ltype )
   5574     {
   5575         if( !checkScalar(_lowerb, stype, lkind, skind) )
   5576             CV_Error( CV_StsUnmatchedSizes,
    5577                      "The lower boundary is neither an array of the same size and same type as src, nor a scalar");
   5578         lbScalar = true;
   5579     }
   5580 
   5581     if( (ukind == _InputArray::MATX && skind != _InputArray::MATX) ||
   5582         ssize != usize || stype != utype )
   5583     {
   5584         if( !checkScalar(_upperb, stype, ukind, skind) )
   5585             CV_Error( CV_StsUnmatchedSizes,
    5586                      "The upper boundary is neither an array of the same size and same type as src, nor a scalar");
   5587         ubScalar = true;
   5588     }
   5589 
   5590     if (lbScalar != ubScalar)
   5591         return false;
   5592 
   5593     bool doubleSupport = d.doubleFPConfig() > 0,
   5594             haveScalar = lbScalar && ubScalar;
   5595 
   5596     if ( (!doubleSupport && sdepth == CV_64F) ||
   5597          (!haveScalar && (sdepth != ldepth || sdepth != udepth)) )
   5598         return false;
   5599 
   5600     int kercn = haveScalar ? cn : std::max(std::min(ocl::predictOptimalVectorWidth(_src, _lowerb, _upperb, _dst), 4), cn);
   5601     if (kercn % cn != 0)
   5602         kercn = cn;
   5603     int colsPerWI = kercn / cn;
   5604     String opts = format("%s-D cn=%d -D srcT=%s -D srcT1=%s -D dstT=%s -D kercn=%d -D depth=%d%s -D colsPerWI=%d",
   5605                            haveScalar ? "-D HAVE_SCALAR " : "", cn, ocl::typeToStr(CV_MAKE_TYPE(sdepth, kercn)),
   5606                            ocl::typeToStr(sdepth), ocl::typeToStr(CV_8UC(colsPerWI)), kercn, sdepth,
   5607                            doubleSupport ? " -D DOUBLE_SUPPORT" : "", colsPerWI);
   5608 
   5609     ocl::Kernel ker("inrange", ocl::core::inrange_oclsrc, opts);
   5610     if (ker.empty())
   5611         return false;
   5612 
   5613     _dst.create(ssize, CV_8UC1);
   5614     UMat src = _src.getUMat(), dst = _dst.getUMat(), lscalaru, uscalaru;
   5615     Mat lscalar, uscalar;
   5616 
   5617     if (lbScalar && ubScalar)
   5618     {
   5619         lscalar = _lowerb.getMat();
   5620         uscalar = _upperb.getMat();
   5621 
   5622         size_t esz = src.elemSize();
   5623         size_t blocksize = 36;
   5624 
   5625         AutoBuffer<uchar> _buf(blocksize*(((int)lbScalar + (int)ubScalar)*esz + cn) + 2*cn*sizeof(int) + 128);
   5626         uchar *buf = alignPtr(_buf + blocksize*cn, 16);
   5627 
   5628         if( ldepth != sdepth && sdepth < CV_32S )
   5629         {
   5630             int* ilbuf = (int*)alignPtr(buf + blocksize*esz, 16);
   5631             int* iubuf = ilbuf + cn;
   5632 
   5633             BinaryFunc sccvtfunc = getConvertFunc(ldepth, CV_32S);
   5634             sccvtfunc(lscalar.ptr(), 1, 0, 1, (uchar*)ilbuf, 1, Size(cn, 1), 0);
   5635             sccvtfunc(uscalar.ptr(), 1, 0, 1, (uchar*)iubuf, 1, Size(cn, 1), 0);
   5636             int minval = cvRound(getMinVal(sdepth)), maxval = cvRound(getMaxVal(sdepth));
   5637 
   5638             for( int k = 0; k < cn; k++ )
   5639             {
   5640                 if( ilbuf[k] > iubuf[k] || ilbuf[k] > maxval || iubuf[k] < minval )
   5641                     ilbuf[k] = minval+1, iubuf[k] = minval;
   5642             }
   5643             lscalar = Mat(cn, 1, CV_32S, ilbuf);
   5644             uscalar = Mat(cn, 1, CV_32S, iubuf);
   5645         }
   5646 
   5647         lscalar.convertTo(lscalar, stype);
   5648         uscalar.convertTo(uscalar, stype);
   5649     }
   5650     else
   5651     {
   5652         lscalaru = _lowerb.getUMat();
   5653         uscalaru = _upperb.getUMat();
   5654     }
   5655 
   5656     ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src),
   5657             dstarg = ocl::KernelArg::WriteOnly(dst, 1, colsPerWI);
   5658 
   5659     if (haveScalar)
   5660     {
   5661         lscalar.copyTo(lscalaru);
   5662         uscalar.copyTo(uscalaru);
   5663 
   5664         ker.args(srcarg, dstarg, ocl::KernelArg::PtrReadOnly(lscalaru),
   5665                ocl::KernelArg::PtrReadOnly(uscalaru), rowsPerWI);
   5666     }
   5667     else
   5668         ker.args(srcarg, dstarg, ocl::KernelArg::ReadOnlyNoSize(lscalaru),
   5669                ocl::KernelArg::ReadOnlyNoSize(uscalaru), rowsPerWI);
   5670 
   5671     size_t globalsize[2] = { ssize.width / colsPerWI, (ssize.height + rowsPerWI - 1) / rowsPerWI };
   5672     return ker.run(2, globalsize, NULL, false);
   5673 }
   5674 
   5675 #endif
   5676 
   5677 }
   5678 
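         // Public entry point: tries the OpenCL path first, then walks the inputs in
         // blocks with an NAryMatIterator. Scalar bounds are validated, converted to
         // the source type and unrolled to a full block; for multi-channel inputs the
         // per-channel mask is reduced to a single channel with inRangeReduce().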
   5679 void cv::inRange(InputArray _src, InputArray _lowerb,
   5680                  InputArray _upperb, OutputArray _dst)
   5681 {
   5682     CV_OCL_RUN(_src.dims() <= 2 && _lowerb.dims() <= 2 &&
   5683                _upperb.dims() <= 2 && OCL_PERFORMANCE_CHECK(_dst.isUMat()),
   5684                ocl_inRange(_src, _lowerb, _upperb, _dst))
   5685 
   5686     int skind = _src.kind(), lkind = _lowerb.kind(), ukind = _upperb.kind();
   5687     Mat src = _src.getMat(), lb = _lowerb.getMat(), ub = _upperb.getMat();
   5688 
   5689     bool lbScalar = false, ubScalar = false;
   5690 
   5691     if( (lkind == _InputArray::MATX && skind != _InputArray::MATX) ||
   5692         src.size != lb.size || src.type() != lb.type() )
   5693     {
   5694         if( !checkScalar(lb, src.type(), lkind, skind) )
   5695             CV_Error( CV_StsUnmatchedSizes,
    5696                      "The lower boundary is neither an array of the same size and same type as src, nor a scalar");
   5697         lbScalar = true;
   5698     }
   5699 
   5700     if( (ukind == _InputArray::MATX && skind != _InputArray::MATX) ||
   5701         src.size != ub.size || src.type() != ub.type() )
   5702     {
   5703         if( !checkScalar(ub, src.type(), ukind, skind) )
   5704             CV_Error( CV_StsUnmatchedSizes,
    5705                      "The upper boundary is neither an array of the same size and same type as src, nor a scalar");
   5706         ubScalar = true;
   5707     }
   5708 
   5709     CV_Assert(lbScalar == ubScalar);
   5710 
   5711     int cn = src.channels(), depth = src.depth();
   5712 
   5713     size_t esz = src.elemSize();
   5714     size_t blocksize0 = (size_t)(BLOCK_SIZE + esz-1)/esz;
   5715 
   5716     _dst.create(src.dims, src.size, CV_8UC1);
   5717     Mat dst = _dst.getMat();
   5718     InRangeFunc func = getInRangeFunc(depth);
   5719 
   5720     const Mat* arrays_sc[] = { &src, &dst, 0 };
   5721     const Mat* arrays_nosc[] = { &src, &dst, &lb, &ub, 0 };
   5722     uchar* ptrs[4];
   5723 
   5724     NAryMatIterator it(lbScalar && ubScalar ? arrays_sc : arrays_nosc, ptrs);
   5725     size_t total = it.size, blocksize = std::min(total, blocksize0);
   5726 
   5727     AutoBuffer<uchar> _buf(blocksize*(((int)lbScalar + (int)ubScalar)*esz + cn) + 2*cn*sizeof(int) + 128);
   5728     uchar *buf = _buf, *mbuf = buf, *lbuf = 0, *ubuf = 0;
   5729     buf = alignPtr(buf + blocksize*cn, 16);
   5730 
   5731     if( lbScalar && ubScalar )
   5732     {
   5733         lbuf = buf;
   5734         ubuf = buf = alignPtr(buf + blocksize*esz, 16);
   5735 
   5736         CV_Assert( lb.type() == ub.type() );
   5737         int scdepth = lb.depth();
   5738 
   5739         if( scdepth != depth && depth < CV_32S )
   5740         {
   5741             int* ilbuf = (int*)alignPtr(buf + blocksize*esz, 16);
   5742             int* iubuf = ilbuf + cn;
   5743 
   5744             BinaryFunc sccvtfunc = getConvertFunc(scdepth, CV_32S);
   5745             sccvtfunc(lb.ptr(), 1, 0, 1, (uchar*)ilbuf, 1, Size(cn, 1), 0);
   5746             sccvtfunc(ub.ptr(), 1, 0, 1, (uchar*)iubuf, 1, Size(cn, 1), 0);
   5747             int minval = cvRound(getMinVal(depth)), maxval = cvRound(getMaxVal(depth));
   5748 
   5749             for( int k = 0; k < cn; k++ )
   5750             {
   5751                 if( ilbuf[k] > iubuf[k] || ilbuf[k] > maxval || iubuf[k] < minval )
   5752                     ilbuf[k] = minval+1, iubuf[k] = minval;
   5753             }
   5754             lb = Mat(cn, 1, CV_32S, ilbuf);
   5755             ub = Mat(cn, 1, CV_32S, iubuf);
   5756         }
   5757 
   5758         convertAndUnrollScalar( lb, src.type(), lbuf, blocksize );
   5759         convertAndUnrollScalar( ub, src.type(), ubuf, blocksize );
   5760     }
   5761 
   5762     for( size_t i = 0; i < it.nplanes; i++, ++it )
   5763     {
   5764         for( size_t j = 0; j < total; j += blocksize )
   5765         {
   5766             int bsz = (int)MIN(total - j, blocksize);
   5767             size_t delta = bsz*esz;
   5768             uchar *lptr = lbuf, *uptr = ubuf;
   5769             if( !lbScalar )
   5770             {
   5771                 lptr = ptrs[2];
   5772                 ptrs[2] += delta;
   5773             }
   5774             if( !ubScalar )
   5775             {
   5776                 int idx = !lbScalar ? 3 : 2;
   5777                 uptr = ptrs[idx];
   5778                 ptrs[idx] += delta;
   5779             }
   5780             func( ptrs[0], 0, lptr, 0, uptr, 0, cn == 1 ? ptrs[1] : mbuf, 0, Size(bsz*cn, 1));
   5781             if( cn > 1 )
   5782                 inRangeReduce(mbuf, ptrs[1], bsz, cn);
   5783             ptrs[0] += delta;
   5784             ptrs[1] += bsz;
   5785         }
   5786     }
   5787 }
   5788 
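         /* Illustrative usage sketch (not part of this translation unit): thresholding
            an HSV image with per-channel scalar bounds. The threshold values below are
            arbitrary example numbers, and cv::cvtColor comes from the imgproc module.

                #include <opencv2/core.hpp>
                #include <opencv2/imgproc.hpp>

                cv::Mat hsv, mask;
                cv::cvtColor(bgr, hsv, cv::COLOR_BGR2HSV);          // bgr: input CV_8UC3 image
                cv::inRange(hsv, cv::Scalar(35, 50, 50),            // lower bound per channel
                                 cv::Scalar(85, 255, 255), mask);   // upper bound per channel
                // mask is CV_8UC1: 255 where every channel lies inside [lower, upper].
         */
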
   5789 /****************************************************************************************\
   5790 *                                Earlier API: cvAdd etc.                                 *
   5791 \****************************************************************************************/
   5792 
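         // The wrappers below convert their CvArr arguments to cv::Mat headers without
         // copying data, assert that the destination already has a compatible size and
         // type/channel count, and forward to the corresponding cv:: function.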
   5793 CV_IMPL void
   5794 cvNot( const CvArr* srcarr, CvArr* dstarr )
   5795 {
   5796     cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr);
   5797     CV_Assert( src.size == dst.size && src.type() == dst.type() );
   5798     cv::bitwise_not( src, dst );
   5799 }
   5800 
   5801 
   5802 CV_IMPL void
   5803 cvAnd( const CvArr* srcarr1, const CvArr* srcarr2, CvArr* dstarr, const CvArr* maskarr )
   5804 {
   5805     cv::Mat src1 = cv::cvarrToMat(srcarr1), src2 = cv::cvarrToMat(srcarr2),
   5806         dst = cv::cvarrToMat(dstarr), mask;
   5807     CV_Assert( src1.size == dst.size && src1.type() == dst.type() );
   5808     if( maskarr )
   5809         mask = cv::cvarrToMat(maskarr);
   5810     cv::bitwise_and( src1, src2, dst, mask );
   5811 }
   5812 
   5813 
   5814 CV_IMPL void
   5815 cvOr( const CvArr* srcarr1, const CvArr* srcarr2, CvArr* dstarr, const CvArr* maskarr )
   5816 {
   5817     cv::Mat src1 = cv::cvarrToMat(srcarr1), src2 = cv::cvarrToMat(srcarr2),
   5818         dst = cv::cvarrToMat(dstarr), mask;
   5819     CV_Assert( src1.size == dst.size && src1.type() == dst.type() );
   5820     if( maskarr )
   5821         mask = cv::cvarrToMat(maskarr);
   5822     cv::bitwise_or( src1, src2, dst, mask );
   5823 }
   5824 
   5825 
   5826 CV_IMPL void
   5827 cvXor( const CvArr* srcarr1, const CvArr* srcarr2, CvArr* dstarr, const CvArr* maskarr )
   5828 {
   5829     cv::Mat src1 = cv::cvarrToMat(srcarr1), src2 = cv::cvarrToMat(srcarr2),
   5830         dst = cv::cvarrToMat(dstarr), mask;
   5831     CV_Assert( src1.size == dst.size && src1.type() == dst.type() );
   5832     if( maskarr )
   5833         mask = cv::cvarrToMat(maskarr);
   5834     cv::bitwise_xor( src1, src2, dst, mask );
   5835 }
   5836 
   5837 
   5838 CV_IMPL void
   5839 cvAndS( const CvArr* srcarr, CvScalar s, CvArr* dstarr, const CvArr* maskarr )
   5840 {
   5841     cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr), mask;
   5842     CV_Assert( src.size == dst.size && src.type() == dst.type() );
   5843     if( maskarr )
   5844         mask = cv::cvarrToMat(maskarr);
   5845     cv::bitwise_and( src, (const cv::Scalar&)s, dst, mask );
   5846 }
   5847 
   5848 
   5849 CV_IMPL void
   5850 cvOrS( const CvArr* srcarr, CvScalar s, CvArr* dstarr, const CvArr* maskarr )
   5851 {
   5852     cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr), mask;
   5853     CV_Assert( src.size == dst.size && src.type() == dst.type() );
   5854     if( maskarr )
   5855         mask = cv::cvarrToMat(maskarr);
   5856     cv::bitwise_or( src, (const cv::Scalar&)s, dst, mask );
   5857 }
   5858 
   5859 
   5860 CV_IMPL void
   5861 cvXorS( const CvArr* srcarr, CvScalar s, CvArr* dstarr, const CvArr* maskarr )
   5862 {
   5863     cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr), mask;
   5864     CV_Assert( src.size == dst.size && src.type() == dst.type() );
   5865     if( maskarr )
   5866         mask = cv::cvarrToMat(maskarr);
   5867     cv::bitwise_xor( src, (const cv::Scalar&)s, dst, mask );
   5868 }
   5869 
   5870 
   5871 CV_IMPL void cvAdd( const CvArr* srcarr1, const CvArr* srcarr2, CvArr* dstarr, const CvArr* maskarr )
   5872 {
   5873     cv::Mat src1 = cv::cvarrToMat(srcarr1), src2 = cv::cvarrToMat(srcarr2),
   5874         dst = cv::cvarrToMat(dstarr), mask;
   5875     CV_Assert( src1.size == dst.size && src1.channels() == dst.channels() );
   5876     if( maskarr )
   5877         mask = cv::cvarrToMat(maskarr);
   5878     cv::add( src1, src2, dst, mask, dst.type() );
   5879 }
   5880 
   5881 
   5882 CV_IMPL void cvSub( const CvArr* srcarr1, const CvArr* srcarr2, CvArr* dstarr, const CvArr* maskarr )
   5883 {
   5884     cv::Mat src1 = cv::cvarrToMat(srcarr1), src2 = cv::cvarrToMat(srcarr2),
   5885         dst = cv::cvarrToMat(dstarr), mask;
   5886     CV_Assert( src1.size == dst.size && src1.channels() == dst.channels() );
   5887     if( maskarr )
   5888         mask = cv::cvarrToMat(maskarr);
   5889     cv::subtract( src1, src2, dst, mask, dst.type() );
   5890 }
   5891 
   5892 
   5893 CV_IMPL void cvAddS( const CvArr* srcarr1, CvScalar value, CvArr* dstarr, const CvArr* maskarr )
   5894 {
   5895     cv::Mat src1 = cv::cvarrToMat(srcarr1),
   5896         dst = cv::cvarrToMat(dstarr), mask;
   5897     CV_Assert( src1.size == dst.size && src1.channels() == dst.channels() );
   5898     if( maskarr )
   5899         mask = cv::cvarrToMat(maskarr);
   5900     cv::add( src1, (const cv::Scalar&)value, dst, mask, dst.type() );
   5901 }
   5902 
   5903 
   5904 CV_IMPL void cvSubRS( const CvArr* srcarr1, CvScalar value, CvArr* dstarr, const CvArr* maskarr )
   5905 {
   5906     cv::Mat src1 = cv::cvarrToMat(srcarr1),
   5907         dst = cv::cvarrToMat(dstarr), mask;
   5908     CV_Assert( src1.size == dst.size && src1.channels() == dst.channels() );
   5909     if( maskarr )
   5910         mask = cv::cvarrToMat(maskarr);
   5911     cv::subtract( (const cv::Scalar&)value, src1, dst, mask, dst.type() );
   5912 }
   5913 
   5914 
   5915 CV_IMPL void cvMul( const CvArr* srcarr1, const CvArr* srcarr2,
   5916                     CvArr* dstarr, double scale )
   5917 {
   5918     cv::Mat src1 = cv::cvarrToMat(srcarr1), src2 = cv::cvarrToMat(srcarr2),
   5919         dst = cv::cvarrToMat(dstarr);
   5920     CV_Assert( src1.size == dst.size && src1.channels() == dst.channels() );
   5921     cv::multiply( src1, src2, dst, scale, dst.type() );
   5922 }
   5923 
   5924 
   5925 CV_IMPL void cvDiv( const CvArr* srcarr1, const CvArr* srcarr2,
   5926                     CvArr* dstarr, double scale )
   5927 {
   5928     cv::Mat src2 = cv::cvarrToMat(srcarr2),
   5929         dst = cv::cvarrToMat(dstarr), mask;
   5930     CV_Assert( src2.size == dst.size && src2.channels() == dst.channels() );
   5931 
   5932     if( srcarr1 )
   5933         cv::divide( cv::cvarrToMat(srcarr1), src2, dst, scale, dst.type() );
   5934     else
   5935         cv::divide( scale, src2, dst, dst.type() );
   5936 }
   5937 
   5938 
   5939 CV_IMPL void
   5940 cvAddWeighted( const CvArr* srcarr1, double alpha,
   5941                const CvArr* srcarr2, double beta,
   5942                double gamma, CvArr* dstarr )
   5943 {
   5944     cv::Mat src1 = cv::cvarrToMat(srcarr1), src2 = cv::cvarrToMat(srcarr2),
   5945         dst = cv::cvarrToMat(dstarr);
   5946     CV_Assert( src1.size == dst.size && src1.channels() == dst.channels() );
   5947     cv::addWeighted( src1, alpha, src2, beta, gamma, dst, dst.type() );
   5948 }
   5949 
   5950 
   5951 CV_IMPL  void
   5952 cvAbsDiff( const CvArr* srcarr1, const CvArr* srcarr2, CvArr* dstarr )
   5953 {
   5954     cv::Mat src1 = cv::cvarrToMat(srcarr1), dst = cv::cvarrToMat(dstarr);
   5955     CV_Assert( src1.size == dst.size && src1.type() == dst.type() );
   5956 
   5957     cv::absdiff( src1, cv::cvarrToMat(srcarr2), dst );
   5958 }
   5959 
   5960 
   5961 CV_IMPL void
   5962 cvAbsDiffS( const CvArr* srcarr1, CvArr* dstarr, CvScalar scalar )
   5963 {
   5964     cv::Mat src1 = cv::cvarrToMat(srcarr1), dst = cv::cvarrToMat(dstarr);
   5965     CV_Assert( src1.size == dst.size && src1.type() == dst.type() );
   5966 
   5967     cv::absdiff( src1, (const cv::Scalar&)scalar, dst );
   5968 }
   5969 
   5970 
   5971 CV_IMPL void
   5972 cvInRange( const void* srcarr1, const void* srcarr2,
   5973            const void* srcarr3, void* dstarr )
   5974 {
   5975     cv::Mat src1 = cv::cvarrToMat(srcarr1), dst = cv::cvarrToMat(dstarr);
   5976     CV_Assert( src1.size == dst.size && dst.type() == CV_8U );
   5977 
   5978     cv::inRange( src1, cv::cvarrToMat(srcarr2), cv::cvarrToMat(srcarr3), dst );
   5979 }
   5980 
   5981 
   5982 CV_IMPL void
   5983 cvInRangeS( const void* srcarr1, CvScalar lowerb, CvScalar upperb, void* dstarr )
   5984 {
   5985     cv::Mat src1 = cv::cvarrToMat(srcarr1), dst = cv::cvarrToMat(dstarr);
   5986     CV_Assert( src1.size == dst.size && dst.type() == CV_8U );
   5987 
   5988     cv::inRange( src1, (const cv::Scalar&)lowerb, (const cv::Scalar&)upperb, dst );
   5989 }
   5990 
   5991 
   5992 CV_IMPL void
   5993 cvCmp( const void* srcarr1, const void* srcarr2, void* dstarr, int cmp_op )
   5994 {
   5995     cv::Mat src1 = cv::cvarrToMat(srcarr1), dst = cv::cvarrToMat(dstarr);
   5996     CV_Assert( src1.size == dst.size && dst.type() == CV_8U );
   5997 
   5998     cv::compare( src1, cv::cvarrToMat(srcarr2), dst, cmp_op );
   5999 }
   6000 
   6001 
   6002 CV_IMPL void
   6003 cvCmpS( const void* srcarr1, double value, void* dstarr, int cmp_op )
   6004 {
   6005     cv::Mat src1 = cv::cvarrToMat(srcarr1), dst = cv::cvarrToMat(dstarr);
   6006     CV_Assert( src1.size == dst.size && dst.type() == CV_8U );
   6007 
   6008     cv::compare( src1, value, dst, cmp_op );
   6009 }
   6010 
   6011 
   6012 CV_IMPL void
   6013 cvMin( const void* srcarr1, const void* srcarr2, void* dstarr )
   6014 {
   6015     cv::Mat src1 = cv::cvarrToMat(srcarr1), dst = cv::cvarrToMat(dstarr);
   6016     CV_Assert( src1.size == dst.size && src1.type() == dst.type() );
   6017 
   6018     cv::min( src1, cv::cvarrToMat(srcarr2), dst );
   6019 }
   6020 
   6021 
   6022 CV_IMPL void
   6023 cvMax( const void* srcarr1, const void* srcarr2, void* dstarr )
   6024 {
   6025     cv::Mat src1 = cv::cvarrToMat(srcarr1), dst = cv::cvarrToMat(dstarr);
   6026     CV_Assert( src1.size == dst.size && src1.type() == dst.type() );
   6027 
   6028     cv::max( src1, cv::cvarrToMat(srcarr2), dst );
   6029 }
   6030 
   6031 
   6032 CV_IMPL void
   6033 cvMinS( const void* srcarr1, double value, void* dstarr )
   6034 {
   6035     cv::Mat src1 = cv::cvarrToMat(srcarr1), dst = cv::cvarrToMat(dstarr);
   6036     CV_Assert( src1.size == dst.size && src1.type() == dst.type() );
   6037 
   6038     cv::min( src1, value, dst );
   6039 }
   6040 
   6041 
   6042 CV_IMPL void
   6043 cvMaxS( const void* srcarr1, double value, void* dstarr )
   6044 {
   6045     cv::Mat src1 = cv::cvarrToMat(srcarr1), dst = cv::cvarrToMat(dstarr);
   6046     CV_Assert( src1.size == dst.size && src1.type() == dst.type() );
   6047 
   6048     cv::max( src1, value, dst );
   6049 }
   6050 
   6051 /* End of file. */
   6052