/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#include "precomp.hpp"
#include "opencl_kernels_core.hpp"

#ifdef __APPLE__
#undef CV_NEON
#define CV_NEON 0
#endif

namespace cv
{

/****************************************************************************************\
*                                       split & merge                                    *
\****************************************************************************************/

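// SIMD kernels for the hot cn == 2/3/4 cases of split_()/merge_() below, specialized
// per element type. On NEON the split kernels use the de-interleaving vldN loads with
// plain vst1 stores, and the merge kernels use plain vld1 loads with the interleaving
// vstN stores. On SSE2 the primary templates are no-op stubs with support == false;
// the specializations load/store two 128-bit registers per plane and shuffle them with
// the _mm_deinterleave*/_mm_interleave* helpers, guarded by a runtime
// checkHardwareSupport() check. Types without a specialization fall back to the
// scalar loops in split_()/merge_().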
#if CV_NEON
template<typename T> struct VSplit2;
template<typename T> struct VSplit3;
template<typename T> struct VSplit4;

#define SPLIT2_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func)  \
    template<>                                                                    \
    struct name<data_type>                                                        \
    {                                                                             \
        void operator()(const data_type* src, data_type* dst0,                    \
                        data_type* dst1) const                                    \
        {                                                                         \
            reg_type r = load_func(src);                                          \
            store_func(dst0, r.val[0]);                                           \
            store_func(dst1, r.val[1]);                                           \
        }                                                                         \
    }

#define SPLIT3_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func)  \
    template<>                                                                    \
    struct name<data_type>                                                        \
    {                                                                             \
        void operator()(const data_type* src, data_type* dst0, data_type* dst1,   \
                        data_type* dst2) const                                    \
        {                                                                         \
            reg_type r = load_func(src);                                          \
            store_func(dst0, r.val[0]);                                           \
            store_func(dst1, r.val[1]);                                           \
            store_func(dst2, r.val[2]);                                           \
        }                                                                         \
    }

#define SPLIT4_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func)  \
    template<>                                                                    \
    struct name<data_type>                                                        \
    {                                                                             \
        void operator()(const data_type* src, data_type* dst0, data_type* dst1,   \
                        data_type* dst2, data_type* dst3) const                   \
        {                                                                         \
            reg_type r = load_func(src);                                          \
            store_func(dst0, r.val[0]);                                           \
            store_func(dst1, r.val[1]);                                           \
            store_func(dst2, r.val[2]);                                           \
            store_func(dst3, r.val[3]);                                           \
        }                                                                         \
    }

SPLIT2_KERNEL_TEMPLATE(VSplit2, uchar ,  uint8x16x2_t, vld2q_u8 , vst1q_u8 );
SPLIT2_KERNEL_TEMPLATE(VSplit2, ushort,  uint16x8x2_t, vld2q_u16, vst1q_u16);
SPLIT2_KERNEL_TEMPLATE(VSplit2, int   ,   int32x4x2_t, vld2q_s32, vst1q_s32);
SPLIT2_KERNEL_TEMPLATE(VSplit2, int64 ,   int64x1x2_t, vld2_s64 , vst1_s64 );

SPLIT3_KERNEL_TEMPLATE(VSplit3, uchar ,  uint8x16x3_t, vld3q_u8 , vst1q_u8 );
SPLIT3_KERNEL_TEMPLATE(VSplit3, ushort,  uint16x8x3_t, vld3q_u16, vst1q_u16);
SPLIT3_KERNEL_TEMPLATE(VSplit3, int   ,   int32x4x3_t, vld3q_s32, vst1q_s32);
SPLIT3_KERNEL_TEMPLATE(VSplit3, int64 ,   int64x1x3_t, vld3_s64 , vst1_s64 );

SPLIT4_KERNEL_TEMPLATE(VSplit4, uchar ,  uint8x16x4_t, vld4q_u8 , vst1q_u8 );
SPLIT4_KERNEL_TEMPLATE(VSplit4, ushort,  uint16x8x4_t, vld4q_u16, vst1q_u16);
SPLIT4_KERNEL_TEMPLATE(VSplit4, int   ,   int32x4x4_t, vld4q_s32, vst1q_s32);
SPLIT4_KERNEL_TEMPLATE(VSplit4, int64 ,   int64x1x4_t, vld4_s64 , vst1_s64 );

#elif CV_SSE2

template <typename T>
struct VSplit2
{
    VSplit2() : support(false) { }
    void operator()(const T *, T *, T *) const { }

    bool support;
};

template <typename T>
struct VSplit3
{
    VSplit3() : support(false) { }
    void operator()(const T *, T *, T *, T *) const { }

    bool support;
};

template <typename T>
struct VSplit4
{
    VSplit4() : support(false) { }
    void operator()(const T *, T *, T *, T *, T *) const { }

    bool support;
};

#define SPLIT2_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_deinterleave, flavor)   \
template <>                                                                                \
struct VSplit2<data_type>                                                                  \
{                                                                                          \
    enum                                                                                   \
    {                                                                                      \
        ELEMS_IN_VEC = 16 / sizeof(data_type)                                              \
    };                                                                                     \
                                                                                           \
    VSplit2()                                                                              \
    {                                                                                      \
        support = checkHardwareSupport(CV_CPU_SSE2);                                       \
    }                                                                                      \
                                                                                           \
    void operator()(const data_type * src,                                                 \
                    data_type * dst0, data_type * dst1) const                              \
    {                                                                                      \
        reg_type v_src0 = _mm_loadu_##flavor((cast_type const *)(src));                    \
        reg_type v_src1 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC));     \
        reg_type v_src2 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 2)); \
        reg_type v_src3 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 3)); \
                                                                                           \
        _mm_deinterleave(v_src0, v_src1, v_src2, v_src3);                                  \
                                                                                           \
        _mm_storeu_##flavor((cast_type *)(dst0), v_src0);                                  \
        _mm_storeu_##flavor((cast_type *)(dst0 + ELEMS_IN_VEC), v_src1);                   \
        _mm_storeu_##flavor((cast_type *)(dst1), v_src2);                                  \
        _mm_storeu_##flavor((cast_type *)(dst1 + ELEMS_IN_VEC), v_src3);                   \
    }                                                                                      \
                                                                                           \
    bool support;                                                                          \
}

#define SPLIT3_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_deinterleave, flavor)   \
template <>                                                                                \
struct VSplit3<data_type>                                                                  \
{                                                                                          \
    enum                                                                                   \
    {                                                                                      \
        ELEMS_IN_VEC = 16 / sizeof(data_type)                                              \
    };                                                                                     \
                                                                                           \
    VSplit3()                                                                              \
    {                                                                                      \
        support = checkHardwareSupport(CV_CPU_SSE2);                                       \
    }                                                                                      \
                                                                                           \
    void operator()(const data_type * src,                                                 \
                    data_type * dst0, data_type * dst1, data_type * dst2) const            \
    {                                                                                      \
        reg_type v_src0 = _mm_loadu_##flavor((cast_type const *)(src));                    \
        reg_type v_src1 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC));     \
        reg_type v_src2 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 2)); \
        reg_type v_src3 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 3)); \
        reg_type v_src4 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 4)); \
        reg_type v_src5 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 5)); \
                                                                                           \
        _mm_deinterleave(v_src0, v_src1, v_src2,                                           \
                         v_src3, v_src4, v_src5);                                          \
                                                                                           \
        _mm_storeu_##flavor((cast_type *)(dst0), v_src0);                                  \
        _mm_storeu_##flavor((cast_type *)(dst0 + ELEMS_IN_VEC), v_src1);                   \
        _mm_storeu_##flavor((cast_type *)(dst1), v_src2);                                  \
        _mm_storeu_##flavor((cast_type *)(dst1 + ELEMS_IN_VEC), v_src3);                   \
        _mm_storeu_##flavor((cast_type *)(dst2), v_src4);                                  \
        _mm_storeu_##flavor((cast_type *)(dst2 + ELEMS_IN_VEC), v_src5);                   \
    }                                                                                      \
                                                                                           \
    bool support;                                                                          \
}

#define SPLIT4_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_deinterleave, flavor)   \
template <>                                                                                \
struct VSplit4<data_type>                                                                  \
{                                                                                          \
    enum                                                                                   \
    {                                                                                      \
        ELEMS_IN_VEC = 16 / sizeof(data_type)                                              \
    };                                                                                     \
                                                                                           \
    VSplit4()                                                                              \
    {                                                                                      \
        support = checkHardwareSupport(CV_CPU_SSE2);                                       \
    }                                                                                      \
                                                                                           \
    void operator()(const data_type * src, data_type * dst0, data_type * dst1,             \
                    data_type * dst2, data_type * dst3) const                              \
    {                                                                                      \
        reg_type v_src0 = _mm_loadu_##flavor((cast_type const *)(src));                    \
        reg_type v_src1 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC));     \
        reg_type v_src2 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 2)); \
        reg_type v_src3 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 3)); \
        reg_type v_src4 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 4)); \
        reg_type v_src5 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 5)); \
        reg_type v_src6 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 6)); \
        reg_type v_src7 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 7)); \
                                                                                           \
        _mm_deinterleave(v_src0, v_src1, v_src2, v_src3,                                   \
                         v_src4, v_src5, v_src6, v_src7);                                  \
                                                                                           \
        _mm_storeu_##flavor((cast_type *)(dst0), v_src0);                                  \
        _mm_storeu_##flavor((cast_type *)(dst0 + ELEMS_IN_VEC), v_src1);                   \
        _mm_storeu_##flavor((cast_type *)(dst1), v_src2);                                  \
        _mm_storeu_##flavor((cast_type *)(dst1 + ELEMS_IN_VEC), v_src3);                   \
        _mm_storeu_##flavor((cast_type *)(dst2), v_src4);                                  \
        _mm_storeu_##flavor((cast_type *)(dst2 + ELEMS_IN_VEC), v_src5);                   \
        _mm_storeu_##flavor((cast_type *)(dst3), v_src6);                                  \
        _mm_storeu_##flavor((cast_type *)(dst3 + ELEMS_IN_VEC), v_src7);                   \
    }                                                                                      \
                                                                                           \
    bool support;                                                                          \
}

SPLIT2_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_deinterleave_epi8, si128);
SPLIT2_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_deinterleave_epi16, si128);
SPLIT2_KERNEL_TEMPLATE(   int,  __m128,   float, _mm_deinterleave_ps, ps);

SPLIT3_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_deinterleave_epi8, si128);
SPLIT3_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_deinterleave_epi16, si128);
SPLIT3_KERNEL_TEMPLATE(   int,  __m128,   float, _mm_deinterleave_ps, ps);

SPLIT4_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_deinterleave_epi8, si128);
SPLIT4_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_deinterleave_epi16, si128);
SPLIT4_KERNEL_TEMPLATE(   int,  __m128,   float, _mm_deinterleave_ps, ps);

#endif

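// Scalar + vectorized channel split: copies an interleaved array `src` with `cn`
// channels of `len` elements into the separate planes dst[0..cn-1]. The first
// cn % 4 channels (or 4, if cn is a multiple of 4) are handled first, using the
// VSplitN kernels for the common cn == 2/3/4 cases and plain loops otherwise;
// any remaining channels are then processed four at a time in the trailing loop.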
template<typename T> static void
split_( const T* src, T** dst, int len, int cn )
{
    int k = cn % 4 ? cn % 4 : 4;
    int i, j;
    if( k == 1 )
    {
        T* dst0 = dst[0];

        if(cn == 1)
        {
            memcpy(dst0, src, len * sizeof(T));
        }
        else
        {
            for( i = 0, j = 0 ; i < len; i++, j += cn )
                dst0[i] = src[j];
        }
    }
    else if( k == 2 )
    {
        T *dst0 = dst[0], *dst1 = dst[1];
        i = j = 0;

#if CV_NEON
        if(cn == 2)
        {
            int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T);
            int inc_j = 2 * inc_i;

            VSplit2<T> vsplit;
            for( ; i < len - inc_i; i += inc_i, j += inc_j)
                vsplit(src + j, dst0 + i, dst1 + i);
        }
#elif CV_SSE2
        if (cn == 2)
        {
            int inc_i = 32/sizeof(T);
            int inc_j = 2 * inc_i;

            VSplit2<T> vsplit;
            if (vsplit.support)
            {
                for( ; i <= len - inc_i; i += inc_i, j += inc_j)
                    vsplit(src + j, dst0 + i, dst1 + i);
            }
        }
#endif
        for( ; i < len; i++, j += cn )
        {
            dst0[i] = src[j];
            dst1[i] = src[j+1];
        }
    }
    else if( k == 3 )
    {
        T *dst0 = dst[0], *dst1 = dst[1], *dst2 = dst[2];
        i = j = 0;

#if CV_NEON
        if(cn == 3)
        {
            int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T);
            int inc_j = 3 * inc_i;

            VSplit3<T> vsplit;
            for( ; i <= len - inc_i; i += inc_i, j += inc_j)
                vsplit(src + j, dst0 + i, dst1 + i, dst2 + i);
        }
#elif CV_SSE2
        if (cn == 3)
        {
            int inc_i = 32/sizeof(T);
            int inc_j = 3 * inc_i;

            VSplit3<T> vsplit;

            if (vsplit.support)
            {
                for( ; i <= len - inc_i; i += inc_i, j += inc_j)
                    vsplit(src + j, dst0 + i, dst1 + i, dst2 + i);
            }
        }
#endif
        for( ; i < len; i++, j += cn )
        {
            dst0[i] = src[j];
            dst1[i] = src[j+1];
            dst2[i] = src[j+2];
        }
    }
    else
    {
        T *dst0 = dst[0], *dst1 = dst[1], *dst2 = dst[2], *dst3 = dst[3];
        i = j = 0;

#if CV_NEON
        if(cn == 4)
        {
            int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T);
            int inc_j = 4 * inc_i;

            VSplit4<T> vsplit;
            for( ; i <= len - inc_i; i += inc_i, j += inc_j)
                vsplit(src + j, dst0 + i, dst1 + i, dst2 + i, dst3 + i);
        }
#elif CV_SSE2
        if (cn == 4)
        {
            int inc_i = 32/sizeof(T);
            int inc_j = 4 * inc_i;

            VSplit4<T> vsplit;
            if (vsplit.support)
            {
                for( ; i <= len - inc_i; i += inc_i, j += inc_j)
                    vsplit(src + j, dst0 + i, dst1 + i, dst2 + i, dst3 + i);
            }
        }
#endif
        for( ; i < len; i++, j += cn )
        {
            dst0[i] = src[j]; dst1[i] = src[j+1];
            dst2[i] = src[j+2]; dst3[i] = src[j+3];
        }
    }

    for( ; k < cn; k += 4 )
    {
        T *dst0 = dst[k], *dst1 = dst[k+1], *dst2 = dst[k+2], *dst3 = dst[k+3];
        for( i = 0, j = k; i < len; i++, j += cn )
        {
            dst0[i] = src[j]; dst1[i] = src[j+1];
            dst2[i] = src[j+2]; dst3[i] = src[j+3];
        }
    }
}


#if CV_NEON
template<typename T> struct VMerge2;
template<typename T> struct VMerge3;
template<typename T> struct VMerge4;

#define MERGE2_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func)  \
    template<>                                                                    \
    struct name<data_type>{                                                       \
        void operator()(const data_type* src0, const data_type* src1,             \
                        data_type* dst){                                          \
            reg_type r;                                                           \
            r.val[0] = load_func(src0);                                           \
            r.val[1] = load_func(src1);                                           \
            store_func(dst, r);                                                   \
        }                                                                         \
    }

#define MERGE3_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func)  \
    template<>                                                                    \
    struct name<data_type>{                                                       \
        void operator()(const data_type* src0, const data_type* src1,             \
                        const data_type* src2, data_type* dst){                   \
            reg_type r;                                                           \
            r.val[0] = load_func(src0);                                           \
            r.val[1] = load_func(src1);                                           \
            r.val[2] = load_func(src2);                                           \
            store_func(dst, r);                                                   \
        }                                                                         \
    }

#define MERGE4_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func)  \
    template<>                                                                    \
    struct name<data_type>{                                                       \
        void operator()(const data_type* src0, const data_type* src1,             \
                        const data_type* src2, const data_type* src3,             \
                        data_type* dst){                                          \
            reg_type r;                                                           \
            r.val[0] = load_func(src0);                                           \
            r.val[1] = load_func(src1);                                           \
            r.val[2] = load_func(src2);                                           \
            r.val[3] = load_func(src3);                                           \
            store_func(dst, r);                                                   \
        }                                                                         \
    }

MERGE2_KERNEL_TEMPLATE(VMerge2, uchar ,  uint8x16x2_t, vld1q_u8 , vst2q_u8 );
MERGE2_KERNEL_TEMPLATE(VMerge2, ushort,  uint16x8x2_t, vld1q_u16, vst2q_u16);
MERGE2_KERNEL_TEMPLATE(VMerge2, int   ,   int32x4x2_t, vld1q_s32, vst2q_s32);
MERGE2_KERNEL_TEMPLATE(VMerge2, int64 ,   int64x1x2_t, vld1_s64 , vst2_s64 );

MERGE3_KERNEL_TEMPLATE(VMerge3, uchar ,  uint8x16x3_t, vld1q_u8 , vst3q_u8 );
MERGE3_KERNEL_TEMPLATE(VMerge3, ushort,  uint16x8x3_t, vld1q_u16, vst3q_u16);
MERGE3_KERNEL_TEMPLATE(VMerge3, int   ,   int32x4x3_t, vld1q_s32, vst3q_s32);
MERGE3_KERNEL_TEMPLATE(VMerge3, int64 ,   int64x1x3_t, vld1_s64 , vst3_s64 );

MERGE4_KERNEL_TEMPLATE(VMerge4, uchar ,  uint8x16x4_t, vld1q_u8 , vst4q_u8 );
MERGE4_KERNEL_TEMPLATE(VMerge4, ushort,  uint16x8x4_t, vld1q_u16, vst4q_u16);
MERGE4_KERNEL_TEMPLATE(VMerge4, int   ,   int32x4x4_t, vld1q_s32, vst4q_s32);
MERGE4_KERNEL_TEMPLATE(VMerge4, int64 ,   int64x1x4_t, vld1_s64 , vst4_s64 );

#elif CV_SSE2

template <typename T>
struct VMerge2
{
    VMerge2() : support(false) { }
    void operator()(const T *, const T *, T *) const { }

    bool support;
};

template <typename T>
struct VMerge3
{
    VMerge3() : support(false) { }
    void operator()(const T *, const T *, const T *, T *) const { }

    bool support;
};

template <typename T>
struct VMerge4
{
    VMerge4() : support(false) { }
    void operator()(const T *, const T *, const T *, const T *, T *) const { }

    bool support;
};

#define MERGE2_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor, se) \
template <>                                                                                \
struct VMerge2<data_type>                                                                  \
{                                                                                          \
    enum                                                                                   \
    {                                                                                      \
        ELEMS_IN_VEC = 16 / sizeof(data_type)                                              \
    };                                                                                     \
                                                                                           \
    VMerge2()                                                                              \
    {                                                                                      \
        support = checkHardwareSupport(se);                                                \
    }                                                                                      \
                                                                                           \
    void operator()(const data_type * src0, const data_type * src1,                        \
                    data_type * dst) const                                                 \
    {                                                                                      \
        reg_type v_src0 = _mm_loadu_##flavor((const cast_type *)(src0));                   \
        reg_type v_src1 = _mm_loadu_##flavor((const cast_type *)(src0 + ELEMS_IN_VEC));    \
        reg_type v_src2 = _mm_loadu_##flavor((const cast_type *)(src1));                   \
        reg_type v_src3 = _mm_loadu_##flavor((const cast_type *)(src1 + ELEMS_IN_VEC));    \
                                                                                           \
        _mm_interleave(v_src0, v_src1, v_src2, v_src3);                                    \
                                                                                           \
        _mm_storeu_##flavor((cast_type *)(dst), v_src0);                                   \
        _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC), v_src1);                    \
        _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 2), v_src2);                \
        _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 3), v_src3);                \
    }                                                                                      \
                                                                                           \
    bool support;                                                                          \
}

#define MERGE3_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor, se) \
template <>                                                                                \
struct VMerge3<data_type>                                                                  \
{                                                                                          \
    enum                                                                                   \
    {                                                                                      \
        ELEMS_IN_VEC = 16 / sizeof(data_type)                                              \
    };                                                                                     \
                                                                                           \
    VMerge3()                                                                              \
    {                                                                                      \
        support = checkHardwareSupport(se);                                                \
    }                                                                                      \
                                                                                           \
    void operator()(const data_type * src0, const data_type * src1, const data_type * src2,\
                    data_type * dst) const                                                 \
    {                                                                                      \
        reg_type v_src0 = _mm_loadu_##flavor((const cast_type *)(src0));                   \
        reg_type v_src1 = _mm_loadu_##flavor((const cast_type *)(src0 + ELEMS_IN_VEC));    \
        reg_type v_src2 = _mm_loadu_##flavor((const cast_type *)(src1));                   \
        reg_type v_src3 = _mm_loadu_##flavor((const cast_type *)(src1 + ELEMS_IN_VEC));    \
        reg_type v_src4 = _mm_loadu_##flavor((const cast_type *)(src2));                   \
        reg_type v_src5 = _mm_loadu_##flavor((const cast_type *)(src2 + ELEMS_IN_VEC));    \
                                                                                           \
        _mm_interleave(v_src0, v_src1, v_src2,                                             \
                       v_src3, v_src4, v_src5);                                            \
                                                                                           \
        _mm_storeu_##flavor((cast_type *)(dst), v_src0);                                   \
        _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC), v_src1);                    \
        _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 2), v_src2);                \
        _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 3), v_src3);                \
        _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 4), v_src4);                \
        _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 5), v_src5);                \
    }                                                                                      \
                                                                                           \
    bool support;                                                                          \
}

#define MERGE4_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor, se) \
template <>                                                                                \
struct VMerge4<data_type>                                                                  \
{                                                                                          \
    enum                                                                                   \
    {                                                                                      \
        ELEMS_IN_VEC = 16 / sizeof(data_type)                                              \
    };                                                                                     \
                                                                                           \
    VMerge4()                                                                              \
    {                                                                                      \
        support = checkHardwareSupport(se);                                                \
    }                                                                                      \
                                                                                           \
    void operator()(const data_type * src0, const data_type * src1,                        \
                    const data_type * src2, const data_type * src3,                        \
                    data_type * dst) const                                                 \
    {                                                                                      \
        reg_type v_src0 = _mm_loadu_##flavor((const cast_type *)(src0));                   \
        reg_type v_src1 = _mm_loadu_##flavor((const cast_type *)(src0 + ELEMS_IN_VEC));    \
        reg_type v_src2 = _mm_loadu_##flavor((const cast_type *)(src1));                   \
        reg_type v_src3 = _mm_loadu_##flavor((const cast_type *)(src1 + ELEMS_IN_VEC));    \
        reg_type v_src4 = _mm_loadu_##flavor((const cast_type *)(src2));                   \
        reg_type v_src5 = _mm_loadu_##flavor((const cast_type *)(src2 + ELEMS_IN_VEC));    \
        reg_type v_src6 = _mm_loadu_##flavor((const cast_type *)(src3));                   \
        reg_type v_src7 = _mm_loadu_##flavor((const cast_type *)(src3 + ELEMS_IN_VEC));    \
                                                                                           \
        _mm_interleave(v_src0, v_src1, v_src2, v_src3,                                     \
                       v_src4, v_src5, v_src6, v_src7);                                    \
                                                                                           \
        _mm_storeu_##flavor((cast_type *)(dst), v_src0);                                   \
        _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC), v_src1);                    \
        _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 2), v_src2);                \
        _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 3), v_src3);                \
        _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 4), v_src4);                \
        _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 5), v_src5);                \
        _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 6), v_src6);                \
        _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 7), v_src7);                \
    }                                                                                      \
                                                                                           \
    bool support;                                                                          \
}

MERGE2_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128, CV_CPU_SSE2);
MERGE3_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128, CV_CPU_SSE2);
MERGE4_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128, CV_CPU_SSE2);

#if CV_SSE4_1
MERGE2_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128, CV_CPU_SSE4_1);
MERGE3_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128, CV_CPU_SSE4_1);
MERGE4_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128, CV_CPU_SSE4_1);
#endif

MERGE2_KERNEL_TEMPLATE(   int,  __m128,   float, _mm_interleave_ps, ps, CV_CPU_SSE2);
MERGE3_KERNEL_TEMPLATE(   int,  __m128,   float, _mm_interleave_ps, ps, CV_CPU_SSE2);
MERGE4_KERNEL_TEMPLATE(   int,  __m128,   float, _mm_interleave_ps, ps, CV_CPU_SSE2);

#endif

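// Inverse of split_: interleaves the planes src[0..cn-1] of `len` elements into `dst`.
// The structure mirrors split_ exactly: the first cn % 4 (or 4) planes go through the
// VMergeN kernels when cn == 2/3/4 and through scalar loops otherwise, and any
// remaining planes are merged four at a time in the trailing loop.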
template<typename T> static void
merge_( const T** src, T* dst, int len, int cn )
{
    int k = cn % 4 ? cn % 4 : 4;
    int i, j;
    if( k == 1 )
    {
        const T* src0 = src[0];
        for( i = j = 0; i < len; i++, j += cn )
            dst[j] = src0[i];
    }
    else if( k == 2 )
    {
        const T *src0 = src[0], *src1 = src[1];
        i = j = 0;
#if CV_NEON
        if(cn == 2)
        {
            int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T);
            int inc_j = 2 * inc_i;

            VMerge2<T> vmerge;
            for( ; i < len - inc_i; i += inc_i, j += inc_j)
                vmerge(src0 + i, src1 + i, dst + j);
        }
#elif CV_SSE2
        if(cn == 2)
        {
            int inc_i = 32/sizeof(T);
            int inc_j = 2 * inc_i;

            VMerge2<T> vmerge;
            if (vmerge.support)
                for( ; i < len - inc_i; i += inc_i, j += inc_j)
                    vmerge(src0 + i, src1 + i, dst + j);
        }
#endif
        for( ; i < len; i++, j += cn )
        {
            dst[j] = src0[i];
            dst[j+1] = src1[i];
        }
    }
    else if( k == 3 )
    {
        const T *src0 = src[0], *src1 = src[1], *src2 = src[2];
        i = j = 0;
#if CV_NEON
        if(cn == 3)
        {
            int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T);
            int inc_j = 3 * inc_i;

            VMerge3<T> vmerge;
            for( ; i < len - inc_i; i += inc_i, j += inc_j)
                vmerge(src0 + i, src1 + i, src2 + i, dst + j);
        }
#elif CV_SSE2
        if(cn == 3)
        {
            int inc_i = 32/sizeof(T);
            int inc_j = 3 * inc_i;

            VMerge3<T> vmerge;
            if (vmerge.support)
                for( ; i < len - inc_i; i += inc_i, j += inc_j)
                    vmerge(src0 + i, src1 + i, src2 + i, dst + j);
        }
#endif
        for( ; i < len; i++, j += cn )
        {
            dst[j] = src0[i];
            dst[j+1] = src1[i];
            dst[j+2] = src2[i];
        }
    }
    else
    {
        const T *src0 = src[0], *src1 = src[1], *src2 = src[2], *src3 = src[3];
        i = j = 0;
#if CV_NEON
        if(cn == 4)
        {
            int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T);
            int inc_j = 4 * inc_i;

            VMerge4<T> vmerge;
            for( ; i < len - inc_i; i += inc_i, j += inc_j)
                vmerge(src0 + i, src1 + i, src2 + i, src3 + i, dst + j);
        }
#elif CV_SSE2
        if(cn == 4)
        {
            int inc_i = 32/sizeof(T);
            int inc_j = 4 * inc_i;

            VMerge4<T> vmerge;
            if (vmerge.support)
                for( ; i < len - inc_i; i += inc_i, j += inc_j)
                    vmerge(src0 + i, src1 + i, src2 + i, src3 + i, dst + j);
        }
#endif
        for( ; i < len; i++, j += cn )
        {
            dst[j] = src0[i]; dst[j+1] = src1[i];
            dst[j+2] = src2[i]; dst[j+3] = src3[i];
        }
    }

    for( ; k < cn; k += 4 )
    {
        const T *src0 = src[k], *src1 = src[k+1], *src2 = src[k+2], *src3 = src[k+3];
        for( i = 0, j = k; i < len; i++, j += cn )
        {
            dst[j] = src0[i]; dst[j+1] = src1[i];
            dst[j+2] = src2[i]; dst[j+3] = src3[i];
        }
    }
}

static void split8u(const uchar* src, uchar** dst, int len, int cn )
{
    split_(src, dst, len, cn);
}

static void split16u(const ushort* src, ushort** dst, int len, int cn )
{
    split_(src, dst, len, cn);
}

static void split32s(const int* src, int** dst, int len, int cn )
{
    split_(src, dst, len, cn);
}

static void split64s(const int64* src, int64** dst, int len, int cn )
{
    split_(src, dst, len, cn);
}

static void merge8u(const uchar** src, uchar* dst, int len, int cn )
{
    merge_(src, dst, len, cn);
}

static void merge16u(const ushort** src, ushort* dst, int len, int cn )
{
    merge_(src, dst, len, cn);
}

static void merge32s(const int** src, int* dst, int len, int cn )
{
    merge_(src, dst, len, cn);
}

static void merge64s(const int64** src, int64* dst, int len, int cn )
{
    merge_(src, dst, len, cn);
}

typedef void (*SplitFunc)(const uchar* src, uchar** dst, int len, int cn);
typedef void (*MergeFunc)(const uchar** src, uchar* dst, int len, int cn);

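// Depth-indexed dispatch tables (CV_8U..CV_64F). Since split/merge only move bytes,
// kernels are shared between types of the same element size: 8S reuses the 8U kernel,
// 16S reuses 16U, 32F reuses 32S, and 64F goes through the 64-bit integer kernel.
// The trailing 0 slot corresponds to depth 7 (the user type), which is not supported.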
static SplitFunc getSplitFunc(int depth)
{
    static SplitFunc splitTab[] =
    {
        (SplitFunc)GET_OPTIMIZED(split8u), (SplitFunc)GET_OPTIMIZED(split8u), (SplitFunc)GET_OPTIMIZED(split16u), (SplitFunc)GET_OPTIMIZED(split16u),
        (SplitFunc)GET_OPTIMIZED(split32s), (SplitFunc)GET_OPTIMIZED(split32s), (SplitFunc)GET_OPTIMIZED(split64s), 0
    };

    return splitTab[depth];
}

static MergeFunc getMergeFunc(int depth)
{
    static MergeFunc mergeTab[] =
    {
        (MergeFunc)GET_OPTIMIZED(merge8u), (MergeFunc)GET_OPTIMIZED(merge8u), (MergeFunc)GET_OPTIMIZED(merge16u), (MergeFunc)GET_OPTIMIZED(merge16u),
        (MergeFunc)GET_OPTIMIZED(merge32s), (MergeFunc)GET_OPTIMIZED(merge32s), (MergeFunc)GET_OPTIMIZED(merge64s), 0
    };

    return mergeTab[depth];
}

}

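// Low-level split: `mv` must point to `cn` Mat headers, which are (re)created with the
// source geometry. The source is walked with NAryMatIterator so non-continuous and
// multi-dimensional matrices are handled plane by plane, and for cn > 4 each plane is
// processed in blocks of roughly BLOCK_SIZE bytes, presumably to keep the working set
// cache-friendly.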
void cv::split(const Mat& src, Mat* mv)
{
    int k, depth = src.depth(), cn = src.channels();
    if( cn == 1 )
    {
        src.copyTo(mv[0]);
        return;
    }

    SplitFunc func = getSplitFunc(depth);
    CV_Assert( func != 0 );

    int esz = (int)src.elemSize(), esz1 = (int)src.elemSize1();
    int blocksize0 = (BLOCK_SIZE + esz-1)/esz;
    AutoBuffer<uchar> _buf((cn+1)*(sizeof(Mat*) + sizeof(uchar*)) + 16);
    const Mat** arrays = (const Mat**)(uchar*)_buf;
    uchar** ptrs = (uchar**)alignPtr(arrays + cn + 1, 16);

    arrays[0] = &src;
    for( k = 0; k < cn; k++ )
    {
        mv[k].create(src.dims, src.size, depth);
        arrays[k+1] = &mv[k];
    }

    NAryMatIterator it(arrays, ptrs, cn+1);
    int total = (int)it.size, blocksize = cn <= 4 ? total : std::min(total, blocksize0);

    for( size_t i = 0; i < it.nplanes; i++, ++it )
    {
        for( int j = 0; j < total; j += blocksize )
        {
            int bsz = std::min(total - j, blocksize);
            func( ptrs[0], &ptrs[1], bsz, cn );

            if( j + blocksize < total )
            {
                ptrs[0] += bsz*esz;
                for( k = 0; k < cn; k++ )
                    ptrs[k+1] += bsz*esz1;
            }
        }
    }
}

#ifdef HAVE_OPENCL

namespace cv {

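// OpenCL path for split: builds the "split" kernel from ocl::core::split_merge_oclsrc
// with one DECLARE_DST_PARAM/DECLARE_INDEX/PROCESS_ELEM macro instance per channel,
// binds the source and the cn destination UMats, and lets each work item handle
// rowsPerWI rows (4 on Intel devices, 1 otherwise). Returning false makes the caller
// fall back to the CPU path.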
static bool ocl_split( InputArray _m, OutputArrayOfArrays _mv )
{
    int type = _m.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type),
            rowsPerWI = ocl::Device::getDefault().isIntel() ? 4 : 1;

    String dstargs, processelem, indexdecl;
    for (int i = 0; i < cn; ++i)
    {
        dstargs += format("DECLARE_DST_PARAM(%d)", i);
        indexdecl += format("DECLARE_INDEX(%d)", i);
        processelem += format("PROCESS_ELEM(%d)", i);
    }

    ocl::Kernel k("split", ocl::core::split_merge_oclsrc,
                  format("-D T=%s -D OP_SPLIT -D cn=%d -D DECLARE_DST_PARAMS=%s"
                         " -D PROCESS_ELEMS_N=%s -D DECLARE_INDEX_N=%s",
                         ocl::memopTypeToStr(depth), cn, dstargs.c_str(),
                         processelem.c_str(), indexdecl.c_str()));
    if (k.empty())
        return false;

    Size size = _m.size();
    _mv.create(cn, 1, depth);
    for (int i = 0; i < cn; ++i)
        _mv.create(size, depth, i);

    std::vector<UMat> dst;
    _mv.getUMatVector(dst);

    int argidx = k.set(0, ocl::KernelArg::ReadOnly(_m.getUMat()));
    for (int i = 0; i < cn; ++i)
        argidx = k.set(argidx, ocl::KernelArg::WriteOnlyNoSize(dst[i]));
    k.set(argidx, rowsPerWI);

    size_t globalsize[2] = { size.width, (size.height + rowsPerWI - 1) / rowsPerWI };
    return k.run(2, globalsize, NULL, false);
}

}

#endif

void cv::split(InputArray _m, OutputArrayOfArrays _mv)
{
    CV_OCL_RUN(_m.dims() <= 2 && _mv.isUMatVector(),
               ocl_split(_m, _mv))

    Mat m = _m.getMat();
    if( m.empty() )
    {
        _mv.release();
        return;
    }

    CV_Assert( !_mv.fixedType() || _mv.empty() || _mv.type() == m.depth() );

    Size size = m.size();
    int depth = m.depth(), cn = m.channels();
    _mv.create(cn, 1, depth);
    for (int i = 0; i < cn; ++i)
        _mv.create(size, depth, i);

    std::vector<Mat> dst;
    _mv.getMatVector(dst);

    split(m, &dst[0]);
}

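// Low-level merge: all inputs must share size and depth but may themselves have several
// channels. When every input is single-channel, the fast merge_ kernels are used block
// by block (mirroring cv::split above); otherwise the call is forwarded to mixChannels
// with an identity channel mapping.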
void cv::merge(const Mat* mv, size_t n, OutputArray _dst)
{
    CV_Assert( mv && n > 0 );

    int depth = mv[0].depth();
    bool allch1 = true;
    int k, cn = 0;
    size_t i;

    for( i = 0; i < n; i++ )
    {
        CV_Assert(mv[i].size == mv[0].size && mv[i].depth() == depth);
        allch1 = allch1 && mv[i].channels() == 1;
        cn += mv[i].channels();
    }

    CV_Assert( 0 < cn && cn <= CV_CN_MAX );
    _dst.create(mv[0].dims, mv[0].size, CV_MAKETYPE(depth, cn));
    Mat dst = _dst.getMat();

    if( n == 1 )
    {
        mv[0].copyTo(dst);
        return;
    }

    if( !allch1 )
    {
        AutoBuffer<int> pairs(cn*2);
        int j, ni=0;

        for( i = 0, j = 0; i < n; i++, j += ni )
        {
            ni = mv[i].channels();
            for( k = 0; k < ni; k++ )
            {
                pairs[(j+k)*2] = j + k;
                pairs[(j+k)*2+1] = j + k;
            }
        }
        mixChannels( mv, n, &dst, 1, &pairs[0], cn );
        return;
    }

    size_t esz = dst.elemSize(), esz1 = dst.elemSize1();
    int blocksize0 = (int)((BLOCK_SIZE + esz-1)/esz);
    AutoBuffer<uchar> _buf((cn+1)*(sizeof(Mat*) + sizeof(uchar*)) + 16);
    const Mat** arrays = (const Mat**)(uchar*)_buf;
    uchar** ptrs = (uchar**)alignPtr(arrays + cn + 1, 16);

    arrays[0] = &dst;
    for( k = 0; k < cn; k++ )
        arrays[k+1] = &mv[k];

    NAryMatIterator it(arrays, ptrs, cn+1);
    int total = (int)it.size, blocksize = cn <= 4 ? total : std::min(total, blocksize0);
    MergeFunc func = getMergeFunc(depth);

    for( i = 0; i < it.nplanes; i++, ++it )
    {
        for( int j = 0; j < total; j += blocksize )
        {
            int bsz = std::min(total - j, blocksize);
            func( (const uchar**)&ptrs[1], ptrs[0], bsz, cn );

            if( j + blocksize < total )
            {
                ptrs[0] += bsz*esz;
                for( int t = 0; t < cn; t++ )
                    ptrs[t+1] += bsz*esz1;
            }
        }
    }
}

#ifdef HAVE_OPENCL

namespace cv {

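// OpenCL path for merge: multi-channel inputs are flattened into one UMat header per
// source channel (by offsetting into the parent buffer), then a "merge" kernel is built
// with one DECLARE_SRC_PARAM/DECLARE_INDEX/PROCESS_ELEM instance per flattened channel.
// Inputs with more than 2 dimensions make the function return false, which sends the
// caller back to the CPU path.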
   1018 static bool ocl_merge( InputArrayOfArrays _mv, OutputArray _dst )
   1019 {
   1020     std::vector<UMat> src, ksrc;
   1021     _mv.getUMatVector(src);
   1022     CV_Assert(!src.empty());
   1023 
   1024     int type = src[0].type(), depth = CV_MAT_DEPTH(type),
   1025             rowsPerWI = ocl::Device::getDefault().isIntel() ? 4 : 1;
   1026     Size size = src[0].size();
   1027 
   1028     for (size_t i = 0, srcsize = src.size(); i < srcsize; ++i)
   1029     {
   1030         int itype = src[i].type(), icn = CV_MAT_CN(itype), idepth = CV_MAT_DEPTH(itype),
   1031                 esz1 = CV_ELEM_SIZE1(idepth);
   1032         if (src[i].dims > 2)
   1033             return false;
   1034 
   1035         CV_Assert(size == src[i].size() && depth == idepth);
   1036 
   1037         for (int cn = 0; cn < icn; ++cn)
   1038         {
   1039             UMat tsrc = src[i];
   1040             tsrc.offset += cn * esz1;
   1041             ksrc.push_back(tsrc);
   1042         }
   1043     }
   1044     int dcn = (int)ksrc.size();
   1045 
   1046     String srcargs, processelem, cndecl, indexdecl;
   1047     for (int i = 0; i < dcn; ++i)
   1048     {
   1049         srcargs += format("DECLARE_SRC_PARAM(%d)", i);
   1050         processelem += format("PROCESS_ELEM(%d)", i);
   1051         indexdecl += format("DECLARE_INDEX(%d)", i);
   1052         cndecl += format(" -D scn%d=%d", i, ksrc[i].channels());
   1053     }
   1054 
   1055     ocl::Kernel k("merge", ocl::core::split_merge_oclsrc,
   1056                   format("-D OP_MERGE -D cn=%d -D T=%s -D DECLARE_SRC_PARAMS_N=%s"
   1057                          " -D DECLARE_INDEX_N=%s -D PROCESS_ELEMS_N=%s%s",
   1058                          dcn, ocl::memopTypeToStr(depth), srcargs.c_str(),
   1059                          indexdecl.c_str(), processelem.c_str(), cndecl.c_str()));
   1060     if (k.empty())
   1061         return false;
   1062 
   1063     _dst.create(size, CV_MAKE_TYPE(depth, dcn));
   1064     UMat dst = _dst.getUMat();
   1065 
   1066     int argidx = 0;
   1067     for (int i = 0; i < dcn; ++i)
   1068         argidx = k.set(argidx, ocl::KernelArg::ReadOnlyNoSize(ksrc[i]));
   1069     argidx = k.set(argidx, ocl::KernelArg::WriteOnly(dst));
   1070     k.set(argidx, rowsPerWI);
   1071 
   1072     size_t globalsize[2] = { (size_t)dst.cols, ((size_t)dst.rows + rowsPerWI - 1) / rowsPerWI };
   1073     return k.run(2, globalsize, NULL, false);
   1074 }
   1075 
   1076 }
   1077 
   1078 #endif
   1079 
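        // Public cv::merge() entry point: tries the OpenCL path first (when the inputs are a
        // UMat vector and the output is a UMat), otherwise gathers the inputs into a
        // std::vector<Mat> and forwards to the raw-pointer overload above.
        //
        // Illustrative usage (user code), given a CV_8UC3 image `bgr`:
        //     std::vector<cv::Mat> planes;
        //     cv::split(bgr, planes);     // three CV_8UC1 planes
        //     cv::merge(planes, bgr);     // interleave them back into one CV_8UC3 image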
   1080 void cv::merge(InputArrayOfArrays _mv, OutputArray _dst)
   1081 {
   1082     CV_OCL_RUN(_mv.isUMatVector() && _dst.isUMat(),
   1083                ocl_merge(_mv, _dst))
   1084 
   1085     std::vector<Mat> mv;
   1086     _mv.getMatVector(mv);
   1087     merge(!mv.empty() ? &mv[0] : 0, mv.size(), _dst);
   1088 }
   1089 
   1090 /****************************************************************************************\
   1091 *                       Generalized split/merge: mixing channels                         *
   1092 \****************************************************************************************/
   1093 
   1094 namespace cv
   1095 {
   1096 
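        // Generic per-element worker behind cv::mixChannels(): for every (src,dst) pair it
        // copies `len` samples, stepping by the source/destination channel counts
        // (sdelta/ddelta) since the data is interleaved. A NULL source pointer means the pair
        // had a negative source index in fromTo, in which case the destination channel is
        // zero-filled instead.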
   1097 template<typename T> static void
   1098 mixChannels_( const T** src, const int* sdelta,
   1099               T** dst, const int* ddelta,
   1100               int len, int npairs )
   1101 {
   1102     int i, k;
   1103     for( k = 0; k < npairs; k++ )
   1104     {
   1105         const T* s = src[k];
   1106         T* d = dst[k];
   1107         int ds = sdelta[k], dd = ddelta[k];
   1108         if( s )
   1109         {
   1110             for( i = 0; i <= len - 2; i += 2, s += ds*2, d += dd*2 )
   1111             {
   1112                 T t0 = s[0], t1 = s[ds];
   1113                 d[0] = t0; d[dd] = t1;
   1114             }
   1115             if( i < len )
   1116                 d[0] = s[0];
   1117         }
   1118         else
   1119         {
   1120             for( i = 0; i <= len - 2; i += 2, d += dd*2 )
   1121                 d[0] = d[dd] = 0;
   1122             if( i < len )
   1123                 d[0] = 0;
   1124         }
   1125     }
   1126 }
   1127 
   1128 
   1129 static void mixChannels8u( const uchar** src, const int* sdelta,
   1130                            uchar** dst, const int* ddelta,
   1131                            int len, int npairs )
   1132 {
   1133     mixChannels_(src, sdelta, dst, ddelta, len, npairs);
   1134 }
   1135 
   1136 static void mixChannels16u( const ushort** src, const int* sdelta,
   1137                             ushort** dst, const int* ddelta,
   1138                             int len, int npairs )
   1139 {
   1140     mixChannels_(src, sdelta, dst, ddelta, len, npairs);
   1141 }
   1142 
   1143 static void mixChannels32s( const int** src, const int* sdelta,
   1144                             int** dst, const int* ddelta,
   1145                             int len, int npairs )
   1146 {
   1147     mixChannels_(src, sdelta, dst, ddelta, len, npairs);
   1148 }
   1149 
   1150 static void mixChannels64s( const int64** src, const int* sdelta,
   1151                             int64** dst, const int* ddelta,
   1152                             int len, int npairs )
   1153 {
   1154     mixChannels_(src, sdelta, dst, ddelta, len, npairs);
   1155 }
   1156 
   1157 typedef void (*MixChannelsFunc)( const uchar** src, const int* sdelta,
   1158         uchar** dst, const int* ddelta, int len, int npairs );
   1159 
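        // Depth -> worker lookup for mixChannels. Only the element size matters when shuffling
        // channels, so CV_8S reuses the 8u worker, CV_16S the 16u one, CV_32F the 32s one, and
        // CV_64F the 64-bit copier.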
   1160 static MixChannelsFunc getMixchFunc(int depth)
   1161 {
   1162     static MixChannelsFunc mixchTab[] =
   1163     {
   1164         (MixChannelsFunc)mixChannels8u, (MixChannelsFunc)mixChannels8u, (MixChannelsFunc)mixChannels16u,
   1165         (MixChannelsFunc)mixChannels16u, (MixChannelsFunc)mixChannels32s, (MixChannelsFunc)mixChannels32s,
   1166         (MixChannelsFunc)mixChannels64s, 0
   1167     };
   1168 
   1169     return mixchTab[depth];
   1170 }
   1171 
   1172 }
   1173 
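        // Raw-pointer cv::mixChannels(): fromTo holds npairs (src_channel, dst_channel) index
        // pairs, with channels numbered consecutively across all source matrices and then across
        // all destination matrices; a negative source index zero-fills the target channel.
        // Copying is done plane by plane (NAryMatIterator) in cache-sized blocks.
        //
        // Illustrative usage (user code): split a BGRA image into a BGR image plus an alpha plane.
        //     cv::Mat bgra(100, 100, CV_8UC4, cv::Scalar(255, 0, 0, 255));
        //     cv::Mat bgr(bgra.size(), CV_8UC3), alpha(bgra.size(), CV_8UC1);
        //     cv::Mat out[] = { bgr, alpha };
        //     int from_to[] = { 0,0, 1,1, 2,2, 3,3 };   // B->B, G->G, R->R, A->alpha
        //     cv::mixChannels(&bgra, 1, out, 2, from_to, 4);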
   1174 void cv::mixChannels( const Mat* src, size_t nsrcs, Mat* dst, size_t ndsts, const int* fromTo, size_t npairs )
   1175 {
   1176     if( npairs == 0 )
   1177         return;
   1178     CV_Assert( src && nsrcs > 0 && dst && ndsts > 0 && fromTo && npairs > 0 );
   1179 
   1180     size_t i, j, k, esz1 = dst[0].elemSize1();
   1181     int depth = dst[0].depth();
   1182 
   1183     AutoBuffer<uchar> buf((nsrcs + ndsts + 1)*(sizeof(Mat*) + sizeof(uchar*)) + npairs*(sizeof(uchar*)*2 + sizeof(int)*6));
   1184     const Mat** arrays = (const Mat**)(uchar*)buf;
   1185     uchar** ptrs = (uchar**)(arrays + nsrcs + ndsts);
   1186     const uchar** srcs = (const uchar**)(ptrs + nsrcs + ndsts + 1);
   1187     uchar** dsts = (uchar**)(srcs + npairs);
   1188     int* tab = (int*)(dsts + npairs);
   1189     int *sdelta = (int*)(tab + npairs*4), *ddelta = sdelta + npairs;
   1190 
   1191     for( i = 0; i < nsrcs; i++ )
   1192         arrays[i] = &src[i];
   1193     for( i = 0; i < ndsts; i++ )
   1194         arrays[i + nsrcs] = &dst[i];
   1195     ptrs[nsrcs + ndsts] = 0;
   1196 
   1197     for( i = 0; i < npairs; i++ )
   1198     {
   1199         int i0 = fromTo[i*2], i1 = fromTo[i*2+1];
   1200         if( i0 >= 0 )
   1201         {
   1202             for( j = 0; j < nsrcs; i0 -= src[j].channels(), j++ )
   1203                 if( i0 < src[j].channels() )
   1204                     break;
   1205             CV_Assert(j < nsrcs && src[j].depth() == depth);
   1206             tab[i*4] = (int)j; tab[i*4+1] = (int)(i0*esz1);
   1207             sdelta[i] = src[j].channels();
   1208         }
   1209         else
   1210         {
   1211             tab[i*4] = (int)(nsrcs + ndsts); tab[i*4+1] = 0;
   1212             sdelta[i] = 0;
   1213         }
   1214 
   1215         for( j = 0; j < ndsts; i1 -= dst[j].channels(), j++ )
   1216             if( i1 < dst[j].channels() )
   1217                 break;
   1218         CV_Assert(i1 >= 0 && j < ndsts && dst[j].depth() == depth);
   1219         tab[i*4+2] = (int)(j + nsrcs); tab[i*4+3] = (int)(i1*esz1);
   1220         ddelta[i] = dst[j].channels();
   1221     }
   1222 
   1223     NAryMatIterator it(arrays, ptrs, (int)(nsrcs + ndsts));
   1224     int total = (int)it.size, blocksize = std::min(total, (int)((BLOCK_SIZE + esz1-1)/esz1));
   1225     MixChannelsFunc func = getMixchFunc(depth);
   1226 
   1227     for( i = 0; i < it.nplanes; i++, ++it )
   1228     {
   1229         for( k = 0; k < npairs; k++ )
   1230         {
   1231             srcs[k] = ptrs[tab[k*4]] + tab[k*4+1];
   1232             dsts[k] = ptrs[tab[k*4+2]] + tab[k*4+3];
   1233         }
   1234 
   1235         for( int t = 0; t < total; t += blocksize )
   1236         {
   1237             int bsz = std::min(total - t, blocksize);
   1238             func( srcs, sdelta, dsts, ddelta, bsz, (int)npairs );
   1239 
   1240             if( t + blocksize < total )
   1241                 for( k = 0; k < npairs; k++ )
   1242                 {
   1243                     srcs[k] += blocksize*sdelta[k]*esz1;
   1244                     dsts[k] += blocksize*ddelta[k]*esz1;
   1245                 }
   1246         }
   1247     }
   1248 }
   1249 
   1250 #ifdef HAVE_OPENCL
   1251 
   1252 namespace cv {
   1253 
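        // Maps a global channel index `cn` (counted across the whole UMat vector, as in the
        // fromTo table) to the index of the matrix that contains it and the channel index
        // inside that matrix; sets both outputs to -1 when cn does not fall inside any matrix.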
   1254 static void getUMatIndex(const std::vector<UMat> & um, int cn, int & idx, int & cnidx)
   1255 {
   1256     int totalChannels = 0;
   1257     for (size_t i = 0, size = um.size(); i < size; ++i)
   1258     {
   1259         int ccn = um[i].channels();
   1260         totalChannels += ccn;
   1261 
   1262         if (totalChannels == cn)
   1263         {
   1264             idx = (int)(i + 1);
   1265             cnidx = 0;
   1266             return;
   1267         }
   1268         else if (totalChannels > cn)
   1269         {
   1270             idx = (int)i;
   1271             cnidx = i == 0 ? cn : (cn - totalChannels + ccn);
   1272             return;
   1273         }
   1274     }
   1275 
   1276     idx = cnidx = -1;
   1277 }
   1278 
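        // OpenCL path for cv::mixChannels(): each requested (src,dst) channel pair is passed to
        // the "mixChannels" kernel (mixchannels.cl) as a pair of offset-shifted UMat arguments,
        // with the parent channel counts forwarded via the scn%d/dcn%d build options so the
        // kernel strides correctly over the interleaved data. Returns false to fall back to the
        // CPU path when the kernel cannot be built.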
   1279 static bool ocl_mixChannels(InputArrayOfArrays _src, InputOutputArrayOfArrays _dst,
   1280                             const int* fromTo, size_t npairs)
   1281 {
   1282     std::vector<UMat> src, dst;
   1283     _src.getUMatVector(src);
   1284     _dst.getUMatVector(dst);
   1285 
   1286     size_t nsrc = src.size(), ndst = dst.size();
   1287     CV_Assert(nsrc > 0 && ndst > 0);
   1288 
   1289     Size size = src[0].size();
   1290     int depth = src[0].depth(), esz = CV_ELEM_SIZE(depth),
   1291             rowsPerWI = ocl::Device::getDefault().isIntel() ? 4 : 1;
   1292 
   1293     for (size_t i = 1, ssize = src.size(); i < ssize; ++i)
   1294         CV_Assert(src[i].size() == size && src[i].depth() == depth);
   1295     for (size_t i = 0, dsize = dst.size(); i < dsize; ++i)
   1296         CV_Assert(dst[i].size() == size && dst[i].depth() == depth);
   1297 
   1298     String declsrc, decldst, declproc, declcn, indexdecl;
   1299     std::vector<UMat> srcargs(npairs), dstargs(npairs);
   1300 
   1301     for (size_t i = 0; i < npairs; ++i)
   1302     {
   1303         int scn = fromTo[i<<1], dcn = fromTo[(i<<1) + 1];
   1304         int src_idx, src_cnidx, dst_idx, dst_cnidx;
   1305 
   1306         getUMatIndex(src, scn, src_idx, src_cnidx);
   1307         getUMatIndex(dst, dcn, dst_idx, dst_cnidx);
   1308 
   1309         CV_Assert(dst_idx >= 0 && src_idx >= 0);
   1310 
   1311         srcargs[i] = src[src_idx];
   1312         srcargs[i].offset += src_cnidx * esz;
   1313 
   1314         dstargs[i] = dst[dst_idx];
   1315         dstargs[i].offset += dst_cnidx * esz;
   1316 
   1317         declsrc += format("DECLARE_INPUT_MAT(%d)", (int)i);
   1318         decldst += format("DECLARE_OUTPUT_MAT(%d)", (int)i);
   1319         indexdecl += format("DECLARE_INDEX(%d)", (int)i);
   1320         declproc += format("PROCESS_ELEM(%d)", (int)i);
   1321         declcn += format(" -D scn%d=%d -D dcn%d=%d", (int)i, src[src_idx].channels(), (int)i, dst[dst_idx].channels());
   1322     }
   1323 
   1324     ocl::Kernel k("mixChannels", ocl::core::mixchannels_oclsrc,
   1325                   format("-D T=%s -D DECLARE_INPUT_MAT_N=%s -D DECLARE_OUTPUT_MAT_N=%s"
   1326                          " -D PROCESS_ELEM_N=%s -D DECLARE_INDEX_N=%s%s",
   1327                          ocl::memopTypeToStr(depth), declsrc.c_str(), decldst.c_str(),
   1328                          declproc.c_str(), indexdecl.c_str(), declcn.c_str()));
   1329     if (k.empty())
   1330         return false;
   1331 
   1332     int argindex = 0;
   1333     for (size_t i = 0; i < npairs; ++i)
   1334         argindex = k.set(argindex, ocl::KernelArg::ReadOnlyNoSize(srcargs[i]));
   1335     for (size_t i = 0; i < npairs; ++i)
   1336         argindex = k.set(argindex, ocl::KernelArg::WriteOnlyNoSize(dstargs[i]));
   1337     argindex = k.set(argindex, size.height);
   1338     argindex = k.set(argindex, size.width);
   1339     k.set(argindex, rowsPerWI);
   1340 
   1341     size_t globalsize[2] = { (size_t)size.width, ((size_t)size.height + rowsPerWI - 1) / rowsPerWI };
   1342     return k.run(2, globalsize, NULL, false);
   1343 }
   1344 
   1345 }
   1346 
   1347 #endif
   1348 
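        // InputArrayOfArrays overloads: normalize the sources/destinations to contiguous Mat
        // arrays (a single Mat counts as a one-element array) and forward to the raw-pointer
        // version above; UMat vectors are routed to the OpenCL path first.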
   1349 void cv::mixChannels(InputArrayOfArrays src, InputOutputArrayOfArrays dst,
   1350                  const int* fromTo, size_t npairs)
   1351 {
   1352     if (npairs == 0 || fromTo == NULL)
   1353         return;
   1354 
   1355     CV_OCL_RUN(dst.isUMatVector(),
   1356                ocl_mixChannels(src, dst, fromTo, npairs))
   1357 
   1358     bool src_is_mat = src.kind() != _InputArray::STD_VECTOR_MAT &&
   1359             src.kind() != _InputArray::STD_VECTOR_VECTOR &&
   1360             src.kind() != _InputArray::STD_VECTOR_UMAT;
   1361     bool dst_is_mat = dst.kind() != _InputArray::STD_VECTOR_MAT &&
   1362             dst.kind() != _InputArray::STD_VECTOR_VECTOR &&
   1363             dst.kind() != _InputArray::STD_VECTOR_UMAT;
   1364     int i;
   1365     int nsrc = src_is_mat ? 1 : (int)src.total();
   1366     int ndst = dst_is_mat ? 1 : (int)dst.total();
   1367 
   1368     CV_Assert(nsrc > 0 && ndst > 0);
   1369     cv::AutoBuffer<Mat> _buf(nsrc + ndst);
   1370     Mat* buf = _buf;
   1371     for( i = 0; i < nsrc; i++ )
   1372         buf[i] = src.getMat(src_is_mat ? -1 : i);
   1373     for( i = 0; i < ndst; i++ )
   1374         buf[nsrc + i] = dst.getMat(dst_is_mat ? -1 : i);
   1375     mixChannels(&buf[0], nsrc, &buf[nsrc], ndst, fromTo, npairs);
   1376 }
   1377 
   1378 void cv::mixChannels(InputArrayOfArrays src, InputOutputArrayOfArrays dst,
   1379                      const std::vector<int>& fromTo)
   1380 {
   1381     if (fromTo.empty())
   1382         return;
   1383 
   1384     CV_OCL_RUN(dst.isUMatVector(),
   1385                ocl_mixChannels(src, dst, &fromTo[0], fromTo.size()>>1))
   1386 
   1387     bool src_is_mat = src.kind() != _InputArray::STD_VECTOR_MAT &&
   1388             src.kind() != _InputArray::STD_VECTOR_VECTOR &&
   1389             src.kind() != _InputArray::STD_VECTOR_UMAT;
   1390     bool dst_is_mat = dst.kind() != _InputArray::STD_VECTOR_MAT &&
   1391             dst.kind() != _InputArray::STD_VECTOR_VECTOR &&
   1392             dst.kind() != _InputArray::STD_VECTOR_UMAT;
   1393     int i;
   1394     int nsrc = src_is_mat ? 1 : (int)src.total();
   1395     int ndst = dst_is_mat ? 1 : (int)dst.total();
   1396 
   1397     CV_Assert(fromTo.size()%2 == 0 && nsrc > 0 && ndst > 0);
   1398     cv::AutoBuffer<Mat> _buf(nsrc + ndst);
   1399     Mat* buf = _buf;
   1400     for( i = 0; i < nsrc; i++ )
   1401         buf[i] = src.getMat(src_is_mat ? -1 : i);
   1402     for( i = 0; i < ndst; i++ )
   1403         buf[nsrc + i] = dst.getMat(dst_is_mat ? -1 : i);
   1404     mixChannels(&buf[0], nsrc, &buf[nsrc], ndst, &fromTo[0], fromTo.size()/2);
   1405 }
   1406 
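        // cv::extractChannel(src, dst, coi) is a convenience wrapper over mixChannels() with the
        // single pair {coi, 0}: it copies channel `coi` of src into a newly allocated
        // single-channel dst of the same depth and size.
        //
        // Illustrative usage (user code), given a CV_8UC3 image `bgr`:
        //     cv::Mat green;
        //     cv::extractChannel(bgr, green, 1);   // pull out the G plane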
   1407 void cv::extractChannel(InputArray _src, OutputArray _dst, int coi)
   1408 {
   1409     int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
   1410     CV_Assert( 0 <= coi && coi < cn );
   1411     int ch[] = { coi, 0 };
   1412 
   1413     if (ocl::useOpenCL() && _src.dims() <= 2 && _dst.isUMat())
   1414     {
   1415         UMat src = _src.getUMat();
   1416         _dst.create(src.dims, &src.size[0], depth);
   1417         UMat dst = _dst.getUMat();
   1418         mixChannels(std::vector<UMat>(1, src), std::vector<UMat>(1, dst), ch, 1);
   1419         return;
   1420     }
   1421 
   1422     Mat src = _src.getMat();
   1423     _dst.create(src.dims, &src.size[0], depth);
   1424     Mat dst = _dst.getMat();
   1425     mixChannels(&src, 1, &dst, 1, ch, 1);
   1426 }
   1427 
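        // cv::insertChannel(src, dst, coi) is the inverse wrapper: it copies the single-channel
        // src into channel `coi` of the existing multi-channel dst (same depth and size), again
        // via mixChannels() with the single pair {0, coi}.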
   1428 void cv::insertChannel(InputArray _src, InputOutputArray _dst, int coi)
   1429 {
   1430     int stype = _src.type(), sdepth = CV_MAT_DEPTH(stype), scn = CV_MAT_CN(stype);
   1431     int dtype = _dst.type(), ddepth = CV_MAT_DEPTH(dtype), dcn = CV_MAT_CN(dtype);
   1432     CV_Assert( _src.sameSize(_dst) && sdepth == ddepth );
   1433     CV_Assert( 0 <= coi && coi < dcn && scn == 1 );
   1434 
   1435     int ch[] = { 0, coi };
   1436     if (ocl::useOpenCL() && _src.dims() <= 2 && _dst.isUMat())
   1437     {
   1438         UMat src = _src.getUMat(), dst = _dst.getUMat();
   1439         mixChannels(std::vector<UMat>(1, src), std::vector<UMat>(1, dst), ch, 1);
   1440         return;
   1441     }
   1442 
   1443     Mat src = _src.getMat(), dst = _dst.getMat();
   1444     mixChannels(&src, 1, &dst, 1, ch, 1);
   1445 }
   1446 
   1447 /****************************************************************************************\
   1448 *                                convertScale[Abs]                                       *
   1449 \****************************************************************************************/
   1450 
   1451 namespace cv
   1452 {
   1453 
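        // SIMD helper for convertScaleAbs(): the generic functor does nothing and returns 0
        // ("no elements processed"); the platform specializations below (SSE2 / NEON) handle as
        // many leading elements of a row as they can and return how far they got, so the scalar
        // code in cvtScaleAbs_() only finishes the tail.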
   1454 template<typename T, typename DT, typename WT>
   1455 struct cvtScaleAbs_SIMD
   1456 {
   1457     int operator () (const T *, DT *, int, WT, WT) const
   1458     {
   1459         return 0;
   1460     }
   1461 };
   1462 
   1463 #if CV_SSE2
   1464 
   1465 template <>
   1466 struct cvtScaleAbs_SIMD<uchar, uchar, float>
   1467 {
   1468     int operator () (const uchar * src, uchar * dst, int width,
   1469                      float scale, float shift) const
   1470     {
   1471         int x = 0;
   1472 
   1473         if (USE_SSE2)
   1474         {
   1475             __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift),
   1476                 v_zero_f = _mm_setzero_ps();
   1477             __m128i v_zero_i = _mm_setzero_si128();
   1478 
   1479             for ( ; x <= width - 16; x += 16)
   1480             {
   1481                 __m128i v_src = _mm_loadu_si128((const __m128i *)(src + x));
   1482                 __m128i v_src_12 = _mm_unpacklo_epi8(v_src, v_zero_i), v_src_34 = _mm_unpackhi_epi8(v_src, v_zero_i);
   1483                 __m128 v_dst1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src_12, v_zero_i)), v_scale), v_shift);
   1484                 v_dst1 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst1), v_dst1);
   1485                 __m128 v_dst2 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src_12, v_zero_i)), v_scale), v_shift);
   1486                 v_dst2 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst2), v_dst2);
   1487                 __m128 v_dst3 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src_34, v_zero_i)), v_scale), v_shift);
   1488                 v_dst3 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst3), v_dst3);
   1489                 __m128 v_dst4 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src_34, v_zero_i)), v_scale), v_shift);
   1490                 v_dst4 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst4), v_dst4);
   1491 
   1492                 __m128i v_dst_i = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2)),
   1493                                                    _mm_packs_epi32(_mm_cvtps_epi32(v_dst3), _mm_cvtps_epi32(v_dst4)));
   1494                 _mm_storeu_si128((__m128i *)(dst + x), v_dst_i);
   1495             }
   1496         }
   1497 
   1498         return x;
   1499     }
   1500 };
   1501 
   1502 template <>
   1503 struct cvtScaleAbs_SIMD<schar, uchar, float>
   1504 {
   1505     int operator () (const schar * src, uchar * dst, int width,
   1506                      float scale, float shift) const
   1507     {
   1508         int x = 0;
   1509 
   1510         if (USE_SSE2)
   1511         {
   1512             __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift),
   1513                 v_zero_f = _mm_setzero_ps();
   1514             __m128i v_zero_i = _mm_setzero_si128();
   1515 
   1516             for ( ; x <= width - 16; x += 16)
   1517             {
   1518                 __m128i v_src = _mm_loadu_si128((const __m128i *)(src + x));
   1519                 __m128i v_src_12 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero_i, v_src), 8),
   1520                         v_src_34 = _mm_srai_epi16(_mm_unpackhi_epi8(v_zero_i, v_src), 8);
   1521                 __m128 v_dst1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(
   1522                     _mm_srai_epi32(_mm_unpacklo_epi16(v_zero_i, v_src_12), 16)), v_scale), v_shift);
   1523                 v_dst1 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst1), v_dst1);
   1524                 __m128 v_dst2 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(
   1525                     _mm_srai_epi32(_mm_unpackhi_epi16(v_zero_i, v_src_12), 16)), v_scale), v_shift);
   1526                 v_dst2 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst2), v_dst2);
   1527                 __m128 v_dst3 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(
   1528                     _mm_srai_epi32(_mm_unpacklo_epi16(v_zero_i, v_src_34), 16)), v_scale), v_shift);
   1529                 v_dst3 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst3), v_dst3);
   1530                 __m128 v_dst4 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(
   1531                     _mm_srai_epi32(_mm_unpackhi_epi16(v_zero_i, v_src_34), 16)), v_scale), v_shift);
   1532                 v_dst4 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst4), v_dst4);
   1533 
   1534                 __m128i v_dst_i = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2)),
   1535                                                    _mm_packs_epi32(_mm_cvtps_epi32(v_dst3), _mm_cvtps_epi32(v_dst4)));
   1536                 _mm_storeu_si128((__m128i *)(dst + x), v_dst_i);
   1537             }
   1538         }
   1539 
   1540         return x;
   1541     }
   1542 };
   1543 
   1544 template <>
   1545 struct cvtScaleAbs_SIMD<ushort, uchar, float>
   1546 {
   1547     int operator () (const ushort * src, uchar * dst, int width,
   1548                      float scale, float shift) const
   1549     {
   1550         int x = 0;
   1551 
   1552         if (USE_SSE2)
   1553         {
   1554             __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift),
   1555                 v_zero_f = _mm_setzero_ps();
   1556             __m128i v_zero_i = _mm_setzero_si128();
   1557 
   1558             for ( ; x <= width - 8; x += 8)
   1559             {
   1560                 __m128i v_src = _mm_loadu_si128((const __m128i *)(src + x));
   1561                 __m128 v_dst1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero_i)), v_scale), v_shift);
   1562                 v_dst1 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst1), v_dst1);
   1563                 __m128 v_dst2 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero_i)), v_scale), v_shift);
   1564                 v_dst2 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst2), v_dst2);
   1565 
   1566                 __m128i v_dst_i = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2)), v_zero_i);
   1567                 _mm_storel_epi64((__m128i *)(dst + x), v_dst_i);
   1568             }
   1569         }
   1570 
   1571         return x;
   1572     }
   1573 };
   1574 
   1575 template <>
   1576 struct cvtScaleAbs_SIMD<short, uchar, float>
   1577 {
   1578     int operator () (const short * src, uchar * dst, int width,
   1579                      float scale, float shift) const
   1580     {
   1581         int x = 0;
   1582 
   1583         if (USE_SSE2)
   1584         {
   1585             __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift),
   1586                 v_zero_f = _mm_setzero_ps();
   1587             __m128i v_zero_i = _mm_setzero_si128();
   1588 
   1589             for ( ; x <= width - 8; x += 8)
   1590             {
   1591                 __m128i v_src = _mm_loadu_si128((const __m128i *)(src + x));
   1592                 __m128 v_dst1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_src, v_src), 16)), v_scale), v_shift);
   1593                 v_dst1 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst1), v_dst1);
   1594                 __m128 v_dst2 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_src, v_src), 16)), v_scale), v_shift);
   1595                 v_dst2 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst2), v_dst2);
   1596 
   1597                 __m128i v_dst_i = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2)), v_zero_i);
   1598                 _mm_storel_epi64((__m128i *)(dst + x), v_dst_i);
   1599             }
   1600         }
   1601 
   1602         return x;
   1603     }
   1604 };
   1605 
   1606 template <>
   1607 struct cvtScaleAbs_SIMD<int, uchar, float>
   1608 {
   1609     int operator () (const int * src, uchar * dst, int width,
   1610                      float scale, float shift) const
   1611     {
   1612         int x = 0;
   1613 
   1614         if (USE_SSE2)
   1615         {
   1616             __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift),
   1617                 v_zero_f = _mm_setzero_ps();
   1618             __m128i v_zero_i = _mm_setzero_si128();
   1619 
   1620             for ( ; x <= width - 8; x += 4)
   1621             {
   1622                 __m128i v_src = _mm_loadu_si128((const __m128i *)(src + x));
   1623                 __m128 v_dst1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift);
   1624                 v_dst1 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst1), v_dst1);
   1625 
   1626                 __m128i v_dst_i = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(v_dst1), v_zero_i), v_zero_i);
   1627                 _mm_storel_epi64((__m128i *)(dst + x), v_dst_i);
   1628             }
   1629         }
   1630 
   1631         return x;
   1632     }
   1633 };
   1634 
   1635 template <>
   1636 struct cvtScaleAbs_SIMD<float, uchar, float>
   1637 {
   1638     int operator () (const float * src, uchar * dst, int width,
   1639                      float scale, float shift) const
   1640     {
   1641         int x = 0;
   1642 
   1643         if (USE_SSE2)
   1644         {
   1645             __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift),
   1646                 v_zero_f = _mm_setzero_ps();
   1647             __m128i v_zero_i = _mm_setzero_si128();
   1648 
   1649             for ( ; x <= width - 8; x += 4)
   1650             {
   1651                 __m128 v_dst = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src + x), v_scale), v_shift);
   1652                 v_dst = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst), v_dst);
   1653 
   1654                 __m128i v_dst_i = _mm_packs_epi32(_mm_cvtps_epi32(v_dst), v_zero_i);
   1655                 _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst_i, v_zero_i));
   1656             }
   1657         }
   1658 
   1659         return x;
   1660     }
   1661 };
   1662 
   1663 template <>
   1664 struct cvtScaleAbs_SIMD<double, uchar, float>
   1665 {
   1666     int operator () (const double * src, uchar * dst, int width,
   1667                      float scale, float shift) const
   1668     {
   1669         int x = 0;
   1670 
   1671         if (USE_SSE2)
   1672         {
   1673             __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift),
   1674                 v_zero_f = _mm_setzero_ps();
   1675             __m128i v_zero_i = _mm_setzero_si128();
   1676 
   1677             for ( ; x <= width - 8; x += 8)
   1678             {
   1679                 __m128 v_src1 = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x)),
   1680                                               _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2)));
   1681                 __m128 v_src2 = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x + 4)),
   1682                                               _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6)));
   1683 
   1684                 __m128 v_dst1 = _mm_add_ps(_mm_mul_ps(v_src1, v_scale), v_shift);
   1685                 v_dst1 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst1), v_dst1);
   1686 
   1687                 __m128 v_dst2 = _mm_add_ps(_mm_mul_ps(v_src2, v_scale), v_shift);
   1688                 v_dst2 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst2), v_dst2);
   1689 
   1690                 __m128i v_dst_i = _mm_packs_epi32(_mm_cvtps_epi32(v_dst1),
   1691                                                   _mm_cvtps_epi32(v_dst2));
   1692 
   1693                 _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst_i, v_zero_i));
   1694             }
   1695         }
   1696 
   1697         return x;
   1698     }
   1699 };
   1700 
   1701 #elif CV_NEON
   1702 
   1703 template <>
   1704 struct cvtScaleAbs_SIMD<uchar, uchar, float>
   1705 {
   1706     int operator () (const uchar * src, uchar * dst, int width,
   1707                      float scale, float shift) const
   1708     {
   1709         int x = 0;
   1710         float32x4_t v_shift = vdupq_n_f32(shift);
   1711 
   1712         for ( ; x <= width - 16; x += 16)
   1713         {
   1714             uint8x16_t v_src = vld1q_u8(src + x);
   1715             uint16x8_t v_half = vmovl_u8(vget_low_u8(v_src));
   1716 
   1717             uint32x4_t v_quat = vmovl_u16(vget_low_u16(v_half));
   1718             float32x4_t v_dst_0 = vmulq_n_f32(vcvtq_f32_u32(v_quat), scale);
   1719             v_dst_0 = vabsq_f32(vaddq_f32(v_dst_0, v_shift));
   1720 
   1721             v_quat = vmovl_u16(vget_high_u16(v_half));
   1722             float32x4_t v_dst_1 = vmulq_n_f32(vcvtq_f32_u32(v_quat), scale);
   1723             v_dst_1 = vabsq_f32(vaddq_f32(v_dst_1, v_shift));
   1724 
   1725             v_half = vmovl_u8(vget_high_u8(v_src));
   1726 
   1727             v_quat = vmovl_u16(vget_low_u16(v_half));
   1728             float32x4_t v_dst_2 = vmulq_n_f32(vcvtq_f32_u32(v_quat), scale);
   1729             v_dst_2 = vabsq_f32(vaddq_f32(v_dst_2, v_shift));
   1730 
   1731             v_quat = vmovl_u16(vget_high_u16(v_half));
   1732             float32x4_t v_dst_3 = vmulq_n_f32(vcvtq_f32_u32(v_quat), scale);
   1733             v_dst_3 = vabsq_f32(vaddq_f32(v_dst_3, v_shift));
   1734 
   1735             uint16x8_t v_dsti_0 = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst_0)),
   1736                 vqmovn_u32(cv_vrndq_u32_f32(v_dst_1)));
   1737             uint16x8_t v_dsti_1 = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst_2)),
   1738                 vqmovn_u32(cv_vrndq_u32_f32(v_dst_3)));
   1739 
   1740             vst1q_u8(dst + x, vcombine_u8(vqmovn_u16(v_dsti_0), vqmovn_u16(v_dsti_1)));
   1741         }
   1742 
   1743         return x;
   1744     }
   1745 };
   1746 
   1747 template <>
   1748 struct cvtScaleAbs_SIMD<schar, uchar, float>
   1749 {
   1750     int operator () (const schar * src, uchar * dst, int width,
   1751                      float scale, float shift) const
   1752     {
   1753         int x = 0;
   1754         float32x4_t v_shift = vdupq_n_f32(shift);
   1755 
   1756         for ( ; x <= width - 16; x += 16)
   1757         {
   1758             int8x16_t v_src = vld1q_s8(src + x);
   1759             int16x8_t v_half = vmovl_s8(vget_low_s8(v_src));
   1760 
   1761             int32x4_t v_quat = vmovl_s16(vget_low_s16(v_half));
   1762             float32x4_t v_dst_0 = vmulq_n_f32(vcvtq_f32_s32(v_quat), scale);
   1763             v_dst_0 = vabsq_f32(vaddq_f32(v_dst_0, v_shift));
   1764 
   1765             v_quat = vmovl_s16(vget_high_s16(v_half));
   1766             float32x4_t v_dst_1 = vmulq_n_f32(vcvtq_f32_s32(v_quat), scale);
   1767             v_dst_1 = vabsq_f32(vaddq_f32(v_dst_1, v_shift));
   1768 
   1769             v_half = vmovl_s8(vget_high_s8(v_src));
   1770 
   1771             v_quat = vmovl_s16(vget_low_s16(v_half));
   1772             float32x4_t v_dst_2 = vmulq_n_f32(vcvtq_f32_s32(v_quat), scale);
   1773             v_dst_2 = vabsq_f32(vaddq_f32(v_dst_2, v_shift));
   1774 
   1775             v_quat = vmovl_s16(vget_high_s16(v_half));
   1776             float32x4_t v_dst_3 = vmulq_n_f32(vcvtq_f32_s32(v_quat), scale);
   1777             v_dst_3 = vabsq_f32(vaddq_f32(v_dst_3, v_shift));
   1778 
   1779             uint16x8_t v_dsti_0 = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst_0)),
   1780                 vqmovn_u32(cv_vrndq_u32_f32(v_dst_1)));
   1781             uint16x8_t v_dsti_1 = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst_2)),
   1782                 vqmovn_u32(cv_vrndq_u32_f32(v_dst_3)));
   1783 
   1784             vst1q_u8(dst + x, vcombine_u8(vqmovn_u16(v_dsti_0), vqmovn_u16(v_dsti_1)));
   1785         }
   1786 
   1787         return x;
   1788     }
   1789 };
   1790 
   1791 template <>
   1792 struct cvtScaleAbs_SIMD<ushort, uchar, float>
   1793 {
   1794     int operator () (const ushort * src, uchar * dst, int width,
   1795                      float scale, float shift) const
   1796     {
   1797         int x = 0;
   1798         float32x4_t v_shift = vdupq_n_f32(shift);
   1799 
   1800         for ( ; x <= width - 8; x += 8)
   1801         {
   1802             uint16x8_t v_src = vld1q_u16(src + x);
   1803 
   1804             uint32x4_t v_half = vmovl_u16(vget_low_u16(v_src));
   1805             float32x4_t v_dst_0 = vmulq_n_f32(vcvtq_f32_u32(v_half), scale);
   1806             v_dst_0 = vabsq_f32(vaddq_f32(v_dst_0, v_shift));
   1807 
   1808             v_half = vmovl_u16(vget_high_u16(v_src));
   1809             float32x4_t v_dst_1 = vmulq_n_f32(vcvtq_f32_u32(v_half), scale);
   1810             v_dst_1 = vabsq_f32(vaddq_f32(v_dst_1, v_shift));
   1811 
   1812             uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst_0)),
   1813                 vqmovn_u32(cv_vrndq_u32_f32(v_dst_1)));
   1814 
   1815             vst1_u8(dst + x, vqmovn_u16(v_dst));
   1816         }
   1817 
   1818         return x;
   1819     }
   1820 };
   1821 
   1822 template <>
   1823 struct cvtScaleAbs_SIMD<short, uchar, float>
   1824 {
   1825     int operator () (const short * src, uchar * dst, int width,
   1826                      float scale, float shift) const
   1827     {
   1828         int x = 0;
   1829         float32x4_t v_shift = vdupq_n_f32(shift);
   1830 
   1831         for ( ; x <= width - 8; x += 8)
   1832         {
   1833             int16x8_t v_src = vld1q_s16(src + x);
   1834 
   1835             int32x4_t v_half = vmovl_s16(vget_low_s16(v_src));
   1836             float32x4_t v_dst_0 = vmulq_n_f32(vcvtq_f32_s32(v_half), scale);
   1837             v_dst_0 = vabsq_f32(vaddq_f32(v_dst_0, v_shift));
   1838 
   1839             v_half = vmovl_s16(vget_high_s16(v_src));
   1840             float32x4_t v_dst_1 = vmulq_n_f32(vcvtq_f32_s32(v_half), scale);
   1841             v_dst_1 = vabsq_f32(vaddq_f32(v_dst_1, v_shift));
   1842 
   1843             uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst_0)),
   1844                 vqmovn_u32(cv_vrndq_u32_f32(v_dst_1)));
   1845 
   1846             vst1_u8(dst + x, vqmovn_u16(v_dst));
   1847         }
   1848 
   1849         return x;
   1850     }
   1851 };
   1852 
   1853 template <>
   1854 struct cvtScaleAbs_SIMD<int, uchar, float>
   1855 {
   1856     int operator () (const int * src, uchar * dst, int width,
   1857                      float scale, float shift) const
   1858     {
   1859         int x = 0;
   1860         float32x4_t v_shift = vdupq_n_f32(shift);
   1861 
   1862         for ( ; x <= width - 8; x += 8)
   1863         {
   1864             float32x4_t v_dst_0 = vmulq_n_f32(vcvtq_f32_s32(vld1q_s32(src + x)), scale);
   1865             v_dst_0 = vabsq_f32(vaddq_f32(v_dst_0, v_shift));
   1866             uint16x4_t v_dsti_0 = vqmovn_u32(cv_vrndq_u32_f32(v_dst_0));
   1867 
   1868             float32x4_t v_dst_1 = vmulq_n_f32(vcvtq_f32_s32(vld1q_s32(src + x + 4)), scale);
   1869             v_dst_1 = vabsq_f32(vaddq_f32(v_dst_1, v_shift));
   1870             uint16x4_t v_dsti_1 = vqmovn_u32(cv_vrndq_u32_f32(v_dst_1));
   1871 
   1872             uint16x8_t v_dst = vcombine_u16(v_dsti_0, v_dsti_1);
   1873             vst1_u8(dst + x, vqmovn_u16(v_dst));
   1874         }
   1875 
   1876         return x;
   1877     }
   1878 };
   1879 
   1880 template <>
   1881 struct cvtScaleAbs_SIMD<float, uchar, float>
   1882 {
   1883     int operator () (const float * src, uchar * dst, int width,
   1884                      float scale, float shift) const
   1885     {
   1886         int x = 0;
   1887         float32x4_t v_shift = vdupq_n_f32(shift);
   1888 
   1889         for ( ; x <= width - 8; x += 8)
   1890         {
   1891             float32x4_t v_dst_0 = vmulq_n_f32(vld1q_f32(src + x), scale);
   1892             v_dst_0 = vabsq_f32(vaddq_f32(v_dst_0, v_shift));
   1893             uint16x4_t v_dsti_0 = vqmovn_u32(cv_vrndq_u32_f32(v_dst_0));
   1894 
   1895             float32x4_t v_dst_1 = vmulq_n_f32(vld1q_f32(src + x + 4), scale);
   1896             v_dst_1 = vabsq_f32(vaddq_f32(v_dst_1, v_shift));
   1897             uint16x4_t v_dsti_1 = vqmovn_u32(cv_vrndq_u32_f32(v_dst_1));
   1898 
   1899             uint16x8_t v_dst = vcombine_u16(v_dsti_0, v_dsti_1);
   1900             vst1_u8(dst + x, vqmovn_u16(v_dst));
   1901         }
   1902 
   1903         return x;
   1904     }
   1905 };
   1906 
   1907 #endif
   1908 
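        // Row-wise kernel for cv::convertScaleAbs(): dst = saturate_cast<DT>(|src*scale + shift|).
        // Each row is processed in three stages: the SIMD functor above, an optionally unrolled
        // scalar loop, and a plain scalar tail.
        //
        // Illustrative usage of the public API (user code), given a CV_8UC1 image `gray`
        // (cv::Laplacian lives in imgproc):
        //     cv::Mat lap, vis;
        //     cv::Laplacian(gray, lap, CV_16S);
        //     cv::convertScaleAbs(lap, vis);   // |lap| saturated back to CV_8U for display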
   1909 template<typename T, typename DT, typename WT> static void
   1910 cvtScaleAbs_( const T* src, size_t sstep,
   1911               DT* dst, size_t dstep, Size size,
   1912               WT scale, WT shift )
   1913 {
   1914     sstep /= sizeof(src[0]);
   1915     dstep /= sizeof(dst[0]);
   1916     cvtScaleAbs_SIMD<T, DT, WT> vop;
   1917 
   1918     for( ; size.height--; src += sstep, dst += dstep )
   1919     {
   1920         int x = vop(src, dst, size.width, scale, shift);
   1921 
   1922         #if CV_ENABLE_UNROLLED
   1923         for( ; x <= size.width - 4; x += 4 )
   1924         {
   1925             DT t0, t1;
   1926             t0 = saturate_cast<DT>(std::abs(src[x]*scale + shift));
   1927             t1 = saturate_cast<DT>(std::abs(src[x+1]*scale + shift));
   1928             dst[x] = t0; dst[x+1] = t1;
   1929             t0 = saturate_cast<DT>(std::abs(src[x+2]*scale + shift));
   1930             t1 = saturate_cast<DT>(std::abs(src[x+3]*scale + shift));
   1931             dst[x+2] = t0; dst[x+3] = t1;
   1932         }
   1933         #endif
   1934         for( ; x < size.width; x++ )
   1935             dst[x] = saturate_cast<DT>(std::abs(src[x]*scale + shift));
   1936     }
   1937 }
   1938 
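        // Same pattern for the scaled conversions used by Mat::convertTo(): the generic functor
        // is a no-op returning 0, and the specializations below vectorize
        // dst = saturate_cast<DT>(src*scale + shift) for individual (source, destination) pairs.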
   1939 template <typename T, typename DT, typename WT>
   1940 struct cvtScale_SIMD
   1941 {
   1942     int operator () (const T *, DT *, int, WT, WT) const
   1943     {
   1944         return 0;
   1945     }
   1946 };
   1947 
   1948 #if CV_SSE2
   1949 
   1950 // from uchar
   1951 
   1952 template <>
   1953 struct cvtScale_SIMD<uchar, uchar, float>
   1954 {
   1955     int operator () (const uchar * src, uchar * dst, int width, float scale, float shift) const
   1956     {
   1957         int x = 0;
   1958 
   1959         if (!USE_SSE2)
   1960             return x;
   1961 
   1962         __m128i v_zero = _mm_setzero_si128();
   1963         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
   1964 
   1965         for ( ; x <= width - 8; x += 8)
   1966         {
   1967             __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src + x)), v_zero);
   1968             __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero));
   1969             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
   1970 
   1971             v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero));
   1972             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
   1973 
   1974             __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
   1975                                             _mm_cvtps_epi32(v_dst_1));
   1976             _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst, v_zero));
   1977         }
   1978 
   1979         return x;
   1980     }
   1981 };
   1982 
   1983 template <>
   1984 struct cvtScale_SIMD<uchar, schar, float>
   1985 {
   1986     int operator () (const uchar * src, schar * dst, int width, float scale, float shift) const
   1987     {
   1988         int x = 0;
   1989 
   1990         if (!USE_SSE2)
   1991             return x;
   1992 
   1993         __m128i v_zero = _mm_setzero_si128();
   1994         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
   1995 
   1996         for ( ; x <= width - 8; x += 8)
   1997         {
   1998             __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src + x)), v_zero);
   1999             __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero));
   2000             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
   2001 
   2002             v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero));
   2003             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
   2004 
   2005             __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
   2006                                             _mm_cvtps_epi32(v_dst_1));
   2007             _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst, v_zero));
   2008         }
   2009 
   2010         return x;
   2011     }
   2012 };
   2013 
   2014 #if CV_SSE4_1
   2015 
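        // uchar -> ushort needs _mm_packus_epi32, which is SSE4.1-only, so this specialization
        // also checks checkHardwareSupport(CV_CPU_SSE4_1) at run time instead of relying on the
        // compile-time CV_SSE2/USE_SSE2 guard alone.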
   2016 template <>
   2017 struct cvtScale_SIMD<uchar, ushort, float>
   2018 {
   2019     cvtScale_SIMD()
   2020     {
   2021         haveSSE = checkHardwareSupport(CV_CPU_SSE4_1);
   2022     }
   2023 
   2024     int operator () (const uchar * src, ushort * dst, int width, float scale, float shift) const
   2025     {
   2026         int x = 0;
   2027 
   2028         if (!haveSSE)
   2029             return x;
   2030 
   2031         __m128i v_zero = _mm_setzero_si128();
   2032         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
   2033 
   2034         for ( ; x <= width - 8; x += 8)
   2035         {
   2036             __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src + x)), v_zero);
   2037             __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero));
   2038             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
   2039 
   2040             v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero));
   2041             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
   2042 
   2043             __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0),
   2044                                              _mm_cvtps_epi32(v_dst_1));
   2045             _mm_storeu_si128((__m128i *)(dst + x), v_dst);
   2046         }
   2047 
   2048         return x;
   2049     }
   2050 
   2051     bool haveSSE;
   2052 };
   2053 
   2054 #endif
   2055 
   2056 template <>
   2057 struct cvtScale_SIMD<uchar, short, float>
   2058 {
   2059     int operator () (const uchar * src, short * dst, int width, float scale, float shift) const
   2060     {
   2061         int x = 0;
   2062 
   2063         if (!USE_SSE2)
   2064             return x;
   2065 
   2066         __m128i v_zero = _mm_setzero_si128();
   2067         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
   2068 
   2069         for ( ; x <= width - 8; x += 8)
   2070         {
   2071             __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src + x)), v_zero);
   2072             __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero));
   2073             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
   2074 
   2075             v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero));
   2076             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
   2077 
   2078             __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
   2079                                             _mm_cvtps_epi32(v_dst_1));
   2080             _mm_storeu_si128((__m128i *)(dst + x), v_dst);
   2081         }
   2082 
   2083         return x;
   2084     }
   2085 };
   2086 
   2087 template <>
   2088 struct cvtScale_SIMD<uchar, int, float>
   2089 {
   2090     int operator () (const uchar * src, int * dst, int width, float scale, float shift) const
   2091     {
   2092         int x = 0;
   2093 
   2094         if (!USE_SSE2)
   2095             return x;
   2096 
   2097         __m128i v_zero = _mm_setzero_si128();
   2098         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
   2099 
   2100         for ( ; x <= width - 8; x += 8)
   2101         {
   2102             __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src + x)), v_zero);
   2103             __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero));
   2104             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
   2105 
   2106             v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero));
   2107             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
   2108 
   2109             _mm_storeu_si128((__m128i *)(dst + x), _mm_cvtps_epi32(v_dst_0));
   2110             _mm_storeu_si128((__m128i *)(dst + x + 4), _mm_cvtps_epi32(v_dst_1));
   2111         }
   2112 
   2113         return x;
   2114     }
   2115 };
   2116 
   2117 template <>
   2118 struct cvtScale_SIMD<uchar, float, float>
   2119 {
   2120     int operator () (const uchar * src, float * dst, int width, float scale, float shift) const
   2121     {
   2122         int x = 0;
   2123 
   2124         if (!USE_SSE2)
   2125             return x;
   2126 
   2127         __m128i v_zero = _mm_setzero_si128();
   2128         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
   2129 
   2130         for ( ; x <= width - 8; x += 8)
   2131         {
   2132             __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src + x)), v_zero);
   2133             __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero));
   2134             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
   2135 
   2136             v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero));
   2137             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
   2138 
   2139             _mm_storeu_ps(dst + x, v_dst_0);
   2140             _mm_storeu_ps(dst + x + 4, v_dst_1);
   2141         }
   2142 
   2143         return x;
   2144     }
   2145 };
   2146 
   2147 template <>
   2148 struct cvtScale_SIMD<uchar, double, double>
   2149 {
   2150     int operator () (const uchar * src, double * dst, int width, double scale, double shift) const
   2151     {
   2152         int x = 0;
   2153 
   2154         if (!USE_SSE2)
   2155             return x;
   2156 
   2157         __m128i v_zero = _mm_setzero_si128();
   2158         __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift);
   2159 
   2160         for ( ; x <= width - 8; x += 8)
   2161         {
   2162             __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src + x)), v_zero);
   2163 
   2164             __m128i v_src_s32 = _mm_unpacklo_epi16(v_src, v_zero);
   2165             __m128d v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src_s32), v_scale), v_shift);
   2166             __m128d v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(v_src_s32, 8)), v_scale), v_shift);
   2167             _mm_storeu_pd(dst + x, v_dst_0);
   2168             _mm_storeu_pd(dst + x + 2, v_dst_1);
   2169 
   2170             v_src_s32 = _mm_unpackhi_epi16(v_src, v_zero);
   2171             v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src_s32), v_scale), v_shift);
   2172             v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(v_src_s32, 8)), v_scale), v_shift);
   2173             _mm_storeu_pd(dst + x + 4, v_dst_0);
   2174             _mm_storeu_pd(dst + x + 6, v_dst_1);
   2175         }
   2176 
   2177         return x;
   2178     }
   2179 };
   2180 
   2181 // from schar
   2182 
   2183 template <>
   2184 struct cvtScale_SIMD<schar, uchar, float>
   2185 {
   2186     int operator () (const schar * src, uchar * dst, int width, float scale, float shift) const
   2187     {
   2188         int x = 0;
   2189 
   2190         if (!USE_SSE2)
   2191             return x;
   2192 
   2193         __m128i v_zero = _mm_setzero_si128();
   2194         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
   2195 
   2196         for ( ; x <= width - 8; x += 8)
   2197         {
   2198             __m128i v_src = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src + x))), 8);
   2199             __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16));
   2200             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
   2201 
   2202             v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16));
   2203             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
   2204 
   2205             __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
   2206                                             _mm_cvtps_epi32(v_dst_1));
   2207             _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst, v_zero));
   2208         }
   2209 
   2210         return x;
   2211     }
   2212 };
   2213 
   2214 template <>
   2215 struct cvtScale_SIMD<schar, schar, float>
   2216 {
   2217     int operator () (const schar * src, schar * dst, int width, float scale, float shift) const
   2218     {
   2219         int x = 0;
   2220 
   2221         if (!USE_SSE2)
   2222             return x;
   2223 
   2224         __m128i v_zero = _mm_setzero_si128();
   2225         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
   2226 
   2227         for ( ; x <= width - 8; x += 8)
   2228         {
   2229             __m128i v_src = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src + x))), 8);
   2230             __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16));
   2231             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
   2232 
   2233             v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16));
   2234             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
   2235 
   2236             __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
   2237                                             _mm_cvtps_epi32(v_dst_1));
   2238             _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst, v_zero));
   2239         }
   2240 
   2241         return x;
   2242     }
   2243 };
   2244 
   2245 #if CV_SSE4_1
   2246 
   2247 template <>
   2248 struct cvtScale_SIMD<schar, ushort, float>
   2249 {
   2250     cvtScale_SIMD()
   2251     {
   2252         haveSSE = checkHardwareSupport(CV_CPU_SSE4_1);
   2253     }
   2254 
   2255     int operator () (const schar * src, ushort * dst, int width, float scale, float shift) const
   2256     {
   2257         int x = 0;
   2258 
   2259         if (!haveSSE)
   2260             return x;
   2261 
   2262         __m128i v_zero = _mm_setzero_si128();
   2263         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
   2264 
   2265         for ( ; x <= width - 8; x += 8)
   2266         {
   2267             __m128i v_src = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src + x))), 8);
   2268             __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16));
   2269             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
   2270 
   2271             v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16));
   2272             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
   2273 
   2274             __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0),
   2275                                              _mm_cvtps_epi32(v_dst_1));
   2276             _mm_storeu_si128((__m128i *)(dst + x), v_dst);
   2277         }
   2278 
   2279         return x;
   2280     }
   2281 
   2282     bool haveSSE;
   2283 };
   2284 
   2285 #endif
   2286 
   2287 template <>
   2288 struct cvtScale_SIMD<schar, short, float>
   2289 {
   2290     int operator () (const schar * src, short * dst, int width, float scale, float shift) const
   2291     {
   2292         int x = 0;
   2293 
   2294         if (!USE_SSE2)
   2295             return x;
   2296 
   2297         __m128i v_zero = _mm_setzero_si128();
   2298         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
   2299 
   2300         for ( ; x <= width - 8; x += 8)
   2301         {
   2302             __m128i v_src = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src + x))), 8);
   2303             __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16));
   2304             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
   2305 
   2306             v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16));
   2307             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
   2308 
   2309             __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
   2310                                             _mm_cvtps_epi32(v_dst_1));
   2311             _mm_storeu_si128((__m128i *)(dst + x), v_dst);
   2312         }
   2313 
   2314         return x;
   2315     }
   2316 };
   2317 
   2318 template <>
   2319 struct cvtScale_SIMD<schar, int, float>
   2320 {
   2321     int operator () (const schar * src, int * dst, int width, float scale, float shift) const
   2322     {
   2323         int x = 0;
   2324 
   2325         if (!USE_SSE2)
   2326             return x;
   2327 
   2328         __m128i v_zero = _mm_setzero_si128();
   2329         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
   2330 
   2331         for ( ; x <= width - 8; x += 8)
   2332         {
   2333             __m128i v_src = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src + x))), 8);
   2334             __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16));
   2335             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
   2336 
   2337             v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16));
   2338             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
   2339 
   2340             _mm_storeu_si128((__m128i *)(dst + x), _mm_cvtps_epi32(v_dst_0));
   2341             _mm_storeu_si128((__m128i *)(dst + x + 4), _mm_cvtps_epi32(v_dst_1));
   2342         }
   2343 
   2344         return x;
   2345     }
   2346 };
   2347 
   2348 template <>
   2349 struct cvtScale_SIMD<schar, float, float>
   2350 {
   2351     int operator () (const schar * src, float * dst, int width, float scale, float shift) const
   2352     {
   2353         int x = 0;
   2354 
   2355         if (!USE_SSE2)
   2356             return x;
   2357 
   2358         __m128i v_zero = _mm_setzero_si128();
   2359         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
   2360 
   2361         for ( ; x <= width - 8; x += 8)
   2362         {
   2363             __m128i v_src = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src + x))), 8);
   2364             __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16));
   2365             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
   2366 
   2367             v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16));
   2368             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
   2369 
   2370             _mm_storeu_ps(dst + x, v_dst_0);
   2371             _mm_storeu_ps(dst + x + 4, v_dst_1);
   2372         }
   2373 
   2374         return x;
   2375     }
   2376 };
   2377 
   2378 template <>
   2379 struct cvtScale_SIMD<schar, double, double>
   2380 {
   2381     int operator () (const schar * src, double * dst, int width, double scale, double shift) const
   2382     {
   2383         int x = 0;
   2384 
   2385         if (!USE_SSE2)
   2386             return x;
   2387 
   2388         __m128i v_zero = _mm_setzero_si128();
   2389         __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift);
   2390 
   2391         for ( ; x <= width - 8; x += 8)
   2392         {
   2393             __m128i v_src = _mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src + x)));
   2394             v_src = _mm_srai_epi16(v_src, 8);
   2395 
   2396             __m128i v_src_s32 = _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16);
   2397             __m128d v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src_s32), v_scale), v_shift);
   2398             __m128d v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(v_src_s32, 8)), v_scale), v_shift);
   2399             _mm_storeu_pd(dst + x, v_dst_0);
   2400             _mm_storeu_pd(dst + x + 2, v_dst_1);
   2401 
   2402             v_src_s32 = _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16);
   2403             v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src_s32), v_scale), v_shift);
   2404             v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(v_src_s32, 8)), v_scale), v_shift);
   2405             _mm_storeu_pd(dst + x + 4, v_dst_0);
   2406             _mm_storeu_pd(dst + x + 6, v_dst_1);
   2407         }
   2408 
   2409         return x;
   2410     }
   2411 };
   2412 
   2413 // from ushort
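// (unsigned 16-bit sources are zero-extended to 32 bits by interleaving with
// v_zero via _mm_unpacklo_epi16 / _mm_unpackhi_epi16 before _mm_cvtepi32_ps)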
   2414 
   2415 template <>
   2416 struct cvtScale_SIMD<ushort, uchar, float>
   2417 {
   2418     int operator () (const ushort * src, uchar * dst, int width, float scale, float shift) const
   2419     {
   2420         int x = 0;
   2421 
   2422         if (!USE_SSE2)
   2423             return x;
   2424 
   2425         __m128i v_zero = _mm_setzero_si128();
   2426         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
   2427 
   2428         for ( ; x <= width - 8; x += 8)
   2429         {
   2430             __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
   2431             __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero));
   2432             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
   2433 
   2434             v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero));
   2435             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
   2436 
   2437             __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
   2438                                             _mm_cvtps_epi32(v_dst_1));
   2439             _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst, v_zero));
   2440         }
   2441 
   2442         return x;
   2443     }
   2444 };
   2445 
   2446 template <>
   2447 struct cvtScale_SIMD<ushort, schar, float>
   2448 {
   2449     int operator () (const ushort * src, schar * dst, int width, float scale, float shift) const
   2450     {
   2451         int x = 0;
   2452 
   2453         if (!USE_SSE2)
   2454             return x;
   2455 
   2456         __m128i v_zero = _mm_setzero_si128();
   2457         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
   2458 
   2459         for ( ; x <= width - 8; x += 8)
   2460         {
   2461             __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
   2462             __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero));
   2463             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
   2464 
   2465             v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero));
   2466             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
   2467 
   2468             __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
   2469                                             _mm_cvtps_epi32(v_dst_1));
   2470             _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst, v_zero));
   2471         }
   2472 
   2473         return x;
   2474     }
   2475 };
   2476 
   2477 #if CV_SSE4_1
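// _mm_packus_epi32 (signed 32-bit -> unsigned 16-bit saturating pack) is an
// SSE4.1 instruction, so kernels that store to ushort are compiled under this
// guard and additionally check CV_CPU_SSE4_1 at run time, returning 0 when it
// is unavailable (presumably leaving all the work to the scalar fallback).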
   2478 
   2479 template <>
   2480 struct cvtScale_SIMD<ushort, ushort, float>
   2481 {
   2482     cvtScale_SIMD()
   2483     {
   2484         haveSSE = checkHardwareSupport(CV_CPU_SSE4_1);
   2485     }
   2486 
   2487     int operator () (const ushort * src, ushort * dst, int width, float scale, float shift) const
   2488     {
   2489         int x = 0;
   2490 
   2491         if (!haveSSE)
   2492             return x;
   2493 
   2494         __m128i v_zero = _mm_setzero_si128();
   2495         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
   2496 
   2497         for ( ; x <= width - 8; x += 8)
   2498         {
   2499             __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
   2500             __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero));
   2501             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
   2502 
   2503             v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero));
   2504             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
   2505 
   2506             __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0),
   2507                                              _mm_cvtps_epi32(v_dst_1));
   2508             _mm_storeu_si128((__m128i *)(dst + x), v_dst);
   2509         }
   2510 
   2511         return x;
   2512     }
   2513 
   2514     bool haveSSE;
   2515 };
   2516 
   2517 #endif
   2518 
   2519 template <>
   2520 struct cvtScale_SIMD<ushort, short, float>
   2521 {
   2522     int operator () (const ushort * src, short * dst, int width, float scale, float shift) const
   2523     {
   2524         int x = 0;
   2525 
   2526         if (!USE_SSE2)
   2527             return x;
   2528 
   2529         __m128i v_zero = _mm_setzero_si128();
   2530         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
   2531 
   2532         for ( ; x <= width - 8; x += 8)
   2533         {
   2534             __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
   2535             __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero));
   2536             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
   2537 
   2538             v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero));
   2539             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
   2540 
   2541             __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
   2542                                             _mm_cvtps_epi32(v_dst_1));
   2543             _mm_storeu_si128((__m128i *)(dst + x), v_dst);
   2544         }
   2545 
   2546         return x;
   2547     }
   2548 };
   2549 
   2550 template <>
   2551 struct cvtScale_SIMD<ushort, int, float>
   2552 {
   2553     int operator () (const ushort * src, int * dst, int width, float scale, float shift) const
   2554     {
   2555         int x = 0;
   2556 
   2557         if (!USE_SSE2)
   2558             return x;
   2559 
   2560         __m128i v_zero = _mm_setzero_si128();
   2561         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
   2562 
   2563         for ( ; x <= width - 8; x += 8)
   2564         {
   2565             __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
   2566             __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero));
   2567             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
   2568 
   2569             v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero));
   2570             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
   2571 
   2572             _mm_storeu_si128((__m128i *)(dst + x), _mm_cvtps_epi32(v_dst_0));
   2573             _mm_storeu_si128((__m128i *)(dst + x + 4), _mm_cvtps_epi32(v_dst_1));
   2574         }
   2575 
   2576         return x;
   2577     }
   2578 };
   2579 
   2580 template <>
   2581 struct cvtScale_SIMD<ushort, float, float>
   2582 {
   2583     int operator () (const ushort * src, float * dst, int width, float scale, float shift) const
   2584     {
   2585         int x = 0;
   2586 
   2587         if (!USE_SSE2)
   2588             return x;
   2589 
   2590         __m128i v_zero = _mm_setzero_si128();
   2591         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
   2592 
   2593         for ( ; x <= width - 8; x += 8)
   2594         {
   2595             __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
   2596             __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero));
   2597             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
   2598 
   2599             v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero));
   2600             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
   2601 
   2602             _mm_storeu_ps(dst + x, v_dst_0);
   2603             _mm_storeu_ps(dst + x + 4, v_dst_1);
   2604         }
   2605 
   2606         return x;
   2607     }
   2608 };
   2609 
   2610 template <>
   2611 struct cvtScale_SIMD<ushort, double, double>
   2612 {
   2613     int operator () (const ushort * src, double * dst, int width, double scale, double shift) const
   2614     {
   2615         int x = 0;
   2616 
   2617         if (!USE_SSE2)
   2618             return x;
   2619 
   2620         __m128i v_zero = _mm_setzero_si128();
   2621         __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift);
   2622 
   2623         for ( ; x <= width - 8; x += 8)
   2624         {
   2625             __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
   2626 
   2627             __m128i v_src_s32 = _mm_unpacklo_epi16(v_src, v_zero);
   2628             __m128d v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src_s32), v_scale), v_shift);
   2629             __m128d v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(v_src_s32, 8)), v_scale), v_shift);
   2630             _mm_storeu_pd(dst + x, v_dst_0);
   2631             _mm_storeu_pd(dst + x + 2, v_dst_1);
   2632 
   2633             v_src_s32 = _mm_unpackhi_epi16(v_src, v_zero);
   2634             v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src_s32), v_scale), v_shift);
   2635             v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(v_src_s32, 8)), v_scale), v_shift);
   2636             _mm_storeu_pd(dst + x + 4, v_dst_0);
   2637             _mm_storeu_pd(dst + x + 6, v_dst_1);
   2638         }
   2639 
   2640         return x;
   2641     }
   2642 };
   2643 
   2644 // from short
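// (signed 16-bit sources are widened to 32 bits with the same zero-interleave
// plus _mm_srai_epi32(..., 16) sign-extension trick used for schar above)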
   2645 
   2646 template <>
   2647 struct cvtScale_SIMD<short, uchar, float>
   2648 {
   2649     int operator () (const short * src, uchar * dst, int width, float scale, float shift) const
   2650     {
   2651         int x = 0;
   2652 
   2653         if (!USE_SSE2)
   2654             return x;
   2655 
   2656         __m128i v_zero = _mm_setzero_si128();
   2657         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
   2658 
   2659         for ( ; x <= width - 8; x += 8)
   2660         {
   2661             __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
   2662             __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16));
   2663             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
   2664 
   2665             v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16));
   2666             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
   2667 
   2668             __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
   2669                                             _mm_cvtps_epi32(v_dst_1));
   2670             _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst, v_zero));
   2671         }
   2672 
   2673         return x;
   2674     }
   2675 };
   2676 
   2677 template <>
   2678 struct cvtScale_SIMD<short, schar, float>
   2679 {
   2680     int operator () (const short * src, schar * dst, int width, float scale, float shift) const
   2681     {
   2682         int x = 0;
   2683 
   2684         if (!USE_SSE2)
   2685             return x;
   2686 
   2687         __m128i v_zero = _mm_setzero_si128();
   2688         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
   2689 
   2690         for ( ; x <= width - 8; x += 8)
   2691         {
   2692             __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
   2693             __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16));
   2694             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
   2695 
   2696             v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16));
   2697             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
   2698 
   2699             __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
   2700                                             _mm_cvtps_epi32(v_dst_1));
   2701             _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst, v_zero));
   2702         }
   2703 
   2704         return x;
   2705     }
   2706 };
   2707 
   2708 #if CV_SSE4_1
   2709 
   2710 template <>
   2711 struct cvtScale_SIMD<short, ushort, float>
   2712 {
   2713     cvtScale_SIMD()
   2714     {
   2715         haveSSE = checkHardwareSupport(CV_CPU_SSE4_1);
   2716     }
   2717 
   2718     int operator () (const short * src, ushort * dst, int width, float scale, float shift) const
   2719     {
   2720         int x = 0;
   2721 
   2722         if (!haveSSE)
   2723             return x;
   2724 
   2725         __m128i v_zero = _mm_setzero_si128();
   2726         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
   2727 
   2728         for ( ; x <= width - 8; x += 8)
   2729         {
   2730             __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
   2731             __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16));
   2732             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
   2733 
   2734             v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16));
   2735             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
   2736 
   2737             __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0),
   2738                                              _mm_cvtps_epi32(v_dst_1));
   2739             _mm_storeu_si128((__m128i *)(dst + x), v_dst);
   2740         }
   2741 
   2742         return x;
   2743     }
   2744 
   2745     bool haveSSE;
   2746 };
   2747 
   2748 #endif
   2749 
   2750 template <>
   2751 struct cvtScale_SIMD<short, short, float>
   2752 {
   2753     int operator () (const short * src, short * dst, int width, float scale, float shift) const
   2754     {
   2755         int x = 0;
   2756 
   2757         if (!USE_SSE2)
   2758             return x;
   2759 
   2760         __m128i v_zero = _mm_setzero_si128();
   2761         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
   2762 
   2763         for ( ; x <= width - 8; x += 8)
   2764         {
   2765             __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
   2766             __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16));
   2767             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
   2768 
   2769             v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16));
   2770             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
   2771 
   2772             __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
   2773                                             _mm_cvtps_epi32(v_dst_1));
   2774             _mm_storeu_si128((__m128i *)(dst + x), v_dst);
   2775         }
   2776 
   2777         return x;
   2778     }
   2779 };
   2780 
   2781 template <>
   2782 struct cvtScale_SIMD<short, int, float>
   2783 {
   2784     int operator () (const short * src, int * dst, int width, float scale, float shift) const
   2785     {
   2786         int x = 0;
   2787 
   2788         if (!USE_SSE2)
   2789             return x;
   2790 
   2791         __m128i v_zero = _mm_setzero_si128();
   2792         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
   2793 
   2794         for ( ; x <= width - 8; x += 8)
   2795         {
   2796             __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
   2797             __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16));
   2798             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
   2799 
   2800             v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16));
   2801             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
   2802 
   2803             _mm_storeu_si128((__m128i *)(dst + x), _mm_cvtps_epi32(v_dst_0));
   2804             _mm_storeu_si128((__m128i *)(dst + x + 4), _mm_cvtps_epi32(v_dst_1));
   2805         }
   2806 
   2807         return x;
   2808     }
   2809 };
   2810 
   2811 template <>
   2812 struct cvtScale_SIMD<short, float, float>
   2813 {
   2814     int operator () (const short * src, float * dst, int width, float scale, float shift) const
   2815     {
   2816         int x = 0;
   2817 
   2818         if (!USE_SSE2)
   2819             return x;
   2820 
   2821         __m128i v_zero = _mm_setzero_si128();
   2822         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
   2823 
   2824         for ( ; x <= width - 8; x += 8)
   2825         {
   2826             __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
   2827             __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16));
   2828             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
   2829 
   2830             v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16));
   2831             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
   2832 
   2833             _mm_storeu_ps(dst + x, v_dst_0);
   2834             _mm_storeu_ps(dst + x + 4, v_dst_1);
   2835         }
   2836 
   2837         return x;
   2838     }
   2839 };
   2840 
   2841 template <>
   2842 struct cvtScale_SIMD<short, double, double>
   2843 {
   2844     int operator () (const short * src, double * dst, int width, double scale, double shift) const
   2845     {
   2846         int x = 0;
   2847 
   2848         if (!USE_SSE2)
   2849             return x;
   2850 
   2851         __m128i v_zero = _mm_setzero_si128();
   2852         __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift);
   2853 
   2854         for ( ; x <= width - 8; x += 8)
   2855         {
   2856             __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
   2857 
   2858             __m128i v_src_s32 = _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16);
   2859             __m128d v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src_s32), v_scale), v_shift);
   2860             __m128d v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(v_src_s32, 8)), v_scale), v_shift);
   2861             _mm_storeu_pd(dst + x, v_dst_0);
   2862             _mm_storeu_pd(dst + x + 2, v_dst_1);
   2863 
   2864             v_src_s32 = _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16);
   2865             v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src_s32), v_scale), v_shift);
   2866             v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(v_src_s32, 8)), v_scale), v_shift);
   2867             _mm_storeu_pd(dst + x + 4, v_dst_0);
   2868             _mm_storeu_pd(dst + x + 6, v_dst_1);
   2869         }
   2870 
   2871         return x;
   2872     }
   2873 };
   2874 
   2875 // from int
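// (32-bit integer sources convert directly with _mm_cvtepi32_ps; the
// double-precision kernels use _mm_cvtepi32_pd, which consumes only the two
// low lanes, hence the extra _mm_srli_si128 shuffles)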
   2876 
   2877 template <>
   2878 struct cvtScale_SIMD<int, uchar, float>
   2879 {
   2880     int operator () (const int * src, uchar * dst, int width, float scale, float shift) const
   2881     {
   2882         int x = 0;
   2883 
   2884         if (!USE_SSE2)
   2885             return x;
   2886 
   2887         __m128i v_zero = _mm_setzero_si128();
   2888         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
   2889 
   2890         for ( ; x <= width - 8; x += 8)
   2891         {
   2892             __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
   2893             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift);
   2894 
   2895             v_src = _mm_loadu_si128((__m128i const *)(src + x + 4));
   2896             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift);
   2897 
   2898             __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
   2899                                             _mm_cvtps_epi32(v_dst_1));
   2900             _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst, v_zero));
   2901         }
   2902 
   2903         return x;
   2904     }
   2905 };
   2906 
   2907 template <>
   2908 struct cvtScale_SIMD<int, schar, float>
   2909 {
   2910     int operator () (const int * src, schar * dst, int width, float scale, float shift) const
   2911     {
   2912         int x = 0;
   2913 
   2914         if (!USE_SSE2)
   2915             return x;
   2916 
   2917         __m128i v_zero = _mm_setzero_si128();
   2918         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
   2919 
   2920         for ( ; x <= width - 8; x += 8)
   2921         {
   2922             __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
   2923             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift);
   2924 
   2925             v_src = _mm_loadu_si128((__m128i const *)(src + x + 4));
   2926             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift);
   2927 
   2928             __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
   2929                                             _mm_cvtps_epi32(v_dst_1));
   2930             _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst, v_zero));
   2931         }
   2932 
   2933         return x;
   2934     }
   2935 };
   2936 
   2937 #if CV_SSE4_1
   2938 
   2939 template <>
   2940 struct cvtScale_SIMD<int, ushort, float>
   2941 {
   2942     cvtScale_SIMD()
   2943     {
   2944         haveSSE = checkHardwareSupport(CV_CPU_SSE4_1);
   2945     }
   2946 
   2947     int operator () (const int * src, ushort * dst, int width, float scale, float shift) const
   2948     {
   2949         int x = 0;
   2950 
   2951         if (!haveSSE)
   2952             return x;
   2953 
   2954         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
   2955 
   2956         for ( ; x <= width - 8; x += 8)
   2957         {
   2958             __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
   2959             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift);
   2960 
   2961             v_src = _mm_loadu_si128((__m128i const *)(src + x + 4));
   2962             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift);
   2963 
   2964             __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0),
   2965                                              _mm_cvtps_epi32(v_dst_1));
   2966             _mm_storeu_si128((__m128i *)(dst + x), v_dst);
   2967         }
   2968 
   2969         return x;
   2970     }
   2971 
   2972     bool haveSSE;
   2973 };
   2974 
   2975 #endif
   2976 
   2977 template <>
   2978 struct cvtScale_SIMD<int, short, float>
   2979 {
   2980     int operator () (const int * src, short * dst, int width, float scale, float shift) const
   2981     {
   2982         int x = 0;
   2983 
   2984         if (!USE_SSE2)
   2985             return x;
   2986 
   2987         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
   2988 
   2989         for ( ; x <= width - 8; x += 8)
   2990         {
   2991             __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
   2992             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift);
   2993 
   2994             v_src = _mm_loadu_si128((__m128i const *)(src + x + 4));
   2995             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift);
   2996 
   2997             __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
   2998                                             _mm_cvtps_epi32(v_dst_1));
   2999             _mm_storeu_si128((__m128i *)(dst + x), v_dst);
   3000         }
   3001 
   3002         return x;
   3003     }
   3004 };
   3005 
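// With a double working type the rounded results come back from
// _mm_cvtpd_epi32 as two ints in the low half of a register; two such halves
// are recombined into one 4-int vector with _mm_movelh_ps through casts.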
   3006 template <>
   3007 struct cvtScale_SIMD<int, int, double>
   3008 {
   3009     int operator () (const int * src, int * dst, int width, double scale, double shift) const
   3010     {
   3011         int x = 0;
   3012 
   3013         if (!USE_SSE2)
   3014             return x;
   3015 
   3016         __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift);
   3017 
   3018         for ( ; x <= width - 4; x += 4)
   3019         {
   3020             __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
   3021             __m128d v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src), v_scale), v_shift);
   3022 
   3023             v_src = _mm_srli_si128(v_src, 8);
   3024             __m128d v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src), v_scale), v_shift);
   3025 
   3026             __m128 v_dst = _mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_dst_0)),
   3027                                          _mm_castsi128_ps(_mm_cvtpd_epi32(v_dst_1)));
   3028 
   3029             _mm_storeu_si128((__m128i *)(dst + x), _mm_castps_si128(v_dst));
   3030         }
   3031 
   3032         return x;
   3033     }
   3034 };
   3035 
   3036 template <>
   3037 struct cvtScale_SIMD<int, float, double>
   3038 {
   3039     int operator () (const int * src, float * dst, int width, double scale, double shift) const
   3040     {
   3041         int x = 0;
   3042 
   3043         if (!USE_SSE2)
   3044             return x;
   3045 
   3046         __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift);
   3047 
   3048         for ( ; x <= width - 4; x += 4)
   3049         {
   3050             __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
   3051             __m128d v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src), v_scale), v_shift);
   3052 
   3053             v_src = _mm_srli_si128(v_src, 8);
   3054             __m128d v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src), v_scale), v_shift);
   3055 
   3056             _mm_storeu_ps(dst + x, _mm_movelh_ps(_mm_cvtpd_ps(v_dst_0),
   3057                                                  _mm_cvtpd_ps(v_dst_1)));
   3058         }
   3059 
   3060         return x;
   3061     }
   3062 };
   3063 
   3064 template <>
   3065 struct cvtScale_SIMD<int, double, double>
   3066 {
   3067     int operator () (const int * src, double * dst, int width, double scale, double shift) const
   3068     {
   3069         int x = 0;
   3070 
   3071         if (!USE_SSE2)
   3072             return x;
   3073 
   3074         __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift);
   3075 
   3076         for ( ; x <= width - 4; x += 4)
   3077         {
   3078             __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
   3079             __m128d v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src), v_scale), v_shift);
   3080 
   3081             v_src = _mm_srli_si128(v_src, 8);
   3082             __m128d v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src), v_scale), v_shift);
   3083 
   3084             _mm_storeu_pd(dst + x, v_dst_0);
   3085             _mm_storeu_pd(dst + x + 2, v_dst_1);
   3086         }
   3087 
   3088         return x;
   3089     }
   3090 };
   3091 
   3092 // from float
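// (float sources need no widening: load with _mm_loadu_ps and apply a single
// multiply-add per four elements)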
   3093 
   3094 template <>
   3095 struct cvtScale_SIMD<float, uchar, float>
   3096 {
   3097     int operator () (const float * src, uchar * dst, int width, float scale, float shift) const
   3098     {
   3099         int x = 0;
   3100 
   3101         if (!USE_SSE2)
   3102             return x;
   3103 
   3104         __m128i v_zero = _mm_setzero_si128();
   3105         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
   3106 
   3107         for ( ; x <= width - 8; x += 8)
   3108         {
   3109             __m128 v_src = _mm_loadu_ps(src + x);
   3110             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
   3111 
   3112             v_src = _mm_loadu_ps(src + x + 4);
   3113             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
   3114 
   3115             __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
   3116                                             _mm_cvtps_epi32(v_dst_1));
   3117             _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst, v_zero));
   3118         }
   3119 
   3120         return x;
   3121     }
   3122 };
   3123 
   3124 template <>
   3125 struct cvtScale_SIMD<float, schar, float>
   3126 {
   3127     int operator () (const float * src, schar * dst, int width, float scale, float shift) const
   3128     {
   3129         int x = 0;
   3130 
   3131         if (!USE_SSE2)
   3132             return x;
   3133 
   3134         __m128i v_zero = _mm_setzero_si128();
   3135         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
   3136 
   3137         for ( ; x <= width - 8; x += 8)
   3138         {
   3139             __m128 v_src = _mm_loadu_ps(src + x);
   3140             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
   3141 
   3142             v_src = _mm_loadu_ps(src + x + 4);
   3143             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
   3144 
   3145             __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
   3146                                             _mm_cvtps_epi32(v_dst_1));
   3147             _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst, v_zero));
   3148         }
   3149 
   3150         return x;
   3151     }
   3152 };
   3153 
   3154 #if CV_SSE4_1
   3155 
   3156 template <>
   3157 struct cvtScale_SIMD<float, ushort, float>
   3158 {
   3159     cvtScale_SIMD()
   3160     {
   3161         haveSSE = checkHardwareSupport(CV_CPU_SSE4_1);
   3162     }
   3163 
   3164     int operator () (const float * src, ushort * dst, int width, float scale, float shift) const
   3165     {
   3166         int x = 0;
   3167 
   3168         if (!haveSSE)
   3169             return x;
   3170 
   3171         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
   3172 
   3173         for ( ; x <= width - 8; x += 8)
   3174         {
   3175             __m128 v_src = _mm_loadu_ps(src + x);
   3176             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
   3177 
   3178             v_src = _mm_loadu_ps(src + x + 4);
   3179             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
   3180 
   3181             __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0),
   3182                                              _mm_cvtps_epi32(v_dst_1));
   3183             _mm_storeu_si128((__m128i *)(dst + x), v_dst);
   3184         }
   3185 
   3186         return x;
   3187     }
   3188 
   3189     bool haveSSE;
   3190 };
   3191 
   3192 #endif
   3193 
   3194 template <>
   3195 struct cvtScale_SIMD<float, short, float>
   3196 {
   3197     int operator () (const float * src, short * dst, int width, float scale, float shift) const
   3198     {
   3199         int x = 0;
   3200 
   3201         if (!USE_SSE2)
   3202             return x;
   3203 
   3204         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
   3205 
   3206         for ( ; x <= width - 8; x += 8)
   3207         {
   3208             __m128 v_src = _mm_loadu_ps(src + x);
   3209             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
   3210 
   3211             v_src = _mm_loadu_ps(src + x + 4);
   3212             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
   3213 
   3214             __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
   3215                                             _mm_cvtps_epi32(v_dst_1));
   3216             _mm_storeu_si128((__m128i *)(dst + x), v_dst);
   3217         }
   3218 
   3219         return x;
   3220     }
   3221 };
   3222 
   3223 template <>
   3224 struct cvtScale_SIMD<float, int, float>
   3225 {
   3226     int operator () (const float * src, int * dst, int width, float scale, float shift) const
   3227     {
   3228         int x = 0;
   3229 
   3230         if (!USE_SSE2)
   3231             return x;
   3232 
   3233         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
   3234 
   3235         for ( ; x <= width - 8; x += 8)
   3236         {
   3237             __m128 v_src = _mm_loadu_ps(src + x);
   3238             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
   3239 
   3240             v_src = _mm_loadu_ps(src + x + 4);
   3241             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
   3242 
   3243             _mm_storeu_si128((__m128i *)(dst + x), _mm_cvtps_epi32(v_dst_0));
   3244             _mm_storeu_si128((__m128i *)(dst + x + 4), _mm_cvtps_epi32(v_dst_1));
   3245         }
   3246 
   3247         return x;
   3248     }
   3249 };
   3250 
   3251 template <>
   3252 struct cvtScale_SIMD<float, float, float>
   3253 {
   3254     int operator () (const float * src, float * dst, int width, float scale, float shift) const
   3255     {
   3256         int x = 0;
   3257 
   3258         if (!USE_SSE2)
   3259             return x;
   3260 
   3261         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
   3262 
   3263         for ( ; x <= width - 4; x += 4)
   3264         {
   3265             __m128 v_src = _mm_loadu_ps(src + x);
   3266             __m128 v_dst = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
   3267             _mm_storeu_ps(dst + x, v_dst);
   3268         }
   3269 
   3270         return x;
   3271     }
   3272 };
   3273 
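// _mm_cvtps_pd promotes only the two low floats, so the high pair is shifted
// down by 8 bytes (_mm_srli_si128 on the reinterpreted register) before the
// second conversion.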
   3274 template <>
   3275 struct cvtScale_SIMD<float, double, double>
   3276 {
   3277     int operator () (const float * src, double * dst, int width, double scale, double shift) const
   3278     {
   3279         int x = 0;
   3280 
   3281         if (!USE_SSE2)
   3282             return x;
   3283 
   3284         __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift);
   3285 
   3286         for ( ; x <= width - 4; x += 4)
   3287         {
   3288             __m128 v_src = _mm_loadu_ps(src + x);
   3289             __m128d v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtps_pd(v_src), v_scale), v_shift);
   3290             v_src = _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_src), 8));
   3291             __m128d v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtps_pd(v_src), v_scale), v_shift);
   3292 
   3293             _mm_storeu_pd(dst + x, v_dst_0);
   3294             _mm_storeu_pd(dst + x + 2, v_dst_1);
   3295         }
   3296 
   3297         return x;
   3298     }
   3299 };
   3300 
   3301 // from double
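// (pairs of doubles are narrowed with _mm_cvtpd_ps and glued together with
// _mm_movelh_ps; where the working type stays double, the kernels step by
// 4 or 2 elements instead of 8)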
   3302 
   3303 template <>
   3304 struct cvtScale_SIMD<double, uchar, float>
   3305 {
   3306     int operator () (const double * src, uchar * dst, int width, float scale, float shift) const
   3307     {
   3308         int x = 0;
   3309 
   3310         if (!USE_SSE2)
   3311             return x;
   3312 
   3313         __m128i v_zero = _mm_setzero_si128();
   3314         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
   3315 
   3316         for ( ; x <= width - 8; x += 8)
   3317         {
   3318             __m128 v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x)),
   3319                                          _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2)));
   3320             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
   3321 
   3322             v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x + 4)),
   3323                                   _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6)));
   3324             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
   3325 
   3326             __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
   3327                                             _mm_cvtps_epi32(v_dst_1));
   3328             _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst, v_zero));
   3329         }
   3330 
   3331         return x;
   3332     }
   3333 };
   3334 
   3335 template <>
   3336 struct cvtScale_SIMD<double, schar, float>
   3337 {
   3338     int operator () (const double * src, schar * dst, int width, float scale, float shift) const
   3339     {
   3340         int x = 0;
   3341 
   3342         if (!USE_SSE2)
   3343             return x;
   3344 
   3345         __m128i v_zero = _mm_setzero_si128();
   3346         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
   3347 
   3348         for ( ; x <= width - 8; x += 8)
   3349         {
   3350             __m128 v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x)),
   3351                                          _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2)));
   3352             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
   3353 
   3354             v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x + 4)),
   3355                                   _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6)));
   3356             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
   3357 
   3358             __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
   3359                                             _mm_cvtps_epi32(v_dst_1));
   3360             _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst, v_zero));
   3361         }
   3362 
   3363         return x;
   3364     }
   3365 };
   3366 
   3367 #if CV_SSE4_1
   3368 
   3369 template <>
   3370 struct cvtScale_SIMD<double, ushort, float>
   3371 {
   3372     cvtScale_SIMD()
   3373     {
   3374         haveSSE = checkHardwareSupport(CV_CPU_SSE4_1);
   3375     }
   3376 
   3377     int operator () (const double * src, ushort * dst, int width, float scale, float shift) const
   3378     {
   3379         int x = 0;
   3380 
   3381         if (!haveSSE)
   3382             return x;
   3383 
   3384         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
   3385 
   3386         for ( ; x <= width - 8; x += 8)
   3387         {
   3388             __m128 v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x)),
   3389                                          _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2)));
   3390             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
   3391 
   3392             v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x + 4)),
   3393                                   _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6)));
   3394             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
   3395 
   3396             __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0),
   3397                                              _mm_cvtps_epi32(v_dst_1));
   3398             _mm_storeu_si128((__m128i *)(dst + x), v_dst);
   3399         }
   3400 
   3401         return x;
   3402     }
   3403 
   3404     bool haveSSE;
   3405 };
   3406 
   3407 #endif
   3408 
   3409 template <>
   3410 struct cvtScale_SIMD<double, short, float>
   3411 {
   3412     int operator () (const double * src, short * dst, int width, float scale, float shift) const
   3413     {
   3414         int x = 0;
   3415 
   3416         if (!USE_SSE2)
   3417             return x;
   3418 
   3419         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
   3420 
   3421         for ( ; x <= width - 8; x += 8)
   3422         {
   3423             __m128 v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x)),
   3424                                          _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2)));
   3425             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
   3426 
   3427             v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x + 4)),
   3428                                   _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6)));
   3429             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
   3430 
   3431             __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
   3432                                             _mm_cvtps_epi32(v_dst_1));
   3433             _mm_storeu_si128((__m128i *)(dst + x), v_dst);
   3434         }
   3435 
   3436         return x;
   3437     }
   3438 };
   3439 
   3440 template <>
   3441 struct cvtScale_SIMD<double, int, double>
   3442 {
   3443     int operator () (const double * src, int * dst, int width, double scale, double shift) const
   3444     {
   3445         int x = 0;
   3446 
   3447         if (!USE_SSE2)
   3448             return x;
   3449 
   3450         __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift);
   3451 
   3452         for ( ; x <= width - 4; x += 4)
   3453         {
   3454             __m128d v_src = _mm_loadu_pd(src + x);
   3455             __m128d v_dst0 = _mm_add_pd(_mm_mul_pd(v_src, v_scale), v_shift);
   3456 
   3457             v_src = _mm_loadu_pd(src + x + 2);
   3458             __m128d v_dst1 = _mm_add_pd(_mm_mul_pd(v_src, v_scale), v_shift);
   3459 
   3460             __m128 v_dst = _mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_dst0)),
   3461                                          _mm_castsi128_ps(_mm_cvtpd_epi32(v_dst1)));
   3462 
   3463             _mm_storeu_si128((__m128i *)(dst + x), _mm_castps_si128(v_dst));
   3464         }
   3465 
   3466         return x;
   3467     }
   3468 };
   3469 
   3470 template <>
   3471 struct cvtScale_SIMD<double, float, double>
   3472 {
   3473     int operator () (const double * src, float * dst, int width, double scale, double shift) const
   3474     {
   3475         int x = 0;
   3476 
   3477         if (!USE_SSE2)
   3478             return x;
   3479 
   3480         __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift);
   3481 
   3482         for ( ; x <= width - 4; x += 4)
   3483         {
   3484             __m128d v_src = _mm_loadu_pd(src + x);
   3485             __m128d v_dst0 = _mm_add_pd(_mm_mul_pd(v_src, v_scale), v_shift);
   3486 
   3487             v_src = _mm_loadu_pd(src + x + 2);
   3488             __m128d v_dst1 = _mm_add_pd(_mm_mul_pd(v_src, v_scale), v_shift);
   3489 
   3490             __m128 v_dst = _mm_movelh_ps(_mm_cvtpd_ps(v_dst0),
   3491                                          _mm_cvtpd_ps(v_dst1));
   3492 
   3493             _mm_storeu_ps(dst + x, v_dst);
   3494         }
   3495 
   3496         return x;
   3497     }
   3498 };
   3499 
   3500 template <>
   3501 struct cvtScale_SIMD<double, double, double>
   3502 {
   3503     int operator () (const double * src, double * dst, int width, double scale, double shift) const
   3504     {
   3505         int x = 0;
   3506 
   3507         if (!USE_SSE2)
   3508             return x;
   3509 
   3510         __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift);
   3511 
   3512         for ( ; x <= width - 2; x += 2)
   3513         {
   3514             __m128d v_src = _mm_loadu_pd(src + x);
   3515             __m128d v_dst = _mm_add_pd(_mm_mul_pd(v_src, v_scale), v_shift);
   3516             _mm_storeu_pd(dst + x, v_dst);
   3517         }
   3518 
   3519         return x;
   3520     }
   3521 };
   3522 
   3523 #elif CV_NEON
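// NEON variants of the same kernels. CV_NEON is a compile-time switch, so no
// run-time feature check is needed here; widening uses the vmovl_* intrinsics
// and rounding float->int conversion goes through the cv_vrndq_* helpers
// defined elsewhere in the library.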
   3524 
   3525 // from uchar
   3526 
   3527 template <>
   3528 struct cvtScale_SIMD<uchar, uchar, float>
   3529 {
   3530     int operator () (const uchar * src, uchar * dst, int width, float scale, float shift) const
   3531     {
   3532         int x = 0;
   3533         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
   3534 
   3535         for ( ; x <= width - 8; x += 8)
   3536         {
   3537             uint16x8_t v_src = vmovl_u8(vld1_u8(src + x));
   3538             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift);
   3539             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift);
   3540 
   3541             uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
   3542                                             vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
   3543             vst1_u8(dst + x, vqmovn_u16(v_dst));
   3544         }
   3545 
   3546         return x;
   3547     }
   3548 };
   3549 
   3550 template <>
   3551 struct cvtScale_SIMD<uchar, schar, float>
   3552 {
   3553     int operator () (const uchar * src, schar * dst, int width, float scale, float shift) const
   3554     {
   3555         int x = 0;
   3556         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
   3557 
   3558         for ( ; x <= width - 8; x += 8)
   3559         {
   3560             uint16x8_t v_src = vmovl_u8(vld1_u8(src + x));
   3561             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift);
   3562             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift);
   3563 
   3564             int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
   3565                                            vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
   3566             vst1_s8(dst + x, vqmovn_s16(v_dst));
   3567         }
   3568 
   3569         return x;
   3570     }
   3571 };
   3572 
   3573 template <>
   3574 struct cvtScale_SIMD<uchar, ushort, float>
   3575 {
   3576     int operator () (const uchar * src, ushort * dst, int width, float scale, float shift) const
   3577     {
   3578         int x = 0;
   3579         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
   3580 
   3581         for ( ; x <= width - 8; x += 8)
   3582         {
   3583             uint16x8_t v_src = vmovl_u8(vld1_u8(src + x));
   3584             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift);
   3585             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift);
   3586 
   3587             uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
   3588                                             vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
   3589             vst1q_u16(dst + x, v_dst);
   3590         }
   3591 
   3592         return x;
   3593     }
   3594 };
   3595 
   3596 template <>
   3597 struct cvtScale_SIMD<uchar, short, float>
   3598 {
   3599     int operator () (const uchar * src, short * dst, int width, float scale, float shift) const
   3600     {
   3601         int x = 0;
   3602         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
   3603 
   3604         for ( ; x <= width - 8; x += 8)
   3605         {
   3606             uint16x8_t v_src = vmovl_u8(vld1_u8(src + x));
   3607             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift);
   3608             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift);
   3609 
   3610             int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
   3611                                            vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
   3612             vst1q_s16(dst + x, v_dst);
   3613         }
   3614 
   3615         return x;
   3616     }
   3617 };
   3618 
   3619 template <>
   3620 struct cvtScale_SIMD<uchar, int, float>
   3621 {
   3622     int operator () (const uchar * src, int * dst, int width, float scale, float shift) const
   3623     {
   3624         int x = 0;
   3625         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
   3626 
   3627         for ( ; x <= width - 8; x += 8)
   3628         {
   3629             uint16x8_t v_src = vmovl_u8(vld1_u8(src + x));
   3630             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift);
   3631             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift);
   3632 
   3633             vst1q_s32(dst + x, cv_vrndq_s32_f32(v_dst1));
   3634             vst1q_s32(dst + x + 4, cv_vrndq_s32_f32(v_dst2));
   3635         }
   3636 
   3637         return x;
   3638     }
   3639 };
   3640 
   3641 template <>
   3642 struct cvtScale_SIMD<uchar, float, float>
   3643 {
   3644     int operator () (const uchar * src, float * dst, int width, float scale, float shift) const
   3645     {
   3646         int x = 0;
   3647         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
   3648 
   3649         for ( ; x <= width - 8; x += 8)
   3650         {
   3651             uint16x8_t v_src = vmovl_u8(vld1_u8(src + x));
   3652             vst1q_f32(dst + x, vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift));
   3653             vst1q_f32(dst + x + 4, vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift));
   3654         }
   3655 
   3656         return x;
   3657     }
   3658 };
   3659 
   3660 // from schar
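// (vmovl_s8 / vmovl_s16 provide proper sign-extending widenings on NEON, so
// no shift tricks are needed here)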
   3661 
   3662 template <>
   3663 struct cvtScale_SIMD<schar, uchar, float>
   3664 {
   3665     int operator () (const schar * src, uchar * dst, int width, float scale, float shift) const
   3666     {
   3667         int x = 0;
   3668         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
   3669 
   3670         for ( ; x <= width - 8; x += 8)
   3671         {
   3672             int16x8_t v_src = vmovl_s8(vld1_s8(src + x));
   3673             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift);
   3674             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift);
   3675 
   3676             uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
   3677                                             vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
   3678             vst1_u8(dst + x, vqmovn_u16(v_dst));
   3679         }
   3680 
   3681         return x;
   3682     }
   3683 };
   3684 
   3685 template <>
   3686 struct cvtScale_SIMD<schar, schar, float>
   3687 {
   3688     int operator () (const schar * src, schar * dst, int width, float scale, float shift) const
   3689     {
   3690         int x = 0;
   3691         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
   3692 
   3693         for ( ; x <= width - 8; x += 8)
   3694         {
   3695             int16x8_t v_src = vmovl_s8(vld1_s8(src + x));
   3696             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift);
   3697             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift);
   3698 
   3699             int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
   3700                                            vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
   3701             vst1_s8(dst + x, vqmovn_s16(v_dst));
   3702         }
   3703 
   3704         return x;
   3705     }
   3706 };
   3707 
   3708 template <>
   3709 struct cvtScale_SIMD<schar, ushort, float>
   3710 {
   3711     int operator () (const schar * src, ushort * dst, int width, float scale, float shift) const
   3712     {
   3713         int x = 0;
   3714         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
   3715 
   3716         for ( ; x <= width - 8; x += 8)
   3717         {
   3718             int16x8_t v_src = vmovl_s8(vld1_s8(src + x));
   3719             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift);
   3720             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift);
   3721 
   3722             uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
   3723                                             vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
   3724             vst1q_u16(dst + x, v_dst);
   3725         }
   3726 
   3727         return x;
   3728     }
   3729 };
   3730 
   3731 template <>
   3732 struct cvtScale_SIMD<schar, short, float>
   3733 {
   3734     int operator () (const schar * src, short * dst, int width, float scale, float shift) const
   3735     {
   3736         int x = 0;
   3737         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
   3738 
   3739         for ( ; x <= width - 8; x += 8)
   3740         {
   3741             int16x8_t v_src = vmovl_s8(vld1_s8(src + x));
   3742             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift);
   3743             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift);
   3744 
   3745             int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
   3746                                            vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
   3747             vst1q_s16(dst + x, v_dst);
   3748         }
   3749 
   3750         return x;
   3751     }
   3752 };
   3753 
   3754 template <>
   3755 struct cvtScale_SIMD<schar, int, float>
   3756 {
   3757     int operator () (const schar * src, int * dst, int width, float scale, float shift) const
   3758     {
   3759         int x = 0;
   3760         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
   3761 
   3762         for ( ; x <= width - 8; x += 8)
   3763         {
   3764             int16x8_t v_src = vmovl_s8(vld1_s8(src + x));
   3765             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift);
   3766             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift);
   3767 
   3768             vst1q_s32(dst + x, cv_vrndq_s32_f32(v_dst1));
   3769             vst1q_s32(dst + x + 4, cv_vrndq_s32_f32(v_dst2));
   3770         }
   3771 
   3772         return x;
   3773     }
   3774 };
   3775 
   3776 template <>
   3777 struct cvtScale_SIMD<schar, float, float>
   3778 {
   3779     int operator () (const schar * src, float * dst, int width, float scale, float shift) const
   3780     {
   3781         int x = 0;
   3782         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
   3783 
   3784         for ( ; x <= width - 8; x += 8)
   3785         {
   3786             int16x8_t v_src = vmovl_s8(vld1_s8(src + x));
   3787             vst1q_f32(dst + x, vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift));
   3788             vst1q_f32(dst + x + 4, vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift));
   3789         }
   3790 
   3791         return x;
   3792     }
   3793 };
   3794 
   3795 // from ushort
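// (full 8-element uint16x8_t loads, widened four at a time with vmovl_u16 and
// narrowed back with the saturating vqmovn_* instructions)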
   3796 
   3797 template <>
   3798 struct cvtScale_SIMD<ushort, uchar, float>
   3799 {
   3800     int operator () (const ushort * src, uchar * dst, int width, float scale, float shift) const
   3801     {
   3802         int x = 0;
   3803         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
   3804 
   3805         for ( ; x <= width - 8; x += 8)
   3806         {
   3807             uint16x8_t v_src = vld1q_u16(src + x);
   3808             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift);
   3809             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift);
   3810 
   3811             uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
   3812                                             vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
   3813             vst1_u8(dst + x, vqmovn_u16(v_dst));
   3814         }
   3815 
   3816         return x;
   3817     }
   3818 };
   3819 
   3820 template <>
   3821 struct cvtScale_SIMD<ushort, schar, float>
   3822 {
   3823     int operator () (const ushort * src, schar * dst, int width, float scale, float shift) const
   3824     {
   3825         int x = 0;
   3826         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
   3827 
   3828         for ( ; x <= width - 8; x += 8)
   3829         {
   3830             uint16x8_t v_src = vld1q_u16(src + x);
   3831             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift);
   3832             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift);
   3833 
   3834             int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
   3835                                            vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
   3836             vst1_s8(dst + x, vqmovn_s16(v_dst));
   3837         }
   3838 
   3839         return x;
   3840     }
   3841 };
   3842 
   3843 template <>
   3844 struct cvtScale_SIMD<ushort, ushort, float>
   3845 {
   3846     int operator () (const ushort * src, ushort * dst, int width, float scale, float shift) const
   3847     {
   3848         int x = 0;
   3849         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
   3850 
   3851         for ( ; x <= width - 8; x += 8)
   3852         {
   3853             uint16x8_t v_src = vld1q_u16(src + x);
   3854             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift);
   3855             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift);
   3856 
   3857             uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
   3858                                             vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
   3859             vst1q_u16(dst + x, v_dst);
   3860         }
   3861 
   3862         return x;
   3863     }
   3864 };
   3865 
   3866 template <>
   3867 struct cvtScale_SIMD<ushort, short, float>
   3868 {
   3869     int operator () (const ushort * src, short * dst, int width, float scale, float shift) const
   3870     {
   3871         int x = 0;
   3872         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
   3873 
   3874         for ( ; x <= width - 8; x += 8)
   3875         {
   3876             uint16x8_t v_src = vld1q_u16(src + x);
   3877             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift);
   3878             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift);
   3879 
   3880             int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
   3881                                            vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
   3882             vst1q_s16(dst + x, v_dst);
   3883         }
   3884 
   3885         return x;
   3886     }
   3887 };
   3888 
   3889 template <>
   3890 struct cvtScale_SIMD<ushort, int, float>
   3891 {
   3892     int operator () (const ushort * src, int * dst, int width, float scale, float shift) const
   3893     {
   3894         int x = 0;
   3895         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
   3896 
   3897         for ( ; x <= width - 8; x += 8)
   3898         {
   3899             uint16x8_t v_src = vld1q_u16(src + x);
   3900             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift);
   3901             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift);
   3902 
   3903             vst1q_s32(dst + x, cv_vrndq_s32_f32(v_dst1));
   3904             vst1q_s32(dst + x + 4, cv_vrndq_s32_f32(v_dst2));
   3905         }
   3906 
   3907         return x;
   3908     }
   3909 };
   3910 
   3911 template <>
   3912 struct cvtScale_SIMD<ushort, float, float>
   3913 {
   3914     int operator () (const ushort * src, float * dst, int width, float scale, float shift) const
   3915     {
   3916         int x = 0;
   3917         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
   3918 
   3919         for ( ; x <= width - 8; x += 8)
   3920         {
   3921             uint16x8_t v_src = vld1q_u16(src + x);
   3922             vst1q_f32(dst + x, vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift));
   3923             vst1q_f32(dst + x + 4, vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift));
   3924         }
   3925 
   3926         return x;
   3927     }
   3928 };
   3929 
   3930 // from short
   3931 
   3932 template <>
   3933 struct cvtScale_SIMD<short, uchar, float>
   3934 {
   3935     int operator () (const short * src, uchar * dst, int width, float scale, float shift) const
   3936     {
   3937         int x = 0;
   3938         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
   3939 
   3940         for ( ; x <= width - 8; x += 8)
   3941         {
   3942             int16x8_t v_src = vld1q_s16(src + x);
   3943             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift);
   3944             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift);
   3945 
   3946             uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
   3947                                             vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
   3948             vst1_u8(dst + x, vqmovn_u16(v_dst));
   3949         }
   3950 
   3951         return x;
   3952     }
   3953 };
   3954 
   3955 template <>
   3956 struct cvtScale_SIMD<short, schar, float>
   3957 {
   3958     int operator () (const short * src, schar * dst, int width, float scale, float shift) const
   3959     {
   3960         int x = 0;
   3961         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
   3962 
   3963         for ( ; x <= width - 8; x += 8)
   3964         {
   3965             int16x8_t v_src = vld1q_s16(src + x);
   3966             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift);
   3967             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift);
   3968 
   3969             int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
   3970                                            vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
   3971             vst1_s8(dst + x, vqmovn_s16(v_dst));
   3972         }
   3973 
   3974         return x;
   3975     }
   3976 };
   3977 
   3978 template <>
   3979 struct cvtScale_SIMD<short, ushort, float>
   3980 {
   3981     int operator () (const short * src, ushort * dst, int width, float scale, float shift) const
   3982     {
   3983         int x = 0;
   3984         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
   3985 
   3986         for ( ; x <= width - 8; x += 8)
   3987         {
   3988             int16x8_t v_src = vld1q_s16(src + x);
   3989             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift);
   3990             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift);
   3991 
   3992             uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
   3993                                             vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
   3994             vst1q_u16(dst + x, v_dst);
   3995         }
   3996 
   3997         return x;
   3998     }
   3999 };
   4000 
   4001 template <>
   4002 struct cvtScale_SIMD<short, float, float>
   4003 {
   4004     int operator () (const short * src, float * dst, int width, float scale, float shift) const
   4005     {
   4006         int x = 0;
   4007         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
   4008 
   4009         for ( ; x <= width - 8; x += 8)
   4010         {
   4011             int16x8_t v_src = vld1q_s16(src + x);
   4012             vst1q_f32(dst + x, vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift));
   4013             vst1q_f32(dst + x + 4, vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift));
   4014         }
   4015 
   4016         return x;
   4017     }
   4018 };
   4019 
   4020 // from int
   4021 
   4022 template <>
   4023 struct cvtScale_SIMD<int, uchar, float>
   4024 {
   4025     int operator () (const int * src, uchar * dst, int width, float scale, float shift) const
   4026     {
   4027         int x = 0;
   4028         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
   4029 
   4030         for ( ; x <= width - 8; x += 8)
   4031         {
   4032             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x)), v_scale), v_shift);
   4033             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x + 4)), v_scale), v_shift);
   4034 
   4035             uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
   4036                                             vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
   4037             vst1_u8(dst + x, vqmovn_u16(v_dst));
   4038         }
   4039 
   4040         return x;
   4041     }
   4042 };
   4043 
   4044 template <>
   4045 struct cvtScale_SIMD<int, schar, float>
   4046 {
   4047     int operator () (const int * src, schar * dst, int width, float scale, float shift) const
   4048     {
   4049         int x = 0;
   4050         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
   4051 
   4052         for ( ; x <= width - 8; x += 8)
   4053         {
   4054             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x)), v_scale), v_shift);
   4055             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x + 4)), v_scale), v_shift);
   4056 
   4057             int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
   4058                                            vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
   4059             vst1_s8(dst + x, vqmovn_s16(v_dst));
   4060         }
   4061 
   4062         return x;
   4063     }
   4064 };
   4065 
   4066 template <>
   4067 struct cvtScale_SIMD<int, ushort, float>
   4068 {
   4069     int operator () (const int * src, ushort * dst, int width, float scale, float shift) const
   4070     {
   4071         int x = 0;
   4072         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
   4073 
   4074         for ( ; x <= width - 8; x += 8)
   4075         {
   4076             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x)), v_scale), v_shift);
   4077             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x + 4)), v_scale), v_shift);
   4078 
   4079             uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
   4080                                             vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
   4081             vst1q_u16(dst + x, v_dst);
   4082         }
   4083 
   4084         return x;
   4085     }
   4086 };
   4087 
   4088 template <>
   4089 struct cvtScale_SIMD<int, short, float>
   4090 {
   4091     int operator () (const int * src, short * dst, int width, float scale, float shift) const
   4092     {
   4093         int x = 0;
   4094         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
   4095 
   4096         for ( ; x <= width - 8; x += 8)
   4097         {
   4098             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x)), v_scale), v_shift);
   4099             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x + 4)), v_scale), v_shift);
   4100 
   4101             int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
   4102                                            vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
   4103             vst1q_s16(dst + x, v_dst);
   4104         }
   4105 
   4106         return x;
   4107     }
   4108 };
   4109 
   4110 // from float
   4111 
   4112 template <>
   4113 struct cvtScale_SIMD<float, uchar, float>
   4114 {
   4115     int operator () (const float * src, uchar * dst, int width, float scale, float shift) const
   4116     {
   4117         int x = 0;
   4118         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
   4119 
   4120         for ( ; x <= width - 8; x += 8)
   4121         {
   4122             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vld1q_f32(src + x), v_scale), v_shift);
   4123             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vld1q_f32(src + x + 4), v_scale), v_shift);
   4124 
   4125             uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
   4126                                             vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
   4127             vst1_u8(dst + x, vqmovn_u16(v_dst));
   4128         }
   4129 
   4130         return x;
   4131     }
   4132 };
   4133 
   4134 template <>
   4135 struct cvtScale_SIMD<float, schar, float>
   4136 {
   4137     int operator () (const float * src, schar * dst, int width, float scale, float shift) const
   4138     {
   4139         int x = 0;
   4140         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
   4141 
   4142         for ( ; x <= width - 8; x += 8)
   4143         {
   4144             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vld1q_f32(src + x), v_scale), v_shift);
   4145             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vld1q_f32(src + x + 4), v_scale), v_shift);
   4146 
   4147             int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
   4148                                            vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
   4149             vst1_s8(dst + x, vqmovn_s16(v_dst));
   4150         }
   4151 
   4152         return x;
   4153     }
   4154 };
   4155 
   4156 template <>
   4157 struct cvtScale_SIMD<float, ushort, float>
   4158 {
   4159     int operator () (const float * src, ushort * dst, int width, float scale, float shift) const
   4160     {
   4161         int x = 0;
   4162         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
   4163 
   4164         for ( ; x <= width - 8; x += 8)
   4165         {
   4166             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vld1q_f32(src + x), v_scale), v_shift);
   4167             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vld1q_f32(src + x + 4), v_scale), v_shift);
   4168 
   4169             uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
   4170                                             vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
   4171             vst1q_u16(dst + x, v_dst);
   4172         }
   4173 
   4174         return x;
   4175     }
   4176 };
   4177 
   4178 template <>
   4179 struct cvtScale_SIMD<float, short, float>
   4180 {
   4181     int operator () (const float * src, short * dst, int width, float scale, float shift) const
   4182     {
   4183         int x = 0;
   4184         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
   4185 
   4186         for ( ; x <= width - 8; x += 8)
   4187         {
   4188             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vld1q_f32(src + x), v_scale), v_shift);
   4189             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vld1q_f32(src + x + 4), v_scale), v_shift);
   4190 
    4191             int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
    4192                                            vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
   4193             vst1q_s16(dst + x, v_dst);
   4194         }
   4195 
   4196         return x;
   4197     }
   4198 };
   4199 
   4200 template <>
   4201 struct cvtScale_SIMD<float, int, float>
   4202 {
   4203     int operator () (const float * src, int * dst, int width, float scale, float shift) const
   4204     {
   4205         int x = 0;
   4206         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
   4207 
   4208         for ( ; x <= width - 4; x += 4)
   4209             vst1q_s32(dst + x, cv_vrndq_s32_f32(vaddq_f32(vmulq_f32(vld1q_f32(src + x), v_scale), v_shift)));
   4210 
   4211         return x;
   4212     }
   4213 };
   4214 
   4215 template <>
   4216 struct cvtScale_SIMD<float, float, float>
   4217 {
   4218     int operator () (const float * src, float * dst, int width, float scale, float shift) const
   4219     {
   4220         int x = 0;
   4221         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
   4222 
   4223         for ( ; x <= width - 4; x += 4)
   4224             vst1q_f32(dst + x, vaddq_f32(vmulq_f32(vld1q_f32(src + x), v_scale), v_shift));
   4225 
   4226         return x;
   4227     }
   4228 };
   4229 
   4230 #endif
   4231 
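// Generic row-wise scale-and-convert kernel: the cvtScale_SIMD functor handles the
// vectorizable prefix of each row and returns the number of elements it consumed, and the
// remainder goes through an (optionally unrolled) scalar loop. Per element this computes
//
//     dst[x] = saturate_cast<DT>(src[x]*scale + shift);
//
// with sstep/dstep supplied in bytes and converted to element strides on entry.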
   4232 template<typename T, typename DT, typename WT> static void
   4233 cvtScale_( const T* src, size_t sstep,
   4234            DT* dst, size_t dstep, Size size,
   4235            WT scale, WT shift )
   4236 {
   4237     sstep /= sizeof(src[0]);
   4238     dstep /= sizeof(dst[0]);
   4239 
   4240     cvtScale_SIMD<T, DT, WT> vop;
   4241 
   4242     for( ; size.height--; src += sstep, dst += dstep )
   4243     {
   4244         int x = vop(src, dst, size.width, scale, shift);
   4245 
   4246         #if CV_ENABLE_UNROLLED
   4247         for( ; x <= size.width - 4; x += 4 )
   4248         {
   4249             DT t0, t1;
   4250             t0 = saturate_cast<DT>(src[x]*scale + shift);
   4251             t1 = saturate_cast<DT>(src[x+1]*scale + shift);
   4252             dst[x] = t0; dst[x+1] = t1;
   4253             t0 = saturate_cast<DT>(src[x+2]*scale + shift);
   4254             t1 = saturate_cast<DT>(src[x+3]*scale + shift);
   4255             dst[x+2] = t0; dst[x+3] = t1;
   4256         }
   4257         #endif
   4258 
   4259         for( ; x < size.width; x++ )
   4260             dst[x] = saturate_cast<DT>(src[x]*scale + shift);
   4261     }
   4262 }
   4263 
    4264 // Optimized template specialization: short -> short scaling conversion (SSE2 and NEON fast paths)
   4265 template<> void
   4266 cvtScale_<short, short, float>( const short* src, size_t sstep,
   4267            short* dst, size_t dstep, Size size,
   4268            float scale, float shift )
   4269 {
   4270     sstep /= sizeof(src[0]);
   4271     dstep /= sizeof(dst[0]);
   4272 
   4273     for( ; size.height--; src += sstep, dst += dstep )
   4274     {
   4275         int x = 0;
   4276         #if CV_SSE2
   4277             if(USE_SSE2)
   4278             {
   4279                 __m128 scale128 = _mm_set1_ps (scale);
   4280                 __m128 shift128 = _mm_set1_ps (shift);
   4281                 for(; x <= size.width - 8; x += 8 )
   4282                 {
   4283                     __m128i r0 = _mm_loadl_epi64((const __m128i*)(src + x));
   4284                     __m128i r1 = _mm_loadl_epi64((const __m128i*)(src + x + 4));
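                    // Sign-extend each 16-bit lane to 32 bits: unpacking a register with
                    // itself duplicates every lane, and the arithmetic shift right by 16
                    // then restores the value together with its sign.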
   4285                     __m128 rf0 =_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(r0, r0), 16));
   4286                     __m128 rf1 =_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(r1, r1), 16));
   4287                     rf0 = _mm_add_ps(_mm_mul_ps(rf0, scale128), shift128);
   4288                     rf1 = _mm_add_ps(_mm_mul_ps(rf1, scale128), shift128);
   4289                     r0 = _mm_cvtps_epi32(rf0);
   4290                     r1 = _mm_cvtps_epi32(rf1);
   4291                     r0 = _mm_packs_epi32(r0, r1);
   4292                     _mm_storeu_si128((__m128i*)(dst + x), r0);
   4293                 }
   4294             }
   4295         #elif CV_NEON
   4296         float32x4_t v_shift = vdupq_n_f32(shift);
   4297         for(; x <= size.width - 8; x += 8 )
   4298         {
   4299             int16x8_t v_src = vld1q_s16(src + x);
   4300             float32x4_t v_tmp1 = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src)));
   4301             float32x4_t v_tmp2 = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src)));
   4302 
   4303             v_tmp1 = vaddq_f32(vmulq_n_f32(v_tmp1, scale), v_shift);
   4304             v_tmp2 = vaddq_f32(vmulq_n_f32(v_tmp2, scale), v_shift);
   4305 
   4306             vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_tmp1)),
   4307                                             vqmovn_s32(cv_vrndq_s32_f32(v_tmp2))));
   4308         }
   4309         #endif
   4310 
   4311         for(; x < size.width; x++ )
   4312             dst[x] = saturate_cast<short>(src[x]*scale + shift);
   4313     }
   4314 }
   4315 
   4316 template<> void
   4317 cvtScale_<short, int, float>( const short* src, size_t sstep,
   4318            int* dst, size_t dstep, Size size,
   4319            float scale, float shift )
   4320 {
   4321     sstep /= sizeof(src[0]);
   4322     dstep /= sizeof(dst[0]);
   4323 
   4324     for( ; size.height--; src += sstep, dst += dstep )
   4325     {
   4326         int x = 0;
   4327 
   4328         #if CV_AVX2
   4329         if (USE_AVX2)
   4330         {
   4331             __m256 scale256 = _mm256_set1_ps(scale);
   4332             __m256 shift256 = _mm256_set1_ps(shift);
   4333             const int shuffle = 0xD8;
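            // 0xD8 permutes the 64-bit lanes into the order [0, 2, 1, 3]. The 256-bit
            // unpack instructions interleave within each 128-bit half independently, so
            // this permute makes the unpacklo/unpackhi pair below produce elements 0..7
            // and 8..15 in sequential order.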
   4334 
   4335             for ( ; x <= size.width - 16; x += 16)
   4336             {
   4337                 __m256i v_src = _mm256_loadu_si256((const __m256i *)(src + x));
   4338                 v_src = _mm256_permute4x64_epi64(v_src, shuffle);
   4339                 __m256i v_src_lo = _mm256_srai_epi32(_mm256_unpacklo_epi16(v_src, v_src), 16);
   4340                 __m256i v_src_hi = _mm256_srai_epi32(_mm256_unpackhi_epi16(v_src, v_src), 16);
   4341                 __m256 v_dst0 = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(v_src_lo), scale256), shift256);
   4342                 __m256 v_dst1 = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(v_src_hi), scale256), shift256);
   4343                 _mm256_storeu_si256((__m256i *)(dst + x), _mm256_cvtps_epi32(v_dst0));
   4344                 _mm256_storeu_si256((__m256i *)(dst + x + 8), _mm256_cvtps_epi32(v_dst1));
   4345             }
   4346         }
   4347         #endif
   4348         #if CV_SSE2
    4349         if (USE_SSE2) // ~5x faster than the scalar fallback loop
   4350         {
   4351             __m128 scale128 = _mm_set1_ps (scale);
   4352             __m128 shift128 = _mm_set1_ps (shift);
   4353             for(; x <= size.width - 8; x += 8 )
   4354             {
   4355                 __m128i r0 = _mm_loadu_si128((const __m128i*)(src + x));
   4356 
   4357                 __m128 rf0 =_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(r0, r0), 16));
   4358                 __m128 rf1 =_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(r0, r0), 16));
   4359                 rf0 = _mm_add_ps(_mm_mul_ps(rf0, scale128), shift128);
   4360                 rf1 = _mm_add_ps(_mm_mul_ps(rf1, scale128), shift128);
   4361 
   4362                 _mm_storeu_si128((__m128i*)(dst + x), _mm_cvtps_epi32(rf0));
   4363                 _mm_storeu_si128((__m128i*)(dst + x + 4), _mm_cvtps_epi32(rf1));
   4364             }
   4365         }
   4366         #elif CV_NEON
   4367         float32x4_t v_shift = vdupq_n_f32(shift);
   4368         for(; x <= size.width - 8; x += 8 )
   4369         {
   4370             int16x8_t v_src = vld1q_s16(src + x);
   4371             float32x4_t v_tmp1 = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src)));
   4372             float32x4_t v_tmp2 = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src)));
   4373 
   4374             v_tmp1 = vaddq_f32(vmulq_n_f32(v_tmp1, scale), v_shift);
   4375             v_tmp2 = vaddq_f32(vmulq_n_f32(v_tmp2, scale), v_shift);
   4376 
   4377             vst1q_s32(dst + x, cv_vrndq_s32_f32(v_tmp1));
   4378             vst1q_s32(dst + x + 4, cv_vrndq_s32_f32(v_tmp2));
   4379         }
   4380         #endif
   4381 
   4382         for(; x < size.width; x++ )
   4383             dst[x] = saturate_cast<int>(src[x]*scale + shift);
   4384     }
   4385 }
   4386 
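// Plain (unscaled) conversions. The generic Cvt_SIMD functor is a no-op that reports zero
// processed elements, so cvt_() below falls back to its scalar loop; the specializations
// that follow provide SSE2/SSE4.1 or NEON fast paths for specific type pairs.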
   4387 template <typename T, typename DT>
   4388 struct Cvt_SIMD
   4389 {
   4390     int operator() (const T *, DT *, int) const
   4391     {
   4392         return 0;
   4393     }
   4394 };
   4395 
   4396 #if CV_SSE2
   4397 
   4398 // from double
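// Each double -> X kernel below converts two doubles at a time with _mm_cvtpd_ps, merges
// pairs of results into one float32x4 with _mm_movelh_ps, and then narrows to the
// destination type via _mm_cvtps_epi32 and the appropriate saturating pack (or stores the
// floats directly for the double -> float case).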
   4399 
   4400 template <>
   4401 struct Cvt_SIMD<double, uchar>
   4402 {
   4403     int operator() (const double * src, uchar * dst, int width) const
   4404     {
   4405         int x = 0;
   4406 
   4407         if (!USE_SSE2)
   4408             return x;
   4409 
   4410         for ( ; x <= width - 8; x += 8)
   4411         {
   4412             __m128 v_src0 = _mm_cvtpd_ps(_mm_loadu_pd(src + x));
   4413             __m128 v_src1 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2));
   4414             __m128 v_src2 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 4));
   4415             __m128 v_src3 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6));
   4416 
   4417             v_src0 = _mm_movelh_ps(v_src0, v_src1);
   4418             v_src1 = _mm_movelh_ps(v_src2, v_src3);
   4419 
   4420             __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_src0),
   4421                                             _mm_cvtps_epi32(v_src1));
   4422             _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst, v_dst));
   4423         }
   4424 
   4425         return x;
   4426     }
   4427 };
   4428 
   4429 template <>
   4430 struct Cvt_SIMD<double, schar>
   4431 {
   4432     int operator() (const double * src, schar * dst, int width) const
   4433     {
   4434         int x = 0;
   4435 
   4436         if (!USE_SSE2)
   4437             return x;
   4438 
   4439         for ( ; x <= width - 8; x += 8)
   4440         {
   4441             __m128 v_src0 = _mm_cvtpd_ps(_mm_loadu_pd(src + x));
   4442             __m128 v_src1 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2));
   4443             __m128 v_src2 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 4));
   4444             __m128 v_src3 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6));
   4445 
   4446             v_src0 = _mm_movelh_ps(v_src0, v_src1);
   4447             v_src1 = _mm_movelh_ps(v_src2, v_src3);
   4448 
   4449             __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_src0),
   4450                                             _mm_cvtps_epi32(v_src1));
   4451             _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst, v_dst));
   4452         }
   4453 
   4454         return x;
   4455     }
   4456 };
   4457 
   4458 #if CV_SSE4_1
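// _mm_packus_epi32 (saturating signed 32-bit -> unsigned 16-bit pack) is an SSE4.1
// instruction, so this specialization is compiled only when SSE4.1 support is enabled and
// additionally verifies it at run time via checkHardwareSupport().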
   4459 
   4460 template <>
   4461 struct Cvt_SIMD<double, ushort>
   4462 {
   4463     bool haveSIMD;
   4464     Cvt_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE4_1); }
   4465 
   4466     int operator() (const double * src, ushort * dst, int width) const
   4467     {
   4468         int x = 0;
   4469 
   4470         if (!haveSIMD)
   4471             return x;
   4472 
   4473         for ( ; x <= width - 8; x += 8)
   4474         {
   4475             __m128 v_src0 = _mm_cvtpd_ps(_mm_loadu_pd(src + x));
   4476             __m128 v_src1 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2));
   4477             __m128 v_src2 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 4));
   4478             __m128 v_src3 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6));
   4479 
   4480             v_src0 = _mm_movelh_ps(v_src0, v_src1);
   4481             v_src1 = _mm_movelh_ps(v_src2, v_src3);
   4482 
   4483             __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_src0),
   4484                                              _mm_cvtps_epi32(v_src1));
   4485             _mm_storeu_si128((__m128i *)(dst + x), v_dst);
   4486         }
   4487 
   4488         return x;
   4489     }
   4490 };
   4491 
   4492 #endif // CV_SSE4_1
   4493 
   4494 template <>
   4495 struct Cvt_SIMD<double, short>
   4496 {
   4497     int operator() (const double * src, short * dst, int width) const
   4498     {
   4499         int x = 0;
   4500 
   4501         if (!USE_SSE2)
   4502             return x;
   4503 
   4504         for ( ; x <= width - 8; x += 8)
   4505         {
   4506             __m128 v_src0 = _mm_cvtpd_ps(_mm_loadu_pd(src + x));
   4507             __m128 v_src1 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2));
   4508             __m128 v_src2 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 4));
   4509             __m128 v_src3 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6));
   4510 
   4511             v_src0 = _mm_movelh_ps(v_src0, v_src1);
   4512             v_src1 = _mm_movelh_ps(v_src2, v_src3);
   4513 
   4514             __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_src0),
   4515                                             _mm_cvtps_epi32(v_src1));
   4516             _mm_storeu_si128((__m128i *)(dst + x), v_dst);
   4517         }
   4518 
   4519         return x;
   4520     }
   4521 };
   4522 
   4523 template <>
   4524 struct Cvt_SIMD<double, int>
   4525 {
   4526     int operator() (const double * src, int * dst, int width) const
   4527     {
   4528         int x = 0;
   4529 
   4530         if (!USE_SSE2)
   4531             return x;
   4532 
   4533         for ( ; x <= width - 4; x += 4)
   4534         {
   4535             __m128 v_src0 = _mm_cvtpd_ps(_mm_loadu_pd(src + x));
   4536             __m128 v_src1 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2));
   4537             v_src0 = _mm_movelh_ps(v_src0, v_src1);
   4538 
   4539             _mm_storeu_si128((__m128i *)(dst + x), _mm_cvtps_epi32(v_src0));
   4540         }
   4541 
   4542         return x;
   4543     }
   4544 };
   4545 
   4546 template <>
   4547 struct Cvt_SIMD<double, float>
   4548 {
   4549     int operator() (const double * src, float * dst, int width) const
   4550     {
   4551         int x = 0;
   4552 
   4553         if (!USE_SSE2)
   4554             return x;
   4555 
   4556         for ( ; x <= width - 4; x += 4)
   4557         {
   4558             __m128 v_src0 = _mm_cvtpd_ps(_mm_loadu_pd(src + x));
   4559             __m128 v_src1 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2));
   4560 
   4561             _mm_storeu_ps(dst + x, _mm_movelh_ps(v_src0, v_src1));
   4562         }
   4563 
   4564         return x;
   4565     }
   4566 };
   4567 
   4568 
   4569 #elif CV_NEON
   4570 
   4571 // from uchar
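// The uchar conversions need no arithmetic: vmovl_u8 widens the elements, and where the
// destination is narrower or differs in signedness a saturating narrow (e.g. vqmovn_s16
// after reinterpretation) clamps the result, matching saturate_cast<> semantics.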
   4572 
   4573 template <>
   4574 struct Cvt_SIMD<uchar, schar>
   4575 {
   4576     int operator() (const uchar * src, schar * dst, int width) const
   4577     {
   4578         int x = 0;
   4579 
   4580         for ( ; x <= width - 8; x += 8)
   4581             vst1_s8(dst + x, vqmovn_s16(vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src + x)))));
   4582 
   4583         return x;
   4584     }
   4585 };
   4586 
   4587 
   4588 template <>
   4589 struct Cvt_SIMD<uchar, ushort>
   4590 {
   4591     int operator() (const uchar * src, ushort * dst, int width) const
   4592     {
   4593         int x = 0;
   4594 
   4595         for ( ; x <= width - 8; x += 8)
   4596             vst1q_u16(dst + x, vmovl_u8(vld1_u8(src + x)));
   4597 
   4598         return x;
   4599     }
   4600 };
   4601 
   4602 template <>
   4603 struct Cvt_SIMD<uchar, short>
   4604 {
   4605     int operator() (const uchar * src, short * dst, int width) const
   4606     {
   4607         int x = 0;
   4608 
   4609         for ( ; x <= width - 8; x += 8)
   4610             vst1q_s16(dst + x, vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src + x))));
   4611 
   4612         return x;
   4613     }
   4614 };
   4615 
   4616 template <>
   4617 struct Cvt_SIMD<uchar, int>
   4618 {
   4619     int operator() (const uchar * src, int * dst, int width) const
   4620     {
   4621         int x = 0;
   4622 
   4623         for ( ; x <= width - 8; x += 8)
   4624         {
   4625             uint16x8_t v_src = vmovl_u8(vld1_u8(src + x));
   4626             vst1q_s32(dst + x, vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src))));
   4627             vst1q_s32(dst + x + 4, vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src))));
   4628         }
   4629 
   4630         return x;
   4631     }
   4632 };
   4633 
   4634 template <>
   4635 struct Cvt_SIMD<uchar, float>
   4636 {
   4637     int operator() (const uchar * src, float * dst, int width) const
   4638     {
   4639         int x = 0;
   4640 
   4641         for ( ; x <= width - 8; x += 8)
   4642         {
   4643             uint16x8_t v_src = vmovl_u8(vld1_u8(src + x));
   4644             vst1q_f32(dst + x, vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))));
   4645             vst1q_f32(dst + x + 4, vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))));
   4646         }
   4647 
   4648         return x;
   4649     }
   4650 };
   4651 
   4652 // from schar
   4653 
   4654 template <>
   4655 struct Cvt_SIMD<schar, uchar>
   4656 {
   4657     int operator() (const schar * src, uchar * dst, int width) const
   4658     {
   4659         int x = 0;
   4660 
   4661         for ( ; x <= width - 8; x += 8)
   4662             vst1_u8(dst + x, vqmovun_s16(vmovl_s8(vld1_s8(src + x))));
   4663 
   4664         return x;
   4665     }
   4666 };
   4667 
   4668 template <>
   4669 struct Cvt_SIMD<schar, short>
   4670 {
   4671     int operator() (const schar * src, short * dst, int width) const
   4672     {
   4673         int x = 0;
   4674 
   4675         for ( ; x <= width - 8; x += 8)
   4676             vst1q_s16(dst + x, vmovl_s8(vld1_s8(src + x)));
   4677 
   4678         return x;
   4679     }
   4680 };
   4681 
   4682 template <>
   4683 struct Cvt_SIMD<schar, ushort>
   4684 {
   4685     int operator() (const schar * src, ushort * dst, int width) const
   4686     {
   4687         int x = 0;
   4688 
   4689         for ( ; x <= width - 8; x += 8)
   4690         {
   4691             int16x8_t v_src = vmovl_s8(vld1_s8(src + x));
   4692             vst1q_u16(dst + x, vcombine_u16(vqmovun_s32(vmovl_s16(vget_low_s16(v_src))),
   4693                                             vqmovun_s32(vmovl_s16(vget_high_s16(v_src)))));
   4694         }
   4695 
   4696         return x;
   4697     }
   4698 };
   4699 
   4700 
   4701 template <>
   4702 struct Cvt_SIMD<schar, int>
   4703 {
   4704     int operator() (const schar * src, int * dst, int width) const
   4705     {
   4706         int x = 0;
   4707 
   4708         for ( ; x <= width - 8; x += 8)
   4709         {
   4710             int16x8_t v_src = vmovl_s8(vld1_s8(src + x));
   4711             vst1q_s32(dst + x, vmovl_s16(vget_low_s16(v_src)));
   4712             vst1q_s32(dst + x + 4, vmovl_s16(vget_high_s16(v_src)));
   4713         }
   4714 
   4715         return x;
   4716     }
   4717 };
   4718 
   4719 template <>
   4720 struct Cvt_SIMD<schar, float>
   4721 {
   4722     int operator() (const schar * src, float * dst, int width) const
   4723     {
   4724         int x = 0;
   4725 
   4726         for ( ; x <= width - 8; x += 8)
   4727         {
   4728             int16x8_t v_src = vmovl_s8(vld1_s8(src + x));
   4729             vst1q_f32(dst + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))));
   4730             vst1q_f32(dst + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))));
   4731         }
   4732 
   4733         return x;
   4734     }
   4735 };
   4736 
   4737 // from ushort
   4738 
   4739 template <>
   4740 struct Cvt_SIMD<ushort, uchar>
   4741 {
   4742     int operator() (const ushort * src, uchar * dst, int width) const
   4743     {
   4744         int x = 0;
   4745 
   4746         for ( ; x <= width - 16; x += 16)
   4747         {
   4748             uint16x8_t v_src1 = vld1q_u16(src + x), v_src2 = vld1q_u16(src + x + 8);
   4749             vst1q_u8(dst + x, vcombine_u8(vqmovn_u16(v_src1), vqmovn_u16(v_src2)));
   4750         }
   4751 
   4752         return x;
   4753     }
   4754 };
   4755 
   4756 template <>
   4757 struct Cvt_SIMD<ushort, schar>
   4758 {
   4759     int operator() (const ushort * src, schar * dst, int width) const
   4760     {
   4761         int x = 0;
   4762 
   4763         for ( ; x <= width - 16; x += 16)
   4764         {
   4765             uint16x8_t v_src1 = vld1q_u16(src + x), v_src2 = vld1q_u16(src + x + 8);
   4766             int32x4_t v_dst10 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src1)));
   4767             int32x4_t v_dst11 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src1)));
   4768             int32x4_t v_dst20 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src2)));
   4769             int32x4_t v_dst21 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src2)));
   4770 
   4771             vst1q_s8(dst + x, vcombine_s8(vqmovn_s16(vcombine_s16(vqmovn_s32(v_dst10), vqmovn_s32(v_dst11))),
   4772                                           vqmovn_s16(vcombine_s16(vqmovn_s32(v_dst20), vqmovn_s32(v_dst21)))));
   4773         }
   4774 
   4775         return x;
   4776     }
   4777 };
   4778 
   4779 template <>
   4780 struct Cvt_SIMD<ushort, short>
   4781 {
   4782     int operator() (const ushort * src, short * dst, int width) const
   4783     {
   4784         int x = 0;
   4785 
   4786         for ( ; x <= width - 8; x += 8)
   4787         {
   4788             uint16x8_t v_src = vld1q_u16(src + x);
   4789             int32x4_t v_dst0 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src)));
   4790             int32x4_t v_dst1 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src)));
   4791 
   4792             vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(v_dst0), vqmovn_s32(v_dst1)));
   4793         }
   4794 
   4795         return x;
   4796     }
   4797 };
   4798 
   4799 template <>
   4800 struct Cvt_SIMD<ushort, int>
   4801 {
   4802     int operator() (const ushort * src, int * dst, int width) const
   4803     {
   4804         int x = 0;
   4805 
   4806         for ( ; x <= width - 8; x += 8)
   4807         {
   4808             uint16x8_t v_src = vld1q_u16(src + x);
   4809             vst1q_s32(dst + x, vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src))));
   4810             vst1q_s32(dst + x + 4, vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src))));
   4811         }
   4812 
   4813         return x;
   4814     }
   4815 };
   4816 
   4817 template <>
   4818 struct Cvt_SIMD<ushort, float>
   4819 {
   4820     int operator() (const ushort * src, float * dst, int width) const
   4821     {
   4822         int x = 0;
   4823 
   4824         for ( ; x <= width - 8; x += 8)
   4825         {
   4826             uint16x8_t v_src = vld1q_u16(src + x);
   4827             vst1q_f32(dst + x, vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))));
   4828             vst1q_f32(dst + x + 4, vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))));
   4829         }
   4830 
   4831         return x;
   4832     }
   4833 };
   4834 
   4835 // from short
   4836 
   4837 template <>
   4838 struct Cvt_SIMD<short, uchar>
   4839 {
   4840     int operator() (const short * src, uchar * dst, int width) const
   4841     {
   4842         int x = 0;
   4843 
   4844         for ( ; x <= width - 16; x += 16)
   4845         {
   4846             int16x8_t v_src1 = vld1q_s16(src + x), v_src2 = vld1q_s16(src + x + 8);
   4847             vst1q_u8(dst + x, vcombine_u8(vqmovun_s16(v_src1), vqmovun_s16(v_src2)));
   4848         }
   4849 
   4850         return x;
   4851     }
   4852 };
   4853 
   4854 template <>
   4855 struct Cvt_SIMD<short, schar>
   4856 {
   4857     int operator() (const short * src, schar * dst, int width) const
   4858     {
   4859         int x = 0;
   4860 
   4861         for ( ; x <= width - 16; x += 16)
   4862         {
   4863             int16x8_t v_src1 = vld1q_s16(src + x), v_src2 = vld1q_s16(src + x + 8);
   4864             vst1q_s8(dst + x, vcombine_s8(vqmovn_s16(v_src1), vqmovn_s16(v_src2)));
   4865         }
   4866 
   4867         return x;
   4868     }
   4869 };
   4870 
   4871 template <>
   4872 struct Cvt_SIMD<short, ushort>
   4873 {
   4874     int operator() (const short * src, ushort * dst, int width) const
   4875     {
   4876         int x = 0;
   4877 
   4878         for ( ; x <= width - 8; x += 8)
   4879         {
   4880             int16x8_t v_src = vld1q_s16(src + x);
   4881             uint16x4_t v_dst1 = vqmovun_s32(vmovl_s16(vget_low_s16(v_src)));
   4882             uint16x4_t v_dst2 = vqmovun_s32(vmovl_s16(vget_high_s16(v_src)));
   4883             vst1q_u16(dst + x, vcombine_u16(v_dst1, v_dst2));
   4884         }
   4885 
   4886         return x;
   4887     }
   4888 };
   4889 
   4890 template <>
   4891 struct Cvt_SIMD<short, int>
   4892 {
   4893     int operator() (const short * src, int * dst, int width) const
   4894     {
   4895         int x = 0;
   4896 
   4897         for ( ; x <= width - 8; x += 8)
   4898         {
   4899             int16x8_t v_src = vld1q_s16(src + x);
   4900             vst1q_s32(dst + x, vmovl_s16(vget_low_s16(v_src)));
   4901             vst1q_s32(dst + x + 4, vmovl_s16(vget_high_s16(v_src)));
   4902         }
   4903 
   4904         return x;
   4905     }
   4906 };
   4907 
   4908 template <>
   4909 struct Cvt_SIMD<short, float>
   4910 {
   4911     int operator() (const short * src, float * dst, int width) const
   4912     {
   4913         int x = 0;
   4914 
   4915         for ( ; x <= width - 8; x += 8)
   4916         {
   4917             int16x8_t v_src = vld1q_s16(src + x);
   4918             vst1q_f32(dst + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))));
   4919             vst1q_f32(dst + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))));
   4920         }
   4921 
   4922         return x;
   4923     }
   4924 };
   4925 
   4926 // from int
   4927 
   4928 template <>
   4929 struct Cvt_SIMD<int, uchar>
   4930 {
   4931     int operator() (const int * src, uchar * dst, int width) const
   4932     {
   4933         int x = 0;
   4934 
   4935         for ( ; x <= width - 16; x += 16)
   4936         {
   4937             int32x4_t v_src1 = vld1q_s32(src + x), v_src2 = vld1q_s32(src + x + 4);
   4938             int32x4_t v_src3 = vld1q_s32(src + x + 8), v_src4 = vld1q_s32(src + x + 12);
   4939             uint8x8_t v_dst1 = vqmovn_u16(vcombine_u16(vqmovun_s32(v_src1), vqmovun_s32(v_src2)));
   4940             uint8x8_t v_dst2 = vqmovn_u16(vcombine_u16(vqmovun_s32(v_src3), vqmovun_s32(v_src4)));
   4941             vst1q_u8(dst + x, vcombine_u8(v_dst1, v_dst2));
   4942         }
   4943 
   4944         return x;
   4945     }
   4946 };
   4947 
   4948 template <>
   4949 struct Cvt_SIMD<int, schar>
   4950 {
   4951     int operator() (const int * src, schar * dst, int width) const
   4952     {
   4953         int x = 0;
   4954 
   4955         for ( ; x <= width - 16; x += 16)
   4956         {
   4957             int32x4_t v_src1 = vld1q_s32(src + x), v_src2 = vld1q_s32(src + x + 4);
   4958             int32x4_t v_src3 = vld1q_s32(src + x + 8), v_src4 = vld1q_s32(src + x + 12);
   4959             int8x8_t v_dst1 = vqmovn_s16(vcombine_s16(vqmovn_s32(v_src1), vqmovn_s32(v_src2)));
   4960             int8x8_t v_dst2 = vqmovn_s16(vcombine_s16(vqmovn_s32(v_src3), vqmovn_s32(v_src4)));
   4961             vst1q_s8(dst + x, vcombine_s8(v_dst1, v_dst2));
   4962         }
   4963 
   4964         return x;
   4965     }
   4966 };
   4967 
   4968 
   4969 template <>
   4970 struct Cvt_SIMD<int, ushort>
   4971 {
   4972     int operator() (const int * src, ushort * dst, int width) const
   4973     {
   4974         int x = 0;
   4975 
   4976         for ( ; x <= width - 8; x += 8)
   4977         {
   4978             int32x4_t v_src1 = vld1q_s32(src + x), v_src2 = vld1q_s32(src + x + 4);
   4979             vst1q_u16(dst + x, vcombine_u16(vqmovun_s32(v_src1), vqmovun_s32(v_src2)));
   4980         }
   4981 
   4982         return x;
   4983     }
   4984 };
   4985 
   4986 template <>
   4987 struct Cvt_SIMD<int, short>
   4988 {
   4989     int operator() (const int * src, short * dst, int width) const
   4990     {
   4991         int x = 0;
   4992 
   4993         for ( ; x <= width - 8; x += 8)
   4994         {
   4995             int32x4_t v_src1 = vld1q_s32(src + x), v_src2 = vld1q_s32(src + x + 4);
   4996             vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(v_src1), vqmovn_s32(v_src2)));
   4997         }
   4998 
   4999         return x;
   5000     }
   5001 };
   5002 
   5003 template <>
   5004 struct Cvt_SIMD<int, float>
   5005 {
   5006     int operator() (const int * src, float * dst, int width) const
   5007     {
   5008         int x = 0;
   5009 
   5010         for ( ; x <= width - 4; x += 4)
   5011             vst1q_f32(dst + x, vcvtq_f32_s32(vld1q_s32(src + x)));
   5012 
   5013         return x;
   5014     }
   5015 };
   5016 
   5017 // from float
   5018 
   5019 template <>
   5020 struct Cvt_SIMD<float, uchar>
   5021 {
   5022     int operator() (const float * src, uchar * dst, int width) const
   5023     {
   5024         int x = 0;
   5025 
   5026         for ( ; x <= width - 16; x += 16)
   5027         {
   5028             uint32x4_t v_src1 = cv_vrndq_u32_f32(vld1q_f32(src + x));
   5029             uint32x4_t v_src2 = cv_vrndq_u32_f32(vld1q_f32(src + x + 4));
   5030             uint32x4_t v_src3 = cv_vrndq_u32_f32(vld1q_f32(src + x + 8));
   5031             uint32x4_t v_src4 = cv_vrndq_u32_f32(vld1q_f32(src + x + 12));
   5032             uint8x8_t v_dst1 = vqmovn_u16(vcombine_u16(vqmovn_u32(v_src1), vqmovn_u32(v_src2)));
   5033             uint8x8_t v_dst2 = vqmovn_u16(vcombine_u16(vqmovn_u32(v_src3), vqmovn_u32(v_src4)));
   5034             vst1q_u8(dst + x, vcombine_u8(v_dst1, v_dst2));
   5035         }
   5036 
   5037         return x;
   5038     }
   5039 };
   5040 
   5041 template <>
   5042 struct Cvt_SIMD<float, schar>
   5043 {
   5044     int operator() (const float * src, schar * dst, int width) const
   5045     {
   5046         int x = 0;
   5047 
   5048         for ( ; x <= width - 16; x += 16)
   5049         {
   5050             int32x4_t v_src1 = cv_vrndq_s32_f32(vld1q_f32(src + x));
   5051             int32x4_t v_src2 = cv_vrndq_s32_f32(vld1q_f32(src + x + 4));
   5052             int32x4_t v_src3 = cv_vrndq_s32_f32(vld1q_f32(src + x + 8));
   5053             int32x4_t v_src4 = cv_vrndq_s32_f32(vld1q_f32(src + x + 12));
   5054             int8x8_t v_dst1 = vqmovn_s16(vcombine_s16(vqmovn_s32(v_src1), vqmovn_s32(v_src2)));
   5055             int8x8_t v_dst2 = vqmovn_s16(vcombine_s16(vqmovn_s32(v_src3), vqmovn_s32(v_src4)));
   5056             vst1q_s8(dst + x, vcombine_s8(v_dst1, v_dst2));
   5057         }
   5058 
   5059         return x;
   5060     }
   5061 };
   5062 
   5063 
   5064 template <>
   5065 struct Cvt_SIMD<float, ushort>
   5066 {
   5067     int operator() (const float * src, ushort * dst, int width) const
   5068     {
   5069         int x = 0;
   5070 
   5071         for ( ; x <= width - 8; x += 8)
   5072         {
   5073             uint32x4_t v_src1 = cv_vrndq_u32_f32(vld1q_f32(src + x));
   5074             uint32x4_t v_src2 = cv_vrndq_u32_f32(vld1q_f32(src + x + 4));
   5075             vst1q_u16(dst + x, vcombine_u16(vqmovn_u32(v_src1), vqmovn_u32(v_src2)));
   5076         }
   5077 
   5078         return x;
   5079     }
   5080 };
   5081 
   5082 template <>
   5083 struct Cvt_SIMD<float, int>
   5084 {
   5085     int operator() (const float * src, int * dst, int width) const
   5086     {
   5087         int x = 0;
   5088 
   5089         for ( ; x <= width - 4; x += 4)
   5090             vst1q_s32(dst + x, cv_vrndq_s32_f32(vld1q_f32(src + x)));
   5091 
   5092         return x;
   5093     }
   5094 };
   5095 
   5096 #endif
   5097 
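// Generic row-wise conversion without scaling: the Cvt_SIMD functor processes the
// vectorizable prefix of each row, and the remaining elements are handled by the
// (optionally unrolled) scalar loop via saturate_cast<DT>(src[x]).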
   5098 template<typename T, typename DT> static void
   5099 cvt_( const T* src, size_t sstep,
   5100       DT* dst, size_t dstep, Size size )
   5101 {
   5102     sstep /= sizeof(src[0]);
   5103     dstep /= sizeof(dst[0]);
   5104     Cvt_SIMD<T, DT> vop;
   5105 
   5106     for( ; size.height--; src += sstep, dst += dstep )
   5107     {
   5108         int x = vop(src, dst, size.width);
   5109         #if CV_ENABLE_UNROLLED
   5110         for( ; x <= size.width - 4; x += 4 )
   5111         {
   5112             DT t0, t1;
   5113             t0 = saturate_cast<DT>(src[x]);
   5114             t1 = saturate_cast<DT>(src[x+1]);
   5115             dst[x] = t0; dst[x+1] = t1;
   5116             t0 = saturate_cast<DT>(src[x+2]);
   5117             t1 = saturate_cast<DT>(src[x+3]);
   5118             dst[x+2] = t0; dst[x+3] = t1;
   5119         }
   5120         #endif
   5121         for( ; x < size.width; x++ )
   5122             dst[x] = saturate_cast<DT>(src[x]);
   5123     }
   5124 }
   5125 
    5126 // Optimized template specialization: float -> short conversion; exercised by Core_ConvertScale/ElemWiseTest
   5127 template<>  void
   5128 cvt_<float, short>( const float* src, size_t sstep,
   5129      short* dst, size_t dstep, Size size )
   5130 {
   5131     sstep /= sizeof(src[0]);
   5132     dstep /= sizeof(dst[0]);
   5133 
   5134     for( ; size.height--; src += sstep, dst += dstep )
   5135     {
   5136         int x = 0;
   5137         #if   CV_SSE2
   5138         if(USE_SSE2)
   5139         {
   5140             for( ; x <= size.width - 8; x += 8 )
   5141             {
   5142                 __m128 src128 = _mm_loadu_ps (src + x);
   5143                 __m128i src_int128 = _mm_cvtps_epi32 (src128);
   5144 
   5145                 src128 = _mm_loadu_ps (src + x + 4);
   5146                 __m128i src1_int128 = _mm_cvtps_epi32 (src128);
   5147 
   5148                 src1_int128 = _mm_packs_epi32(src_int128, src1_int128);
   5149                 _mm_storeu_si128((__m128i*)(dst + x),src1_int128);
   5150             }
   5151         }
   5152         #elif CV_NEON
   5153         for( ; x <= size.width - 8; x += 8 )
   5154         {
   5155             float32x4_t v_src1 = vld1q_f32(src + x), v_src2 = vld1q_f32(src + x + 4);
   5156             int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_src1)),
   5157                                            vqmovn_s32(cv_vrndq_s32_f32(v_src2)));
   5158             vst1q_s16(dst + x, v_dst);
   5159         }
   5160         #endif
   5161         for( ; x < size.width; x++ )
   5162             dst[x] = saturate_cast<short>(src[x]);
   5163     }
   5164 
   5165 }
   5166 
   5167 
   5168 template<typename T> static void
   5169 cpy_( const T* src, size_t sstep, T* dst, size_t dstep, Size size )
   5170 {
   5171     sstep /= sizeof(src[0]);
   5172     dstep /= sizeof(dst[0]);
   5173 
   5174     for( ; size.height--; src += sstep, dst += dstep )
   5175         memcpy(dst, src, size.width*sizeof(src[0]));
   5176 }
   5177 
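// The macros below stamp out one wrapper function per (source depth, destination depth)
// pair. All wrappers share the same signature -- (src, sstep, unused, unused, dst, dstep,
// size, scale) -- so they can be stored in the depth-indexed dispatch tables that back the
// convertTo/convertScaleAbs implementations later in this file; the otherwise unused
// uchar*/size_t arguments only exist to keep that signature uniform.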
   5178 #define DEF_CVT_SCALE_ABS_FUNC(suffix, tfunc, stype, dtype, wtype) \
   5179 static void cvtScaleAbs##suffix( const stype* src, size_t sstep, const uchar*, size_t, \
   5180                          dtype* dst, size_t dstep, Size size, double* scale) \
   5181 { \
   5182     tfunc(src, sstep, dst, dstep, size, (wtype)scale[0], (wtype)scale[1]); \
   5183 }
   5184 
   5185 #define DEF_CVT_SCALE_FUNC(suffix, stype, dtype, wtype) \
   5186 static void cvtScale##suffix( const stype* src, size_t sstep, const uchar*, size_t, \
   5187 dtype* dst, size_t dstep, Size size, double* scale) \
   5188 { \
   5189     cvtScale_(src, sstep, dst, dstep, size, (wtype)scale[0], (wtype)scale[1]); \
   5190 }
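// For reference, an invocation such as DEF_CVT_SCALE_FUNC(16u8u, ushort, uchar, float)
// expands to a thin wrapper of this form:
//
//   static void cvtScale16u8u( const ushort* src, size_t sstep, const uchar*, size_t,
//                              uchar* dst, size_t dstep, Size size, double* scale )
//   {
//       cvtScale_(src, sstep, dst, dstep, size, (float)scale[0], (float)scale[1]);
//   }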
   5191 
   5192 #if defined(HAVE_IPP)
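// With IPP available, the generated cvt* wrappers first try the corresponding ippiConvert_*
// primitive and fall back to the generic cvt_() kernel if the IPP call fails. The _F2
// variant is for the *RSfs flavors, which additionally take a rounding mode
// (ippRndFinancial) and a scale factor.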
   5193 #define DEF_CVT_FUNC_F(suffix, stype, dtype, ippFavor) \
   5194 static void cvt##suffix( const stype* src, size_t sstep, const uchar*, size_t, \
   5195                          dtype* dst, size_t dstep, Size size, double*) \
   5196 { \
   5197     CV_IPP_CHECK()\
   5198     {\
   5199         if (src && dst)\
   5200         {\
   5201             if (ippiConvert_##ippFavor(src, (int)sstep, dst, (int)dstep, ippiSize(size.width, size.height)) >= 0) \
   5202             {\
   5203                 CV_IMPL_ADD(CV_IMPL_IPP)\
   5204                 return; \
   5205             }\
   5206             setIppErrorStatus(); \
   5207         }\
   5208     }\
   5209     cvt_(src, sstep, dst, dstep, size); \
   5210 }
   5211 
   5212 #define DEF_CVT_FUNC_F2(suffix, stype, dtype, ippFavor) \
   5213 static void cvt##suffix( const stype* src, size_t sstep, const uchar*, size_t, \
   5214                          dtype* dst, size_t dstep, Size size, double*) \
   5215 { \
   5216     CV_IPP_CHECK()\
   5217     {\
   5218         if (src && dst)\
   5219         {\
   5220             if (ippiConvert_##ippFavor(src, (int)sstep, dst, (int)dstep, ippiSize(size.width, size.height), ippRndFinancial, 0) >= 0) \
   5221             {\
   5222                 CV_IMPL_ADD(CV_IMPL_IPP)\
   5223                 return; \
   5224             }\
   5225             setIppErrorStatus(); \
   5226         }\
   5227     }\
   5228     cvt_(src, sstep, dst, dstep, size); \
   5229 }
   5230 #else
   5231 #define DEF_CVT_FUNC_F(suffix, stype, dtype, ippFavor) \
   5232 static void cvt##suffix( const stype* src, size_t sstep, const uchar*, size_t, \
   5233                          dtype* dst, size_t dstep, Size size, double*) \
   5234 { \
   5235     cvt_(src, sstep, dst, dstep, size); \
   5236 }
   5237 #define DEF_CVT_FUNC_F2 DEF_CVT_FUNC_F
   5238 #endif
   5239 
   5240 #define DEF_CVT_FUNC(suffix, stype, dtype) \
   5241 static void cvt##suffix( const stype* src, size_t sstep, const uchar*, size_t, \
   5242                          dtype* dst, size_t dstep, Size size, double*) \
   5243 { \
   5244     cvt_(src, sstep, dst, dstep, size); \
   5245 }
   5246 
   5247 #define DEF_CPY_FUNC(suffix, stype) \
   5248 static void cvt##suffix( const stype* src, size_t sstep, const uchar*, size_t, \
   5249                          stype* dst, size_t dstep, Size size, double*) \
   5250 { \
   5251     cpy_(src, sstep, dst, dstep, size); \
   5252 }
   5253 
   5254 
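// The instantiations below build the full conversion matrix. Suffixes encode OpenCV depth
// names (8u, 8s, 16u, 16s, 32s, 32f, 64f); a single suffix such as 16s denotes a same-depth
// conversion. For the scaling variants the trailing wtype argument selects the intermediate
// working type: float suffices for most pairs, while double is used where 32-bit integer or
// 64-bit floating-point precision would otherwise be lost.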
   5255 DEF_CVT_SCALE_ABS_FUNC(8u, cvtScaleAbs_, uchar, uchar, float)
   5256 DEF_CVT_SCALE_ABS_FUNC(8s8u, cvtScaleAbs_, schar, uchar, float)
   5257 DEF_CVT_SCALE_ABS_FUNC(16u8u, cvtScaleAbs_, ushort, uchar, float)
   5258 DEF_CVT_SCALE_ABS_FUNC(16s8u, cvtScaleAbs_, short, uchar, float)
   5259 DEF_CVT_SCALE_ABS_FUNC(32s8u, cvtScaleAbs_, int, uchar, float)
   5260 DEF_CVT_SCALE_ABS_FUNC(32f8u, cvtScaleAbs_, float, uchar, float)
   5261 DEF_CVT_SCALE_ABS_FUNC(64f8u, cvtScaleAbs_, double, uchar, float)
   5262 
   5263 DEF_CVT_SCALE_FUNC(8u,     uchar, uchar, float)
   5264 DEF_CVT_SCALE_FUNC(8s8u,   schar, uchar, float)
   5265 DEF_CVT_SCALE_FUNC(16u8u,  ushort, uchar, float)
   5266 DEF_CVT_SCALE_FUNC(16s8u,  short, uchar, float)
   5267 DEF_CVT_SCALE_FUNC(32s8u,  int, uchar, float)
   5268 DEF_CVT_SCALE_FUNC(32f8u,  float, uchar, float)
   5269 DEF_CVT_SCALE_FUNC(64f8u,  double, uchar, float)
   5270 
   5271 DEF_CVT_SCALE_FUNC(8u8s,   uchar, schar, float)
   5272 DEF_CVT_SCALE_FUNC(8s,     schar, schar, float)
   5273 DEF_CVT_SCALE_FUNC(16u8s,  ushort, schar, float)
   5274 DEF_CVT_SCALE_FUNC(16s8s,  short, schar, float)
   5275 DEF_CVT_SCALE_FUNC(32s8s,  int, schar, float)
   5276 DEF_CVT_SCALE_FUNC(32f8s,  float, schar, float)
   5277 DEF_CVT_SCALE_FUNC(64f8s,  double, schar, float)
   5278 
   5279 DEF_CVT_SCALE_FUNC(8u16u,  uchar, ushort, float)
   5280 DEF_CVT_SCALE_FUNC(8s16u,  schar, ushort, float)
   5281 DEF_CVT_SCALE_FUNC(16u,    ushort, ushort, float)
   5282 DEF_CVT_SCALE_FUNC(16s16u, short, ushort, float)
   5283 DEF_CVT_SCALE_FUNC(32s16u, int, ushort, float)
   5284 DEF_CVT_SCALE_FUNC(32f16u, float, ushort, float)
   5285 DEF_CVT_SCALE_FUNC(64f16u, double, ushort, float)
   5286 
   5287 DEF_CVT_SCALE_FUNC(8u16s,  uchar, short, float)
   5288 DEF_CVT_SCALE_FUNC(8s16s,  schar, short, float)
   5289 DEF_CVT_SCALE_FUNC(16u16s, ushort, short, float)
   5290 DEF_CVT_SCALE_FUNC(16s,    short, short, float)
   5291 DEF_CVT_SCALE_FUNC(32s16s, int, short, float)
   5292 DEF_CVT_SCALE_FUNC(32f16s, float, short, float)
   5293 DEF_CVT_SCALE_FUNC(64f16s, double, short, float)
   5294 
   5295 DEF_CVT_SCALE_FUNC(8u32s,  uchar, int, float)
   5296 DEF_CVT_SCALE_FUNC(8s32s,  schar, int, float)
   5297 DEF_CVT_SCALE_FUNC(16u32s, ushort, int, float)
   5298 DEF_CVT_SCALE_FUNC(16s32s, short, int, float)
   5299 DEF_CVT_SCALE_FUNC(32s,    int, int, double)
   5300 DEF_CVT_SCALE_FUNC(32f32s, float, int, float)
   5301 DEF_CVT_SCALE_FUNC(64f32s, double, int, double)
   5302 
   5303 DEF_CVT_SCALE_FUNC(8u32f,  uchar, float, float)
   5304 DEF_CVT_SCALE_FUNC(8s32f,  schar, float, float)
   5305 DEF_CVT_SCALE_FUNC(16u32f, ushort, float, float)
   5306 DEF_CVT_SCALE_FUNC(16s32f, short, float, float)
   5307 DEF_CVT_SCALE_FUNC(32s32f, int, float, double)
   5308 DEF_CVT_SCALE_FUNC(32f,    float, float, float)
   5309 DEF_CVT_SCALE_FUNC(64f32f, double, float, double)
   5310 
   5311 DEF_CVT_SCALE_FUNC(8u64f,  uchar, double, double)
   5312 DEF_CVT_SCALE_FUNC(8s64f,  schar, double, double)
   5313 DEF_CVT_SCALE_FUNC(16u64f, ushort, double, double)
   5314 DEF_CVT_SCALE_FUNC(16s64f, short, double, double)
   5315 DEF_CVT_SCALE_FUNC(32s64f, int, double, double)
   5316 DEF_CVT_SCALE_FUNC(32f64f, float, double, double)
   5317 DEF_CVT_SCALE_FUNC(64f,    double, double, double)
   5318 
   5319 DEF_CPY_FUNC(8u,     uchar)
   5320 DEF_CVT_FUNC_F(8s8u,   schar, uchar, 8s8u_C1Rs)
   5321 DEF_CVT_FUNC_F(16u8u,  ushort, uchar, 16u8u_C1R)
   5322 DEF_CVT_FUNC_F(16s8u,  short, uchar, 16s8u_C1R)
   5323 DEF_CVT_FUNC_F(32s8u,  int, uchar, 32s8u_C1R)
   5324 DEF_CVT_FUNC_F2(32f8u,  float, uchar, 32f8u_C1RSfs)
   5325 DEF_CVT_FUNC(64f8u,  double, uchar)
   5326 
   5327 DEF_CVT_FUNC_F2(8u8s,   uchar, schar, 8u8s_C1RSfs)
   5328 DEF_CVT_FUNC_F2(16u8s,  ushort, schar, 16u8s_C1RSfs)
   5329 DEF_CVT_FUNC_F2(16s8s,  short, schar, 16s8s_C1RSfs)
   5330 DEF_CVT_FUNC_F(32s8s,  int, schar, 32s8s_C1R)
   5331 DEF_CVT_FUNC_F2(32f8s,  float, schar, 32f8s_C1RSfs)
   5332 DEF_CVT_FUNC(64f8s,  double, schar)
   5333 
   5334 DEF_CVT_FUNC_F(8u16u,  uchar, ushort, 8u16u_C1R)
   5335 DEF_CVT_FUNC_F(8s16u,  schar, ushort, 8s16u_C1Rs)
   5336 DEF_CPY_FUNC(16u,    ushort)
   5337 DEF_CVT_FUNC_F(16s16u, short, ushort, 16s16u_C1Rs)
   5338 DEF_CVT_FUNC_F2(32s16u, int, ushort, 32s16u_C1RSfs)
   5339 DEF_CVT_FUNC_F2(32f16u, float, ushort, 32f16u_C1RSfs)
   5340 DEF_CVT_FUNC(64f16u, double, ushort)
   5341 
   5342 DEF_CVT_FUNC_F(8u16s,  uchar, short, 8u16s_C1R)
   5343 DEF_CVT_FUNC_F(8s16s,  schar, short, 8s16s_C1R)
   5344 DEF_CVT_FUNC_F2(16u16s, ushort, short, 16u16s_C1RSfs)
   5345 DEF_CVT_FUNC_F2(32s16s, int, short, 32s16s_C1RSfs)
   5346 DEF_CVT_FUNC(32f16s, float, short)
   5347 DEF_CVT_FUNC(64f16s, double, short)
   5348 
   5349 DEF_CVT_FUNC_F(8u32s,  uchar, int, 8u32s_C1R)
   5350 DEF_CVT_FUNC_F(8s32s,  schar, int, 8s32s_C1R)
   5351 DEF_CVT_FUNC_F(16u32s, ushort, int, 16u32s_C1R)
   5352 DEF_CVT_FUNC_F(16s32s, short, int, 16s32s_C1R)
   5353 DEF_CPY_FUNC(32s,    int)
   5354 DEF_CVT_FUNC_F2(32f32s, float, int, 32f32s_C1RSfs)
   5355 DEF_CVT_FUNC(64f32s, double, int)
   5356 
   5357 DEF_CVT_FUNC_F(8u32f,  uchar, float, 8u32f_C1R)
   5358 DEF_CVT_FUNC_F(8s32f,  schar, float, 8s32f_C1R)
   5359 DEF_CVT_FUNC_F(16u32f, ushort, float, 16u32f_C1R)
   5360 DEF_CVT_FUNC_F(16s32f, short, float, 16s32f_C1R)
   5361 DEF_CVT_FUNC_F(32s32f, int, float, 32s32f_C1R)
   5362 DEF_CVT_FUNC(64f32f, double, float)
   5363 
   5364 DEF_CVT_FUNC(8u64f,  uchar, double)
   5365 DEF_CVT_FUNC(8s64f,  schar, double)
   5366 DEF_CVT_FUNC(16u64f, ushort, double)
   5367 DEF_CVT_FUNC(16s64f, short, double)
   5368 DEF_CVT_FUNC(32s64f, int, double)
   5369 DEF_CVT_FUNC(32f64f, float, double)
   5370 DEF_CPY_FUNC(64s,    int64)
   5371 
   5372 static BinaryFunc getCvtScaleAbsFunc(int depth)
   5373 {
   5374     static BinaryFunc cvtScaleAbsTab[] =
   5375     {
   5376         (BinaryFunc)cvtScaleAbs8u, (BinaryFunc)cvtScaleAbs8s8u, (BinaryFunc)cvtScaleAbs16u8u,
   5377         (BinaryFunc)cvtScaleAbs16s8u, (BinaryFunc)cvtScaleAbs32s8u, (BinaryFunc)cvtScaleAbs32f8u,
   5378         (BinaryFunc)cvtScaleAbs64f8u, 0
   5379     };
   5380 
   5381     return cvtScaleAbsTab[depth];
   5382 }
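// Note (added sketch, not part of the build): convertScaleAbs always writes CV_8U,
// so the table above is indexed by the source depth alone.  Illustrative lookup:
//
//   double ab[] = { alpha, beta };                  // as set up in cv::convertScaleAbs() below
//   BinaryFunc fn = getCvtScaleAbsFunc(CV_32F);     // resolves to cvtScaleAbs32f8u
//   // fn(srcPtr, srcStep, 0, 0, dstPtr, dstStep, size, ab);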
   5383 
   5384 BinaryFunc getConvertFunc(int sdepth, int ddepth)
   5385 {
   5386     static BinaryFunc cvtTab[][8] =
   5387     {
   5388         {
   5389             (BinaryFunc)(cvt8u), (BinaryFunc)GET_OPTIMIZED(cvt8s8u), (BinaryFunc)GET_OPTIMIZED(cvt16u8u),
   5390             (BinaryFunc)GET_OPTIMIZED(cvt16s8u), (BinaryFunc)GET_OPTIMIZED(cvt32s8u), (BinaryFunc)GET_OPTIMIZED(cvt32f8u),
   5391             (BinaryFunc)GET_OPTIMIZED(cvt64f8u), 0
   5392         },
   5393         {
   5394             (BinaryFunc)GET_OPTIMIZED(cvt8u8s), (BinaryFunc)cvt8u, (BinaryFunc)GET_OPTIMIZED(cvt16u8s),
   5395             (BinaryFunc)GET_OPTIMIZED(cvt16s8s), (BinaryFunc)GET_OPTIMIZED(cvt32s8s), (BinaryFunc)GET_OPTIMIZED(cvt32f8s),
   5396             (BinaryFunc)GET_OPTIMIZED(cvt64f8s), 0
   5397         },
   5398         {
   5399             (BinaryFunc)GET_OPTIMIZED(cvt8u16u), (BinaryFunc)GET_OPTIMIZED(cvt8s16u), (BinaryFunc)cvt16u,
   5400             (BinaryFunc)GET_OPTIMIZED(cvt16s16u), (BinaryFunc)GET_OPTIMIZED(cvt32s16u), (BinaryFunc)GET_OPTIMIZED(cvt32f16u),
   5401             (BinaryFunc)GET_OPTIMIZED(cvt64f16u), 0
   5402         },
   5403         {
   5404             (BinaryFunc)GET_OPTIMIZED(cvt8u16s), (BinaryFunc)GET_OPTIMIZED(cvt8s16s), (BinaryFunc)GET_OPTIMIZED(cvt16u16s),
   5405             (BinaryFunc)cvt16u, (BinaryFunc)GET_OPTIMIZED(cvt32s16s), (BinaryFunc)GET_OPTIMIZED(cvt32f16s),
   5406             (BinaryFunc)GET_OPTIMIZED(cvt64f16s), 0
   5407         },
   5408         {
   5409             (BinaryFunc)GET_OPTIMIZED(cvt8u32s), (BinaryFunc)GET_OPTIMIZED(cvt8s32s), (BinaryFunc)GET_OPTIMIZED(cvt16u32s),
   5410             (BinaryFunc)GET_OPTIMIZED(cvt16s32s), (BinaryFunc)cvt32s, (BinaryFunc)GET_OPTIMIZED(cvt32f32s),
   5411             (BinaryFunc)GET_OPTIMIZED(cvt64f32s), 0
   5412         },
   5413         {
   5414             (BinaryFunc)GET_OPTIMIZED(cvt8u32f), (BinaryFunc)GET_OPTIMIZED(cvt8s32f), (BinaryFunc)GET_OPTIMIZED(cvt16u32f),
   5415             (BinaryFunc)GET_OPTIMIZED(cvt16s32f), (BinaryFunc)GET_OPTIMIZED(cvt32s32f), (BinaryFunc)cvt32s,
   5416             (BinaryFunc)GET_OPTIMIZED(cvt64f32f), 0
   5417         },
   5418         {
   5419             (BinaryFunc)GET_OPTIMIZED(cvt8u64f), (BinaryFunc)GET_OPTIMIZED(cvt8s64f), (BinaryFunc)GET_OPTIMIZED(cvt16u64f),
   5420             (BinaryFunc)GET_OPTIMIZED(cvt16s64f), (BinaryFunc)GET_OPTIMIZED(cvt32s64f), (BinaryFunc)GET_OPTIMIZED(cvt32f64f),
   5421             (BinaryFunc)(cvt64s), 0
   5422         },
   5423         {
   5424             0, 0, 0, 0, 0, 0, 0, 0
   5425         }
   5426     };
   5427 
   5428     return cvtTab[CV_MAT_DEPTH(ddepth)][CV_MAT_DEPTH(sdepth)];
   5429 }
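// Sketch (illustrative, not part of the build): the table above is indexed as
// cvtTab[destination depth][source depth], so an 8U -> 32F conversion resolves to:
//
//   BinaryFunc fn = getConvertFunc(CV_8U, CV_32F);  // cvtTab[CV_32F][CV_8U] == cvt8u32f
//   // fn(srcPtr, srcStep, 0, 0, dstPtr, dstStep, size, 0);  // trailing double* is unused here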
   5430 
   5431 static BinaryFunc getConvertScaleFunc(int sdepth, int ddepth)
   5432 {
   5433     static BinaryFunc cvtScaleTab[][8] =
   5434     {
   5435         {
   5436             (BinaryFunc)GET_OPTIMIZED(cvtScale8u), (BinaryFunc)GET_OPTIMIZED(cvtScale8s8u), (BinaryFunc)GET_OPTIMIZED(cvtScale16u8u),
   5437             (BinaryFunc)GET_OPTIMIZED(cvtScale16s8u), (BinaryFunc)GET_OPTIMIZED(cvtScale32s8u), (BinaryFunc)GET_OPTIMIZED(cvtScale32f8u),
   5438             (BinaryFunc)cvtScale64f8u, 0
   5439         },
   5440         {
   5441             (BinaryFunc)GET_OPTIMIZED(cvtScale8u8s), (BinaryFunc)GET_OPTIMIZED(cvtScale8s), (BinaryFunc)GET_OPTIMIZED(cvtScale16u8s),
   5442             (BinaryFunc)GET_OPTIMIZED(cvtScale16s8s), (BinaryFunc)GET_OPTIMIZED(cvtScale32s8s), (BinaryFunc)GET_OPTIMIZED(cvtScale32f8s),
   5443             (BinaryFunc)cvtScale64f8s, 0
   5444         },
   5445         {
   5446             (BinaryFunc)GET_OPTIMIZED(cvtScale8u16u), (BinaryFunc)GET_OPTIMIZED(cvtScale8s16u), (BinaryFunc)GET_OPTIMIZED(cvtScale16u),
   5447             (BinaryFunc)GET_OPTIMIZED(cvtScale16s16u), (BinaryFunc)GET_OPTIMIZED(cvtScale32s16u), (BinaryFunc)GET_OPTIMIZED(cvtScale32f16u),
   5448             (BinaryFunc)cvtScale64f16u, 0
   5449         },
   5450         {
   5451             (BinaryFunc)GET_OPTIMIZED(cvtScale8u16s), (BinaryFunc)GET_OPTIMIZED(cvtScale8s16s), (BinaryFunc)GET_OPTIMIZED(cvtScale16u16s),
   5452             (BinaryFunc)GET_OPTIMIZED(cvtScale16s), (BinaryFunc)GET_OPTIMIZED(cvtScale32s16s), (BinaryFunc)GET_OPTIMIZED(cvtScale32f16s),
   5453             (BinaryFunc)cvtScale64f16s, 0
   5454         },
   5455         {
   5456             (BinaryFunc)GET_OPTIMIZED(cvtScale8u32s), (BinaryFunc)GET_OPTIMIZED(cvtScale8s32s), (BinaryFunc)GET_OPTIMIZED(cvtScale16u32s),
   5457             (BinaryFunc)GET_OPTIMIZED(cvtScale16s32s), (BinaryFunc)GET_OPTIMIZED(cvtScale32s), (BinaryFunc)GET_OPTIMIZED(cvtScale32f32s),
   5458             (BinaryFunc)cvtScale64f32s, 0
   5459         },
   5460         {
   5461             (BinaryFunc)GET_OPTIMIZED(cvtScale8u32f), (BinaryFunc)GET_OPTIMIZED(cvtScale8s32f), (BinaryFunc)GET_OPTIMIZED(cvtScale16u32f),
   5462             (BinaryFunc)GET_OPTIMIZED(cvtScale16s32f), (BinaryFunc)GET_OPTIMIZED(cvtScale32s32f), (BinaryFunc)GET_OPTIMIZED(cvtScale32f),
   5463             (BinaryFunc)cvtScale64f32f, 0
   5464         },
   5465         {
   5466             (BinaryFunc)cvtScale8u64f, (BinaryFunc)cvtScale8s64f, (BinaryFunc)cvtScale16u64f,
   5467             (BinaryFunc)cvtScale16s64f, (BinaryFunc)cvtScale32s64f, (BinaryFunc)cvtScale32f64f,
   5468             (BinaryFunc)cvtScale64f, 0
   5469         },
   5470         {
   5471             0, 0, 0, 0, 0, 0, 0, 0
   5472         }
   5473     };
   5474 
   5475     return cvtScaleTab[CV_MAT_DEPTH(ddepth)][CV_MAT_DEPTH(sdepth)];
   5476 }
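// Sketch (illustrative, not part of the build): same [ddepth][sdepth] layout as cvtTab,
// but the selected function reads alpha and beta from the trailing double* parameter:
//
//   double ab[] = { 1.0/255.0, 0.0 };                    // alpha, beta
//   BinaryFunc fn = getConvertScaleFunc(CV_8U, CV_32F);  // == cvtScale8u32f
//   // fn(srcPtr, srcStep, 0, 0, dstPtr, dstStep, size, ab);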
   5477 
   5478 #ifdef HAVE_OPENCL
   5479 
   5480 static bool ocl_convertScaleAbs( InputArray _src, OutputArray _dst, double alpha, double beta )
   5481 {
   5482     const ocl::Device & d = ocl::Device::getDefault();
   5483 
   5484     int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
   5485     bool doubleSupport = d.doubleFPConfig() > 0;
   5486     if (!doubleSupport && depth == CV_64F)
   5487         return false;
   5488 
   5489     _dst.create(_src.size(), CV_8UC(cn));
   5490     int kercn = 1;
   5491     if (d.isIntel())
   5492     {
   5493         static const int vectorWidths[] = {4, 4, 4, 4, 4, 4, 4, -1};
   5494         kercn = ocl::checkOptimalVectorWidth( vectorWidths, _src, _dst,
   5495                                               noArray(), noArray(), noArray(),
   5496                                               noArray(), noArray(), noArray(),
   5497                                               noArray(), ocl::OCL_VECTOR_MAX);
   5498     }
   5499     else
   5500         kercn = ocl::predictOptimalVectorWidthMax(_src, _dst);
   5501 
   5502     int rowsPerWI = d.isIntel() ? 4 : 1;
   5503     char cvt[2][50];
   5504     int wdepth = std::max(depth, CV_32F);
   5505     String build_opt = format("-D OP_CONVERT_SCALE_ABS -D UNARY_OP -D dstT=%s -D srcT1=%s"
   5506                          " -D workT=%s -D wdepth=%d -D convertToWT1=%s -D convertToDT=%s"
   5507                          " -D workT1=%s -D rowsPerWI=%d%s",
   5508                          ocl::typeToStr(CV_8UC(kercn)),
   5509                          ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)),
   5510                          ocl::typeToStr(CV_MAKE_TYPE(wdepth, kercn)), wdepth,
   5511                          ocl::convertTypeStr(depth, wdepth, kercn, cvt[0]),
   5512                          ocl::convertTypeStr(wdepth, CV_8U, kercn, cvt[1]),
   5513                          ocl::typeToStr(wdepth), rowsPerWI,
   5514                          doubleSupport ? " -D DOUBLE_SUPPORT" : "");
   5515     ocl::Kernel k("KF", ocl::core::arithm_oclsrc, build_opt);
   5516     if (k.empty())
   5517         return false;
   5518 
   5519     UMat src = _src.getUMat();
   5520     UMat dst = _dst.getUMat();
   5521 
   5522     ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src),
   5523             dstarg = ocl::KernelArg::WriteOnly(dst, cn, kercn);
   5524 
   5525     if (wdepth == CV_32F)
   5526         k.args(srcarg, dstarg, (float)alpha, (float)beta);
   5527     else if (wdepth == CV_64F)
   5528         k.args(srcarg, dstarg, alpha, beta);
   5529 
   5530     size_t globalsize[2] = { src.cols * cn / kercn, (src.rows + rowsPerWI - 1) / rowsPerWI };
   5531     return k.run(2, globalsize, NULL, false);
   5532 }
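// The OpenCL path above reuses the generic "KF" kernel from ocl::core::arithm_oclsrc
// with -D OP_CONVERT_SCALE_ABS: each work item handles kercn packed elements and,
// on Intel devices, rowsPerWI (= 4) rows.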
   5533 
   5534 #endif
   5535 
   5536 }
   5537 
   5538 void cv::convertScaleAbs( InputArray _src, OutputArray _dst, double alpha, double beta )
   5539 {
   5540     CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
   5541                ocl_convertScaleAbs(_src, _dst, alpha, beta))
   5542 
   5543     Mat src = _src.getMat();
   5544     int cn = src.channels();
   5545     double scale[] = {alpha, beta};
   5546     _dst.create( src.dims, src.size, CV_8UC(cn) );
   5547     Mat dst = _dst.getMat();
   5548     BinaryFunc func = getCvtScaleAbsFunc(src.depth());
   5549     CV_Assert( func != 0 );
   5550 
   5551     if( src.dims <= 2 )
   5552     {
   5553         Size sz = getContinuousSize(src, dst, cn);
   5554         func( src.ptr(), src.step, 0, 0, dst.ptr(), dst.step, sz, scale );
   5555     }
   5556     else
   5557     {
   5558         const Mat* arrays[] = {&src, &dst, 0};
   5559         uchar* ptrs[2];
   5560         NAryMatIterator it(arrays, ptrs);
   5561         Size sz((int)it.size*cn, 1);
   5562 
   5563         for( size_t i = 0; i < it.nplanes; i++, ++it )
   5564             func( ptrs[0], 0, 0, 0, ptrs[1], 0, sz, scale );
   5565     }
   5566 }
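// Usage sketch (illustrative, not part of the library): the result is
// saturate_cast<uchar>(|alpha*src + beta|), handy for visualizing signed or
// floating-point data.  'a' and 'b' below are assumed CV_32F images:
//
//   cv::Mat diff = a - b;
//   cv::Mat vis;
//   cv::convertScaleAbs(diff, vis, 2.0, 16.0);   // vis is CV_8U with the same channel count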
   5567 
   5568 void cv::Mat::convertTo(OutputArray _dst, int _type, double alpha, double beta) const
   5569 {
   5570     bool noScale = fabs(alpha-1) < DBL_EPSILON && fabs(beta) < DBL_EPSILON;
   5571 
   5572     if( _type < 0 )
   5573         _type = _dst.fixedType() ? _dst.type() : type();
   5574     else
   5575         _type = CV_MAKETYPE(CV_MAT_DEPTH(_type), channels());
   5576 
   5577     int sdepth = depth(), ddepth = CV_MAT_DEPTH(_type);
   5578     if( sdepth == ddepth && noScale )
   5579     {
   5580         copyTo(_dst);
   5581         return;
   5582     }
   5583 
   5584     Mat src = *this;
   5585 
   5586     BinaryFunc func = noScale ? getConvertFunc(sdepth, ddepth) : getConvertScaleFunc(sdepth, ddepth);
   5587     double scale[] = {alpha, beta};
   5588     int cn = channels();
   5589     CV_Assert( func != 0 );
   5590 
   5591     if( dims <= 2 )
   5592     {
   5593         _dst.create( size(), _type );
   5594         Mat dst = _dst.getMat();
   5595         Size sz = getContinuousSize(src, dst, cn);
   5596         func( src.data, src.step, 0, 0, dst.data, dst.step, sz, scale );
   5597     }
   5598     else
   5599     {
   5600         _dst.create( dims, size, _type );
   5601         Mat dst = _dst.getMat();
   5602         const Mat* arrays[] = {&src, &dst, 0};
   5603         uchar* ptrs[2];
   5604         NAryMatIterator it(arrays, ptrs);
   5605         Size sz((int)(it.size*cn), 1);
   5606 
   5607         for( size_t i = 0; i < it.nplanes; i++, ++it )
   5608             func(ptrs[0], 1, 0, 0, ptrs[1], 1, sz, scale);
   5609     }
   5610 }
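// Usage sketch (illustrative, not part of the library):
//
//   cv::Mat img8u;                                // assumed CV_8UC3 input
//   cv::Mat img32f, copy;
//   img8u.convertTo(img32f, CV_32F, 1.0/255.0);   // per element: dst = src/255, channels preserved
//   img8u.convertTo(copy, -1);                    // negative rtype keeps the source depth (plain copy)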
   5611 
   5612 /****************************************************************************************\
   5613 *                                    LUT Transform                                       *
   5614 \****************************************************************************************/
   5615 
   5616 namespace cv
   5617 {
   5618 
   5619 template<typename T> static void
   5620 LUT8u_( const uchar* src, const T* lut, T* dst, int len, int cn, int lutcn )
   5621 {
   5622     if( lutcn == 1 )
   5623     {
   5624         for( int i = 0; i < len*cn; i++ )
   5625             dst[i] = lut[src[i]];
   5626     }
   5627     else
   5628     {
   5629         for( int i = 0; i < len*cn; i += cn )
   5630             for( int k = 0; k < cn; k++ )
   5631                 dst[i+k] = lut[src[i+k]*cn+k];
   5632     }
   5633 }
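// Indexing note: when the table is multi-channel (lutcn == cn, enforced by cv::LUT),
// its entries are interleaved, so channel k of a pixel is looked up as
// lut[src[i+k]*cn + k]; with lutcn == 1 every channel shares the same 256-entry table.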
   5634 
   5635 static void LUT8u_8u( const uchar* src, const uchar* lut, uchar* dst, int len, int cn, int lutcn )
   5636 {
   5637     LUT8u_( src, lut, dst, len, cn, lutcn );
   5638 }
   5639 
   5640 static void LUT8u_8s( const uchar* src, const schar* lut, schar* dst, int len, int cn, int lutcn )
   5641 {
   5642     LUT8u_( src, lut, dst, len, cn, lutcn );
   5643 }
   5644 
   5645 static void LUT8u_16u( const uchar* src, const ushort* lut, ushort* dst, int len, int cn, int lutcn )
   5646 {
   5647     LUT8u_( src, lut, dst, len, cn, lutcn );
   5648 }
   5649 
   5650 static void LUT8u_16s( const uchar* src, const short* lut, short* dst, int len, int cn, int lutcn )
   5651 {
   5652     LUT8u_( src, lut, dst, len, cn, lutcn );
   5653 }
   5654 
   5655 static void LUT8u_32s( const uchar* src, const int* lut, int* dst, int len, int cn, int lutcn )
   5656 {
   5657     LUT8u_( src, lut, dst, len, cn, lutcn );
   5658 }
   5659 
   5660 static void LUT8u_32f( const uchar* src, const float* lut, float* dst, int len, int cn, int lutcn )
   5661 {
   5662     LUT8u_( src, lut, dst, len, cn, lutcn );
   5663 }
   5664 
   5665 static void LUT8u_64f( const uchar* src, const double* lut, double* dst, int len, int cn, int lutcn )
   5666 {
   5667     LUT8u_( src, lut, dst, len, cn, lutcn );
   5668 }
   5669 
   5670 typedef void (*LUTFunc)( const uchar* src, const uchar* lut, uchar* dst, int len, int cn, int lutcn );
   5671 
   5672 static LUTFunc lutTab[] =
   5673 {
   5674     (LUTFunc)LUT8u_8u, (LUTFunc)LUT8u_8s, (LUTFunc)LUT8u_16u, (LUTFunc)LUT8u_16s,
   5675     (LUTFunc)LUT8u_32s, (LUTFunc)LUT8u_32f, (LUTFunc)LUT8u_64f, 0
   5676 };
   5677 
   5678 #ifdef HAVE_OPENCL
   5679 
   5680 static bool ocl_LUT(InputArray _src, InputArray _lut, OutputArray _dst)
   5681 {
   5682     int lcn = _lut.channels(), dcn = _src.channels(), ddepth = _lut.depth();
   5683 
   5684     UMat src = _src.getUMat(), lut = _lut.getUMat();
   5685     _dst.create(src.size(), CV_MAKETYPE(ddepth, dcn));
   5686     UMat dst = _dst.getUMat();
   5687     int kercn = lcn == 1 ? std::min(4, ocl::predictOptimalVectorWidth(_src, _dst)) : dcn;
   5688 
   5689     ocl::Kernel k("LUT", ocl::core::lut_oclsrc,
   5690                   format("-D dcn=%d -D lcn=%d -D srcT=%s -D dstT=%s", kercn, lcn,
   5691                          ocl::typeToStr(src.depth()), ocl::memopTypeToStr(ddepth)));
   5692     if (k.empty())
   5693         return false;
   5694 
   5695     k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::ReadOnlyNoSize(lut),
   5696         ocl::KernelArg::WriteOnly(dst, dcn, kercn));
   5697 
   5698     size_t globalSize[2] = { dst.cols * dcn / kercn, (dst.rows + 3) / 4 };
   5699     return k.run(2, globalSize, NULL, false);
   5700 }
   5701 
   5702 #endif
   5703 
   5704 #if defined(HAVE_IPP)
   5705 namespace ipp {
   5706 
   5707 #if 0 // there are no performance benefits (PR #2653)
   5708 class IppLUTParallelBody_LUTC1 : public ParallelLoopBody
   5709 {
   5710 public:
   5711     bool* ok;
   5712     const Mat& src_;
   5713     const Mat& lut_;
   5714     Mat& dst_;
   5715 
   5716     typedef IppStatus (*IppFn)(const Ipp8u* pSrc, int srcStep, void* pDst, int dstStep,
   5717                           IppiSize roiSize, const void* pTable, int nBitSize);
   5718     IppFn fn;
   5719 
   5720     int width;
   5721 
   5722     IppLUTParallelBody_LUTC1(const Mat& src, const Mat& lut, Mat& dst, bool* _ok)
   5723         : ok(_ok), src_(src), lut_(lut), dst_(dst)
   5724     {
   5725         width = dst.cols * dst.channels();
   5726 
   5727         size_t elemSize1 = CV_ELEM_SIZE1(dst.depth());
   5728 
   5729         fn =
   5730                 elemSize1 == 1 ? (IppFn)ippiLUTPalette_8u_C1R :
   5731                 elemSize1 == 4 ? (IppFn)ippiLUTPalette_8u32u_C1R :
   5732                 NULL;
   5733 
   5734         *ok = (fn != NULL);
   5735     }
   5736 
   5737     void operator()( const cv::Range& range ) const
   5738     {
   5739         if (!*ok)
   5740             return;
   5741 
   5742         const int row0 = range.start;
   5743         const int row1 = range.end;
   5744 
   5745         Mat src = src_.rowRange(row0, row1);
   5746         Mat dst = dst_.rowRange(row0, row1);
   5747 
   5748         IppiSize sz = { width, dst.rows };
   5749 
   5750         CV_DbgAssert(fn != NULL);
   5751         if (fn(src.data, (int)src.step[0], dst.data, (int)dst.step[0], sz, lut_.data, 8) < 0)
   5752         {
   5753             setIppErrorStatus();
   5754             *ok = false;
   5755         }
   5756         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
   5757     }
   5758 private:
   5759     IppLUTParallelBody_LUTC1(const IppLUTParallelBody_LUTC1&);
   5760     IppLUTParallelBody_LUTC1& operator=(const IppLUTParallelBody_LUTC1&);
   5761 };
   5762 #endif
   5763 
   5764 class IppLUTParallelBody_LUTCN : public ParallelLoopBody
   5765 {
   5766 public:
   5767     bool *ok;
   5768     const Mat& src_;
   5769     const Mat& lut_;
   5770     Mat& dst_;
   5771 
   5772     int lutcn;
   5773 
   5774     uchar* lutBuffer;
   5775     uchar* lutTable[4];
   5776 
   5777     IppLUTParallelBody_LUTCN(const Mat& src, const Mat& lut, Mat& dst, bool* _ok)
   5778         : ok(_ok), src_(src), lut_(lut), dst_(dst), lutBuffer(NULL)
   5779     {
   5780         lutcn = lut.channels();
   5781         IppiSize sz256 = {256, 1};
   5782 
   5783         size_t elemSize1 = dst.elemSize1();
   5784         CV_DbgAssert(elemSize1 == 1);
   5785         lutBuffer = (uchar*)ippMalloc(256 * (int)elemSize1 * 4);
   5786         lutTable[0] = lutBuffer + 0;
   5787         lutTable[1] = lutBuffer + 1 * 256 * elemSize1;
   5788         lutTable[2] = lutBuffer + 2 * 256 * elemSize1;
   5789         lutTable[3] = lutBuffer + 3 * 256 * elemSize1;
   5790 
   5791         CV_DbgAssert(lutcn == 3 || lutcn == 4);
   5792         if (lutcn == 3)
   5793         {
   5794             IppStatus status = ippiCopy_8u_C3P3R(lut.ptr(), (int)lut.step[0], lutTable, (int)lut.step[0], sz256);
   5795             if (status < 0)
   5796             {
   5797                 setIppErrorStatus();
   5798                 return;
   5799             }
   5800             CV_IMPL_ADD(CV_IMPL_IPP);
   5801         }
   5802         else if (lutcn == 4)
   5803         {
   5804             IppStatus status = ippiCopy_8u_C4P4R(lut.ptr(), (int)lut.step[0], lutTable, (int)lut.step[0], sz256);
   5805             if (status < 0)
   5806             {
   5807                 setIppErrorStatus();
   5808                 return;
   5809             }
   5810             CV_IMPL_ADD(CV_IMPL_IPP);
   5811         }
   5812 
   5813         *ok = true;
   5814     }
   5815 
   5816     ~IppLUTParallelBody_LUTCN()
   5817     {
   5818         if (lutBuffer != NULL)
   5819             ippFree(lutBuffer);
   5820         lutBuffer = NULL;
   5821         lutTable[0] = NULL;
   5822     }
   5823 
   5824     void operator()( const cv::Range& range ) const
   5825     {
   5826         if (!*ok)
   5827             return;
   5828 
   5829         const int row0 = range.start;
   5830         const int row1 = range.end;
   5831 
   5832         Mat src = src_.rowRange(row0, row1);
   5833         Mat dst = dst_.rowRange(row0, row1);
   5834 
   5835         if (lutcn == 3)
   5836         {
   5837             if (ippiLUTPalette_8u_C3R(
   5838                     src.ptr(), (int)src.step[0], dst.ptr(), (int)dst.step[0],
   5839                     ippiSize(dst.size()), lutTable, 8) >= 0)
   5840             {
   5841                 CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
   5842                 return;
   5843             }
   5844         }
   5845         else if (lutcn == 4)
   5846         {
   5847             if (ippiLUTPalette_8u_C4R(
   5848                     src.ptr(), (int)src.step[0], dst.ptr(), (int)dst.step[0],
   5849                     ippiSize(dst.size()), lutTable, 8) >= 0)
   5850             {
   5851                 CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
   5852                 return;
   5853             }
   5854         }
   5855         setIppErrorStatus();
   5856         *ok = false;
   5857     }
   5858 private:
   5859     IppLUTParallelBody_LUTCN(const IppLUTParallelBody_LUTCN&);
   5860     IppLUTParallelBody_LUTCN& operator=(const IppLUTParallelBody_LUTCN&);
   5861 };
   5862 } // namespace ipp
   5863 #endif // IPP
   5864 
   5865 class LUTParallelBody : public ParallelLoopBody
   5866 {
   5867 public:
   5868     bool* ok;
   5869     const Mat& src_;
   5870     const Mat& lut_;
   5871     Mat& dst_;
   5872 
   5873     LUTFunc func;
   5874 
   5875     LUTParallelBody(const Mat& src, const Mat& lut, Mat& dst, bool* _ok)
   5876         : ok(_ok), src_(src), lut_(lut), dst_(dst)
   5877     {
   5878         func = lutTab[lut.depth()];
   5879         *ok = (func != NULL);
   5880     }
   5881 
   5882     void operator()( const cv::Range& range ) const
   5883     {
   5884         CV_DbgAssert(*ok);
   5885 
   5886         const int row0 = range.start;
   5887         const int row1 = range.end;
   5888 
   5889         Mat src = src_.rowRange(row0, row1);
   5890         Mat dst = dst_.rowRange(row0, row1);
   5891 
   5892         int cn = src.channels();
   5893         int lutcn = lut_.channels();
   5894 
   5895         const Mat* arrays[] = {&src, &dst, 0};
   5896         uchar* ptrs[2];
   5897         NAryMatIterator it(arrays, ptrs);
   5898         int len = (int)it.size;
   5899 
   5900         for( size_t i = 0; i < it.nplanes; i++, ++it )
   5901             func(ptrs[0], lut_.ptr(), ptrs[1], len, cn, lutcn);
   5902     }
   5903 private:
   5904     LUTParallelBody(const LUTParallelBody&);
   5905     LUTParallelBody& operator=(const LUTParallelBody&);
   5906 };
   5907 
   5908 }
   5909 
   5910 void cv::LUT( InputArray _src, InputArray _lut, OutputArray _dst )
   5911 {
   5912     int cn = _src.channels(), depth = _src.depth();
   5913     int lutcn = _lut.channels();
   5914 
   5915     CV_Assert( (lutcn == cn || lutcn == 1) &&
   5916         _lut.total() == 256 && _lut.isContinuous() &&
   5917         (depth == CV_8U || depth == CV_8S) );
   5918 
   5919     CV_OCL_RUN(_dst.isUMat() && _src.dims() <= 2,
   5920                ocl_LUT(_src, _lut, _dst))
   5921 
   5922     Mat src = _src.getMat(), lut = _lut.getMat();
   5923     _dst.create(src.dims, src.size, CV_MAKETYPE(_lut.depth(), cn));
   5924     Mat dst = _dst.getMat();
   5925 
   5926     if (_src.dims() <= 2)
   5927     {
   5928         bool ok = false;
   5929         Ptr<ParallelLoopBody> body;
   5930 #if defined(HAVE_IPP)
   5931         CV_IPP_CHECK()
   5932         {
   5933             size_t elemSize1 = CV_ELEM_SIZE1(dst.depth());
   5934 #if 0 // there are no performance benefits (PR #2653)
   5935             if (lutcn == 1)
   5936             {
   5937                 ParallelLoopBody* p = new ipp::IppLUTParallelBody_LUTC1(src, lut, dst, &ok);
   5938                 body.reset(p);
   5939             }
   5940             else
   5941 #endif
   5942             if ((lutcn == 3 || lutcn == 4) && elemSize1 == 1)
   5943             {
   5944                 ParallelLoopBody* p = new ipp::IppLUTParallelBody_LUTCN(src, lut, dst, &ok);
   5945                 body.reset(p);
   5946             }
   5947         }
   5948 #endif
   5949         if (body == NULL || ok == false)
   5950         {
   5951             ok = false;
   5952             ParallelLoopBody* p = new LUTParallelBody(src, lut, dst, &ok);
   5953             body.reset(p);
   5954         }
   5955         if (body != NULL && ok)
   5956         {
   5957             Range all(0, dst.rows);
   5958             if (dst.total()>>18)
   5959                 parallel_for_(all, *body, (double)std::max((size_t)1, dst.total()>>16));
   5960             else
   5961                 (*body)(all);
   5962             if (ok)
   5963                 return;
   5964         }
   5965     }
   5966 
   5967     LUTFunc func = lutTab[lut.depth()];
   5968     CV_Assert( func != 0 );
   5969 
   5970     const Mat* arrays[] = {&src, &dst, 0};
   5971     uchar* ptrs[2];
   5972     NAryMatIterator it(arrays, ptrs);
   5973     int len = (int)it.size;
   5974 
   5975     for( size_t i = 0; i < it.nplanes; i++, ++it )
   5976         func(ptrs[0], lut.ptr(), ptrs[1], len, cn, lutcn);
   5977 }
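// Usage sketch (illustrative, not part of the library): a 256-entry table remaps
// every byte of an 8-bit image, e.g. a simple gamma correction ('src8u' is an
// assumed CV_8U input, std::pow comes from <cmath>):
//
//   cv::Mat table(1, 256, CV_8U);
//   for( int i = 0; i < 256; i++ )
//       table.at<uchar>(i) = cv::saturate_cast<uchar>(255.0*std::pow(i/255.0, 1.0/2.2));
//   cv::Mat corrected;
//   cv::LUT(src8u, table, corrected);
//
// For matrices with at least 2^18 elements the implementation above runs the lookup
// in parallel, using roughly one stripe per 2^16 elements.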
   5978 
   5979 namespace cv {
   5980 
   5981 #ifdef HAVE_OPENCL
   5982 
   5983 static bool ocl_normalize( InputArray _src, InputOutputArray _dst, InputArray _mask, int dtype,
   5984                            double scale, double delta )
   5985 {
   5986     UMat src = _src.getUMat();
   5987 
   5988     if( _mask.empty() )
   5989         src.convertTo( _dst, dtype, scale, delta );
   5990     else if (src.channels() <= 4)
   5991     {
   5992         const ocl::Device & dev = ocl::Device::getDefault();
   5993 
   5994         int stype = _src.type(), sdepth = CV_MAT_DEPTH(stype), cn = CV_MAT_CN(stype),
   5995                 ddepth = CV_MAT_DEPTH(dtype), wdepth = std::max(CV_32F, std::max(sdepth, ddepth)),
   5996                 rowsPerWI = dev.isIntel() ? 4 : 1;
   5997 
   5998         float fscale = static_cast<float>(scale), fdelta = static_cast<float>(delta);
   5999         bool haveScale = std::fabs(scale - 1) > DBL_EPSILON,
   6000                 haveZeroScale = !(std::fabs(scale) > DBL_EPSILON),
   6001                 haveDelta = std::fabs(delta) > DBL_EPSILON,
   6002                 doubleSupport = dev.doubleFPConfig() > 0;
   6003 
   6004         if (!haveScale && !haveDelta && stype == dtype)
   6005         {
   6006             _src.copyTo(_dst, _mask);
   6007             return true;
   6008         }
   6009         if (haveZeroScale)
   6010         {
   6011             _dst.setTo(Scalar(delta), _mask);
   6012             return true;
   6013         }
   6014 
   6015         if ((sdepth == CV_64F || ddepth == CV_64F) && !doubleSupport)
   6016             return false;
   6017 
   6018         char cvt[2][40];
   6019         String opts = format("-D srcT=%s -D dstT=%s -D convertToWT=%s -D cn=%d -D rowsPerWI=%d"
   6020                              " -D convertToDT=%s -D workT=%s%s%s%s -D srcT1=%s -D dstT1=%s",
   6021                              ocl::typeToStr(stype), ocl::typeToStr(dtype),
   6022                              ocl::convertTypeStr(sdepth, wdepth, cn, cvt[0]), cn,
   6023                              rowsPerWI, ocl::convertTypeStr(wdepth, ddepth, cn, cvt[1]),
   6024                              ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)),
   6025                              doubleSupport ? " -D DOUBLE_SUPPORT" : "",
   6026                              haveScale ? " -D HAVE_SCALE" : "",
   6027                              haveDelta ? " -D HAVE_DELTA" : "",
   6028                              ocl::typeToStr(sdepth), ocl::typeToStr(ddepth));
   6029 
   6030         ocl::Kernel k("normalizek", ocl::core::normalize_oclsrc, opts);
   6031         if (k.empty())
   6032             return false;
   6033 
   6034         UMat mask = _mask.getUMat(), dst = _dst.getUMat();
   6035 
   6036         ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src),
   6037                 maskarg = ocl::KernelArg::ReadOnlyNoSize(mask),
   6038                 dstarg = ocl::KernelArg::ReadWrite(dst);
   6039 
   6040         if (haveScale)
   6041         {
   6042             if (haveDelta)
   6043                 k.args(srcarg, maskarg, dstarg, fscale, fdelta);
   6044             else
   6045                 k.args(srcarg, maskarg, dstarg, fscale);
   6046         }
   6047         else
   6048         {
   6049             if (haveDelta)
   6050                 k.args(srcarg, maskarg, dstarg, fdelta);
   6051             else
   6052                 k.args(srcarg, maskarg, dstarg);
   6053         }
   6054 
   6055         size_t globalsize[2] = { src.cols, (src.rows + rowsPerWI - 1) / rowsPerWI };
   6056         return k.run(2, globalsize, NULL, false);
   6057     }
   6058     else
   6059     {
   6060         UMat temp;
   6061         src.convertTo( temp, dtype, scale, delta );
   6062         temp.copyTo( _dst, _mask );
   6063     }
   6064 
   6065     return true;
   6066 }
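// Behaviour of the OpenCL path above: with no mask it degenerates to convertTo();
// with more than 4 channels it converts into a temporary and does a masked copy;
// only the masked 1-4 channel case uses the dedicated "normalizek" kernel.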
   6067 
   6068 #endif
   6069 
   6070 }
   6071 
   6072 void cv::normalize( InputArray _src, InputOutputArray _dst, double a, double b,
   6073                     int norm_type, int rtype, InputArray _mask )
   6074 {
   6075     double scale = 1, shift = 0;
   6076     if( norm_type == CV_MINMAX )
   6077     {
   6078         double smin = 0, smax = 0;
   6079         double dmin = MIN( a, b ), dmax = MAX( a, b );
   6080         minMaxLoc( _src, &smin, &smax, 0, 0, _mask );
   6081         scale = (dmax - dmin)*(smax - smin > DBL_EPSILON ? 1./(smax - smin) : 0);
   6082         shift = dmin - smin*scale;
   6083     }
   6084     else if( norm_type == CV_L2 || norm_type == CV_L1 || norm_type == CV_C )
   6085     {
   6086         scale = norm( _src, norm_type, _mask );
   6087         scale = scale > DBL_EPSILON ? a/scale : 0.;
   6088         shift = 0;
   6089     }
   6090     else
   6091         CV_Error( CV_StsBadArg, "Unknown/unsupported norm type" );
   6092 
   6093     int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
   6094     if( rtype < 0 )
   6095         rtype = _dst.fixedType() ? _dst.depth() : depth;
   6096     _dst.createSameSize(_src, CV_MAKETYPE(rtype, cn));
   6097 
   6098     CV_OCL_RUN(_dst.isUMat(),
   6099                ocl_normalize(_src, _dst, _mask, rtype, scale, shift))
   6100 
   6101     Mat src = _src.getMat(), dst = _dst.getMat();
   6102     if( _mask.empty() )
   6103         src.convertTo( dst, rtype, scale, shift );
   6104     else
   6105     {
   6106         Mat temp;
   6107         src.convertTo( temp, rtype, scale, shift );
   6108         temp.copyTo( dst, _mask );
   6109     }
   6110 }
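// Usage sketch (illustrative, not part of the library); 'scores' and 'vec' are
// assumed nonempty CV_32F inputs:
//
//   cv::Mat vis;
//   cv::normalize(scores, vis, 0, 255, cv::NORM_MINMAX, CV_8U);   // stretch value range to [0,255]
//   cv::normalize(vec, vec, 1.0, 0.0, cv::NORM_L2);               // scale so that ||vec||_2 == 1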
   6111 
   6112 CV_IMPL void
   6113 cvSplit( const void* srcarr, void* dstarr0, void* dstarr1, void* dstarr2, void* dstarr3 )
   6114 {
   6115     void* dptrs[] = { dstarr0, dstarr1, dstarr2, dstarr3 };
   6116     cv::Mat src = cv::cvarrToMat(srcarr);
   6117     int i, j, nz = 0;
   6118     for( i = 0; i < 4; i++ )
   6119         nz += dptrs[i] != 0;
   6120     CV_Assert( nz > 0 );
   6121     std::vector<cv::Mat> dvec(nz);
   6122     std::vector<int> pairs(nz*2);
   6123 
   6124     for( i = j = 0; i < 4; i++ )
   6125     {
   6126         if( dptrs[i] != 0 )
   6127         {
   6128             dvec[j] = cv::cvarrToMat(dptrs[i]);
   6129             CV_Assert( dvec[j].size() == src.size() );
   6130             CV_Assert( dvec[j].depth() == src.depth() );
   6131             CV_Assert( dvec[j].channels() == 1 );
   6132             CV_Assert( i < src.channels() );
   6133             pairs[j*2] = i;
   6134             pairs[j*2+1] = j;
   6135             j++;
   6136         }
   6137     }
   6138     if( nz == src.channels() )
   6139         cv::split( src, dvec );
   6140     else
   6141     {
   6142         cv::mixChannels( &src, 1, &dvec[0], nz, &pairs[0], nz );
   6143     }
   6144 }
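// Legacy C API sketch (illustrative): unused outputs are passed as NULL and each
// non-NULL destination receives the channel at its argument position.
//
//   IplImage* bgr   = ...;                                             // assumed 8-bit, 3-channel
//   IplImage* green = cvCreateImage(cvGetSize(bgr), IPL_DEPTH_8U, 1);
//   cvSplit(bgr, NULL, green, NULL, NULL);                             // channel 1 -> green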
   6145 
   6146 
   6147 CV_IMPL void
   6148 cvMerge( const void* srcarr0, const void* srcarr1, const void* srcarr2,
   6149          const void* srcarr3, void* dstarr )
   6150 {
   6151     const void* sptrs[] = { srcarr0, srcarr1, srcarr2, srcarr3 };
   6152     cv::Mat dst = cv::cvarrToMat(dstarr);
   6153     int i, j, nz = 0;
   6154     for( i = 0; i < 4; i++ )
   6155         nz += sptrs[i] != 0;
   6156     CV_Assert( nz > 0 );
   6157     std::vector<cv::Mat> svec(nz);
   6158     std::vector<int> pairs(nz*2);
   6159 
   6160     for( i = j = 0; i < 4; i++ )
   6161     {
   6162         if( sptrs[i] != 0 )
   6163         {
   6164             svec[j] = cv::cvarrToMat(sptrs[i]);
   6165             CV_Assert( svec[j].size == dst.size &&
   6166                 svec[j].depth() == dst.depth() &&
   6167                 svec[j].channels() == 1 && i < dst.channels() );
   6168             pairs[j*2] = j;
   6169             pairs[j*2+1] = i;
   6170             j++;
   6171         }
   6172     }
   6173 
   6174     if( nz == dst.channels() )
   6175         cv::merge( svec, dst );
   6176     else
   6177     {
   6178         cv::mixChannels( &svec[0], nz, &dst, 1, &pairs[0], nz );
   6179     }
   6180 }
   6181 
   6182 
   6183 CV_IMPL void
   6184 cvMixChannels( const CvArr** src, int src_count,
   6185                CvArr** dst, int dst_count,
   6186                const int* from_to, int pair_count )
   6187 {
   6188     cv::AutoBuffer<cv::Mat> buf(src_count + dst_count);
   6189 
   6190     int i;
   6191     for( i = 0; i < src_count; i++ )
   6192         buf[i] = cv::cvarrToMat(src[i]);
   6193     for( i = 0; i < dst_count; i++ )
   6194         buf[i+src_count] = cv::cvarrToMat(dst[i]);
   6195     cv::mixChannels(&buf[0], src_count, &buf[src_count], dst_count, from_to, pair_count);
   6196 }
   6197 
   6198 CV_IMPL void
   6199 cvConvertScaleAbs( const void* srcarr, void* dstarr,
   6200                    double scale, double shift )
   6201 {
   6202     cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr);
   6203     CV_Assert( src.size == dst.size && dst.type() == CV_8UC(src.channels()));
   6204     cv::convertScaleAbs( src, dst, scale, shift );
   6205 }
   6206 
   6207 CV_IMPL void
   6208 cvConvertScale( const void* srcarr, void* dstarr,
   6209                 double scale, double shift )
   6210 {
   6211     cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr);
   6212 
   6213     CV_Assert( src.size == dst.size && src.channels() == dst.channels() );
   6214     src.convertTo(dst, dst.type(), scale, shift);
   6215 }
   6216 
   6217 CV_IMPL void cvLUT( const void* srcarr, void* dstarr, const void* lutarr )
   6218 {
   6219     cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr), lut = cv::cvarrToMat(lutarr);
   6220 
   6221     CV_Assert( dst.size() == src.size() && dst.type() == CV_MAKETYPE(lut.depth(), src.channels()) );
   6222     cv::LUT( src, lut, dst );
   6223 }
   6224 
   6225 CV_IMPL void cvNormalize( const CvArr* srcarr, CvArr* dstarr,
   6226                           double a, double b, int norm_type, const CvArr* maskarr )
   6227 {
   6228     cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr), mask;
   6229     if( maskarr )
   6230         mask = cv::cvarrToMat(maskarr);
   6231     CV_Assert( dst.size() == src.size() && src.channels() == dst.channels() );
   6232     cv::normalize( src, dst, a, b, norm_type, dst.type(), mask );
   6233 }
   6234 
   6235 /* End of file. */
   6236