// (removed: web code-viewer navigation residue, not part of the source file)
      1 /*M///////////////////////////////////////////////////////////////////////////////////////
      2 //
      3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
      4 //
      5 //  By downloading, copying, installing or using the software you agree to this license.
      6 //  If you do not agree to this license, do not download, install,
      7 //  copy or use the software.
      8 //
      9 //
     10 //                           License Agreement
     11 //                For Open Source Computer Vision Library
     12 //
     13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
     14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
     15 // Copyright (C) 2014, Itseez Inc., all rights reserved.
     16 // Third party copyrights are property of their respective owners.
//
     18 // Redistribution and use in source and binary forms, with or without modification,
     19 // are permitted provided that the following conditions are met:
     20 //
     21 //   * Redistribution's of source code must retain the above copyright notice,
     22 //     this list of conditions and the following disclaimer.
     23 //
     24 //   * Redistribution's in binary form must reproduce the above copyright notice,
     25 //     this list of conditions and the following disclaimer in the documentation
     26 //     and/or other materials provided with the distribution.
     27 //
     28 //   * The name of the copyright holders may not be used to endorse or promote products
     29 //     derived from this software without specific prior written permission.
     30 //
     31 // This software is provided by the copyright holders and contributors "as is" and
     32 // any express or implied warranties, including, but not limited to, the implied
     33 // warranties of merchantability and fitness for a particular purpose are disclaimed.
     34 // In no event shall the Intel Corporation or contributors be liable for any direct,
     35 // indirect, incidental, special, exemplary, or consequential damages
     36 // (including, but not limited to, procurement of substitute goods or services;
     37 // loss of use, data, or profits; or business interruption) however caused
     38 // and on any theory of liability, whether in contract, strict liability,
     39 // or tort (including negligence or otherwise) arising in any way out of
     40 // the use of this software, even if advised of the possibility of such damage.
     41 //
     42 //M*/
     43 
     44 #include "precomp.hpp"
     45 #include "opencl_kernels_imgproc.hpp"
     46 
     47 namespace cv
     48 {
     49 
     50 template <typename T, typename AT>
     51 struct Acc_SIMD
     52 {
     53     int operator() (const T *, AT *, const uchar *, int, int) const
     54     {
     55         return 0;
     56     }
     57 };
     58 
     59 template <typename T, typename AT>
     60 struct AccSqr_SIMD
     61 {
     62     int operator() (const T *, AT *, const uchar *, int, int) const
     63     {
     64         return 0;
     65     }
     66 };
     67 
     68 template <typename T, typename AT>
     69 struct AccProd_SIMD
     70 {
     71     int operator() (const T *, const T *, AT *, const uchar *, int, int) const
     72     {
     73         return 0;
     74     }
     75 };
     76 
     77 template <typename T, typename AT>
     78 struct AccW_SIMD
     79 {
     80     int operator() (const T *, AT *, const uchar *, int, int, AT) const
     81     {
     82         return 0;
     83     }
     84 };
     85 
     86 #if CV_NEON
     87 
// NEON: dst += src for uchar source / float accumulator. Processes 16 pixels
// per iteration, widening u8 -> u16 -> u32 then converting to f32. Returns the
// number of elements handled; acc_() finishes the scalar tail. A masked SIMD
// path exists only for single-channel data.
template <>
struct Acc_SIMD<uchar, float>
{
    int operator() (const uchar * src, float * dst, const uchar * mask, int len, int cn) const
    {
        int x = 0;

        if (!mask)
        {
            len *= cn;  // no mask: treat interleaved channels as one flat row
            for ( ; x <= len - 16; x += 16)
            {
                uint8x16_t v_src = vld1q_u8(src + x);
                uint16x8_t v_src0 = vmovl_u8(vget_low_u8(v_src)), v_src1 = vmovl_u8(vget_high_u8(v_src));

                vst1q_f32(dst + x, vaddq_f32(vld1q_f32(dst + x), vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0)))));
                vst1q_f32(dst + x + 4, vaddq_f32(vld1q_f32(dst + x + 4), vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0)))));
                vst1q_f32(dst + x + 8, vaddq_f32(vld1q_f32(dst + x + 8), vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1)))));
                vst1q_f32(dst + x + 12, vaddq_f32(vld1q_f32(dst + x + 12), vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1)))));
            }
        }
        else if (cn == 1)
        {
            // 255 ^ (mask == 0) is 0xFF where mask is non-zero and 0x00 elsewhere;
            // ANDing with src zeroes masked-out pixels so they add nothing to dst.
            uint8x16_t v_255 = vdupq_n_u8(255), v_0 = vdupq_n_u8(0);

            for ( ; x <= len - 16; x += 16)
            {
                uint8x16_t v_src = vandq_u8(vld1q_u8(src + x), veorq_u8(v_255, vceqq_u8(vld1q_u8(mask + x), v_0)));
                uint16x8_t v_src0 = vmovl_u8(vget_low_u8(v_src)), v_src1 = vmovl_u8(vget_high_u8(v_src));

                vst1q_f32(dst + x, vaddq_f32(vld1q_f32(dst + x), vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0)))));
                vst1q_f32(dst + x + 4, vaddq_f32(vld1q_f32(dst + x + 4), vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0)))));
                vst1q_f32(dst + x + 8, vaddq_f32(vld1q_f32(dst + x + 8), vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1)))));
                vst1q_f32(dst + x + 12, vaddq_f32(vld1q_f32(dst + x + 12), vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1)))));
            }
        }

        return x;
    }
};
    128 
// NEON: dst += src for ushort source / float accumulator, 8 elements per
// iteration (u16 widened to u32, converted to f32). Only the unmasked path is
// vectorized; masked processing falls back to the scalar loop in acc_().
template <>
struct Acc_SIMD<ushort, float>
{
    int operator() (const ushort * src, float * dst, const uchar * mask, int len, int cn) const
    {
        int x = 0;

        if (!mask)
        {
            len *= cn;  // no mask: treat interleaved channels as one flat row
            for ( ; x <= len - 8; x += 8)
            {
                uint16x8_t v_src = vld1q_u16(src + x);
                uint32x4_t v_src0 = vmovl_u16(vget_low_u16(v_src)), v_src1 = vmovl_u16(vget_high_u16(v_src));

                vst1q_f32(dst + x, vaddq_f32(vld1q_f32(dst + x), vcvtq_f32_u32(v_src0)));
                vst1q_f32(dst + x + 4, vaddq_f32(vld1q_f32(dst + x + 4), vcvtq_f32_u32(v_src1)));
            }
        }

        return x;
    }
};
    152 
// NEON: dst += src for float source / float accumulator, 8 floats per
// iteration. Only the unmasked path is vectorized.
template <>
struct Acc_SIMD<float, float>
{
    int operator() (const float * src, float * dst, const uchar * mask, int len, int cn) const
    {
        int x = 0;

        if (!mask)
        {
            len *= cn;  // no mask: treat interleaved channels as one flat row
            for ( ; x <= len - 8; x += 8)
            {
                vst1q_f32(dst + x, vaddq_f32(vld1q_f32(dst + x), vld1q_f32(src + x)));
                vst1q_f32(dst + x + 4, vaddq_f32(vld1q_f32(dst + x + 4), vld1q_f32(src + x + 4)));
            }
        }

        return x;
    }
};
    173 
// NEON: dst += src*src for uchar source / float accumulator. vmull_u8 squares
// 8 pixels into u16 without overflow (255*255 < 65536); results are widened to
// u32 and converted to f32. Masked SIMD path exists only for cn == 1.
template <>
struct AccSqr_SIMD<uchar, float>
{
    int operator() (const uchar * src, float * dst, const uchar * mask, int len, int cn) const
    {
        int x = 0;

        if (!mask)
        {
            len *= cn;  // no mask: treat interleaved channels as one flat row
            for ( ; x <= len - 16; x += 16)
            {
                uint8x16_t v_src = vld1q_u8(src + x);
                uint8x8_t v_src_0 = vget_low_u8(v_src), v_src_1 = vget_high_u8(v_src);
                uint16x8_t v_src0 = vmull_u8(v_src_0, v_src_0), v_src1 = vmull_u8(v_src_1, v_src_1);

                vst1q_f32(dst + x, vaddq_f32(vld1q_f32(dst + x), vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0)))));
                vst1q_f32(dst + x + 4, vaddq_f32(vld1q_f32(dst + x + 4), vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0)))));
                vst1q_f32(dst + x + 8, vaddq_f32(vld1q_f32(dst + x + 8), vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1)))));
                vst1q_f32(dst + x + 12, vaddq_f32(vld1q_f32(dst + x + 12), vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1)))));
            }
        }
        else if (cn == 1)
        {
            // Zero out masked pixels before squaring: 255 ^ (mask == 0) keeps
            // only pixels whose mask byte is non-zero.
            uint8x16_t v_255 = vdupq_n_u8(255), v_0 = vdupq_n_u8(0);

            for ( ; x <= len - 16; x += 16)
            {
                uint8x16_t v_src = vandq_u8(vld1q_u8(src + x), veorq_u8(v_255, vceqq_u8(vld1q_u8(mask + x), v_0)));
                uint8x8_t v_src_0 = vget_low_u8(v_src), v_src_1 = vget_high_u8(v_src);
                uint16x8_t v_src0 = vmull_u8(v_src_0, v_src_0), v_src1 = vmull_u8(v_src_1, v_src_1);

                vst1q_f32(dst + x, vaddq_f32(vld1q_f32(dst + x), vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0)))));
                vst1q_f32(dst + x + 4, vaddq_f32(vld1q_f32(dst + x + 4), vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0)))));
                vst1q_f32(dst + x + 8, vaddq_f32(vld1q_f32(dst + x + 8), vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1)))));
                vst1q_f32(dst + x + 12, vaddq_f32(vld1q_f32(dst + x + 12), vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1)))));
            }
        }

        return x;
    }
};
    216 
// NEON: dst += src*src for ushort source / float accumulator, 8 elements per
// iteration via vmull_u16 (u16*u16 -> u32). For the masked cn == 1 path the
// 8-byte mask is zipped with itself to build a 16-bit per-element mask.
template <>
struct AccSqr_SIMD<ushort, float>
{
    int operator() (const ushort * src, float * dst, const uchar * mask, int len, int cn) const
    {
        int x = 0;

        if (!mask)
        {
            len *= cn;  // no mask: treat interleaved channels as one flat row
            for ( ; x <= len - 8; x += 8)
            {
                uint16x8_t v_src = vld1q_u16(src + x);
                uint16x4_t v_src_0 = vget_low_u16(v_src), v_src_1 = vget_high_u16(v_src);
                uint32x4_t v_src0 = vmull_u16(v_src_0, v_src_0), v_src1 = vmull_u16(v_src_1, v_src_1);

                vst1q_f32(dst + x, vaddq_f32(vld1q_f32(dst + x), vcvtq_f32_u32(v_src0)));
                vst1q_f32(dst + x + 4, vaddq_f32(vld1q_f32(dst + x + 4), vcvtq_f32_u32(v_src1)));
            }
        }
        else if (cn == 1)
        {
            uint8x8_t v_255 = vdup_n_u8(255), v_0 = vdup_n_u8(0);

            for ( ; x <= len - 8; x += 8)
            {
                // Expand the 8-bit mask to 16 bits per element: duplicate each
                // mask byte (vzip), then reinterpret the pair as u16 lanes.
                uint8x8_t v_mask_src = veor_u8(v_255, vceq_u8(vld1_u8(mask + x), v_0));
                uint8x8x2_t v_mask_zp = vzip_u8(v_mask_src, v_mask_src);
                uint16x8_t v_mask = vreinterpretq_u16_u8(vcombine_u8(v_mask_zp.val[0], v_mask_zp.val[1])),
                           v_src = vandq_u16(vld1q_u16(src + x), v_mask);

                uint16x4_t v_src_0 = vget_low_u16(v_src), v_src_1 = vget_high_u16(v_src);
                uint32x4_t v_src0 = vmull_u16(v_src_0, v_src_0), v_src1 = vmull_u16(v_src_1, v_src_1);

                vst1q_f32(dst + x, vaddq_f32(vld1q_f32(dst + x), vcvtq_f32_u32(v_src0)));
                vst1q_f32(dst + x + 4, vaddq_f32(vld1q_f32(dst + x + 4), vcvtq_f32_u32(v_src1)));
            }
        }

        return x;
    }
};
    259 
// NEON: dst += src*src for float source / float accumulator using fused
// multiply-accumulate (vmlaq_f32), 8 floats per iteration. Only the unmasked
// path is vectorized.
template <>
struct AccSqr_SIMD<float, float>
{
    int operator() (const float * src, float * dst, const uchar * mask, int len, int cn) const
    {
        int x = 0;

        if (!mask)
        {
            len *= cn;  // no mask: treat interleaved channels as one flat row
            for ( ; x <= len - 8; x += 8)
            {
                float32x4_t v_src = vld1q_f32(src + x);
                vst1q_f32(dst + x, vmlaq_f32(vld1q_f32(dst + x), v_src, v_src));

                v_src = vld1q_f32(src + x + 4);
                vst1q_f32(dst + x + 4, vmlaq_f32(vld1q_f32(dst + x + 4), v_src, v_src));
            }
        }

        return x;
    }
};
    283 
// NEON: dst += src1*src2 for uchar sources / float accumulator. vmull_u8
// multiplies 8 pixel pairs into u16 without overflow; results are widened to
// u32 and converted to f32. Masked SIMD path exists only for cn == 1.
template <>
struct AccProd_SIMD<uchar, float>
{
    int operator() (const uchar * src1, const uchar * src2, float * dst, const uchar * mask, int len, int cn) const
    {
        int x = 0;

        if (!mask)
        {
            len *= cn;  // no mask: treat interleaved channels as one flat row
            for ( ; x <= len - 16; x += 16)
            {
                uint8x16_t v_1src = vld1q_u8(src1 + x), v_2src = vld1q_u8(src2 + x);
                uint16x8_t v_src0 = vmull_u8(vget_low_u8(v_1src), vget_low_u8(v_2src)),
                           v_src1 = vmull_u8(vget_high_u8(v_1src), vget_high_u8(v_2src));

                vst1q_f32(dst + x, vaddq_f32(vld1q_f32(dst + x), vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0)))));
                vst1q_f32(dst + x + 4, vaddq_f32(vld1q_f32(dst + x + 4), vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0)))));
                vst1q_f32(dst + x + 8, vaddq_f32(vld1q_f32(dst + x + 8), vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1)))));
                vst1q_f32(dst + x + 12, vaddq_f32(vld1q_f32(dst + x + 12), vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1)))));
            }
        }
        else if (cn == 1)
        {
            // Zero out masked pixels in ONE source; the product is then zero
            // for those elements, so they contribute nothing to dst.
            uint8x16_t v_255 = vdupq_n_u8(255), v_0 = vdupq_n_u8(0);

            for ( ; x <= len - 16; x += 16)
            {
                uint8x16_t v_mask = veorq_u8(v_255, vceqq_u8(vld1q_u8(mask + x), v_0));
                uint8x16_t v_1src = vandq_u8(vld1q_u8(src1 + x), v_mask), v_2src = vandq_u8(vld1q_u8(src2 + x), v_mask);
                uint16x8_t v_src0 = vmull_u8(vget_low_u8(v_1src), vget_low_u8(v_2src)),
                           v_src1 = vmull_u8(vget_high_u8(v_1src), vget_high_u8(v_2src));

                vst1q_f32(dst + x, vaddq_f32(vld1q_f32(dst + x), vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0)))));
                vst1q_f32(dst + x + 4, vaddq_f32(vld1q_f32(dst + x + 4), vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0)))));
                vst1q_f32(dst + x + 8, vaddq_f32(vld1q_f32(dst + x + 8), vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1)))));
                vst1q_f32(dst + x + 12, vaddq_f32(vld1q_f32(dst + x + 12), vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1)))));
            }
        }

        return x;
    }
};
    327 
// NEON: dst += src1*src2 for ushort sources / float accumulator, 8 elements
// per iteration via vmull_u16. The masked cn == 1 path expands the 8-bit mask
// to 16 bits per element (vzip) and applies it to both sources.
template <>
struct AccProd_SIMD<ushort, float>
{
    int operator() (const ushort * src1, const ushort * src2, float * dst, const uchar * mask, int len, int cn) const
    {
        int x = 0;

        if (!mask)
        {
            len *= cn;  // no mask: treat interleaved channels as one flat row
            for ( ; x <= len - 8; x += 8)
            {
                uint16x8_t v_1src = vld1q_u16(src1 + x), v_2src = vld1q_u16(src2 + x);
                uint32x4_t v_src0 = vmull_u16(vget_low_u16(v_1src), vget_low_u16(v_2src)),
                           v_src1 = vmull_u16(vget_high_u16(v_1src), vget_high_u16(v_2src));

                vst1q_f32(dst + x, vaddq_f32(vld1q_f32(dst + x), vcvtq_f32_u32(v_src0)));
                vst1q_f32(dst + x + 4, vaddq_f32(vld1q_f32(dst + x + 4), vcvtq_f32_u32(v_src1)));
            }
        }
        else if (cn == 1)
        {
            uint8x8_t v_255 = vdup_n_u8(255), v_0 = vdup_n_u8(0);

            for ( ; x <= len - 8; x += 8)
            {
                // Duplicate each mask byte into a 16-bit lane, then clear
                // masked-out elements in both sources before multiplying.
                uint8x8_t v_mask_src = veor_u8(v_255, vceq_u8(vld1_u8(mask + x), v_0));
                uint8x8x2_t v_mask_zp = vzip_u8(v_mask_src, v_mask_src);
                uint16x8_t v_mask = vreinterpretq_u16_u8(vcombine_u8(v_mask_zp.val[0], v_mask_zp.val[1])),
                           v_1src = vandq_u16(vld1q_u16(src1 + x), v_mask),
                           v_2src = vandq_u16(vld1q_u16(src2 + x), v_mask);

                uint32x4_t v_src0 = vmull_u16(vget_low_u16(v_1src), vget_low_u16(v_2src)),
                           v_src1 = vmull_u16(vget_high_u16(v_1src), vget_high_u16(v_2src));

                vst1q_f32(dst + x, vaddq_f32(vld1q_f32(dst + x), vcvtq_f32_u32(v_src0)));
                vst1q_f32(dst + x + 4, vaddq_f32(vld1q_f32(dst + x + 4), vcvtq_f32_u32(v_src1)));
            }
        }

        return x;
    }
};
    371 
// NEON: dst += src1*src2 for float sources / float accumulator using fused
// multiply-accumulate (vmlaq_f32), 8 floats per iteration. Only the unmasked
// path is vectorized.
template <>
struct AccProd_SIMD<float, float>
{
    int operator() (const float * src1, const float * src2, float * dst, const uchar * mask, int len, int cn) const
    {
        int x = 0;

        if (!mask)
        {
            len *= cn;  // no mask: treat interleaved channels as one flat row
            for ( ; x <= len - 8; x += 8)
            {
                vst1q_f32(dst + x, vmlaq_f32(vld1q_f32(dst + x), vld1q_f32(src1 + x), vld1q_f32(src2 + x)));
                vst1q_f32(dst + x + 4, vmlaq_f32(vld1q_f32(dst + x + 4), vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4)));
            }
        }

        return x;
    }
};
    392 
// NEON: running weighted average dst = src*alpha + dst*(1-alpha) for uchar
// source / float accumulator, 16 pixels per iteration. Note there is no
// masked SIMD path here (unlike Acc/AccSqr/AccProd) -- masked processing is
// left entirely to the scalar loop in accW_().
template <>
struct AccW_SIMD<uchar, float>
{
    int operator() (const uchar * src, float * dst, const uchar * mask, int len, int cn, float alpha) const
    {
        int x = 0;
        float32x4_t v_alpha = vdupq_n_f32(alpha), v_beta = vdupq_n_f32(1.0f - alpha);

        if (!mask)
        {
            len *= cn;  // no mask: treat interleaved channels as one flat row
            for ( ; x <= len - 16; x += 16)
            {
                uint8x16_t v_src = vld1q_u8(src + x);
                uint16x8_t v_src0 = vmovl_u8(vget_low_u8(v_src)), v_src1 = vmovl_u8(vget_high_u8(v_src));

                // dst = dst*beta + src*alpha, four lanes at a time
                vst1q_f32(dst + x, vmlaq_f32(vmulq_f32(vld1q_f32(dst + x), v_beta),
                                             vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0))), v_alpha));
                vst1q_f32(dst + x + 4, vmlaq_f32(vmulq_f32(vld1q_f32(dst + x + 4), v_beta),
                                             vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0))), v_alpha));
                vst1q_f32(dst + x + 8, vmlaq_f32(vmulq_f32(vld1q_f32(dst + x + 8), v_beta),
                                                 vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), v_alpha));
                vst1q_f32(dst + x + 12, vmlaq_f32(vmulq_f32(vld1q_f32(dst + x + 12), v_beta),
                                                  vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), v_alpha));
            }
        }

        return x;
    }
};
    423 
// NEON: running weighted average dst = src*alpha + dst*(1-alpha) for ushort
// source / float accumulator, 8 elements per iteration. Only the unmasked
// path is vectorized.
template <>
struct AccW_SIMD<ushort, float>
{
    int operator() (const ushort * src, float * dst, const uchar * mask, int len, int cn, float alpha) const
    {
        int x = 0;
        float32x4_t v_alpha = vdupq_n_f32(alpha), v_beta = vdupq_n_f32(1.0f - alpha);

        if (!mask)
        {
            len *= cn;  // no mask: treat interleaved channels as one flat row
            for ( ; x <= len - 8; x += 8)
            {
                uint16x8_t v_src = vld1q_u16(src + x);
                uint32x4_t v_src0 = vmovl_u16(vget_low_u16(v_src)), v_src1 = vmovl_u16(vget_high_u16(v_src));

                vst1q_f32(dst + x, vmlaq_f32(vmulq_f32(vld1q_f32(dst + x), v_beta), vcvtq_f32_u32(v_src0), v_alpha));
                vst1q_f32(dst + x + 4, vmlaq_f32(vmulq_f32(vld1q_f32(dst + x + 4), v_beta), vcvtq_f32_u32(v_src1), v_alpha));
            }
        }

        return x;
    }
};
    448 
// NEON: running weighted average dst = src*alpha + dst*(1-alpha) for float
// source / float accumulator, 8 floats per iteration. Only the unmasked path
// is vectorized.
template <>
struct AccW_SIMD<float, float>
{
    int operator() (const float * src, float * dst, const uchar * mask, int len, int cn, float alpha) const
    {
        int x = 0;
        float32x4_t v_alpha = vdupq_n_f32(alpha), v_beta = vdupq_n_f32(1.0f - alpha);

        if (!mask)
        {
            len *= cn;  // no mask: treat interleaved channels as one flat row
            for ( ; x <= len - 8; x += 8)
            {
                vst1q_f32(dst + x, vmlaq_f32(vmulq_f32(vld1q_f32(dst + x), v_beta), vld1q_f32(src + x), v_alpha));
                vst1q_f32(dst + x + 4, vmlaq_f32(vmulq_f32(vld1q_f32(dst + x + 4), v_beta), vld1q_f32(src + x + 4), v_alpha));
            }
        }

        return x;
    }
};
    470 
    471 #endif
    472 
    473 template<typename T, typename AT> void
    474 acc_( const T* src, AT* dst, const uchar* mask, int len, int cn )
    475 {
    476     int i = Acc_SIMD<T, AT>()(src, dst, mask, len, cn);
    477 
    478     if( !mask )
    479     {
    480         len *= cn;
    481         #if CV_ENABLE_UNROLLED
    482         for( ; i <= len - 4; i += 4 )
    483         {
    484             AT t0, t1;
    485             t0 = src[i] + dst[i];
    486             t1 = src[i+1] + dst[i+1];
    487             dst[i] = t0; dst[i+1] = t1;
    488 
    489             t0 = src[i+2] + dst[i+2];
    490             t1 = src[i+3] + dst[i+3];
    491             dst[i+2] = t0; dst[i+3] = t1;
    492         }
    493         #endif
    494         for( ; i < len; i++ )
    495             dst[i] += src[i];
    496     }
    497     else if( cn == 1 )
    498     {
    499         for( ; i < len; i++ )
    500         {
    501             if( mask[i] )
    502                 dst[i] += src[i];
    503         }
    504     }
    505     else if( cn == 3 )
    506     {
    507         for( ; i < len; i++, src += 3, dst += 3 )
    508         {
    509             if( mask[i] )
    510             {
    511                 AT t0 = src[0] + dst[0];
    512                 AT t1 = src[1] + dst[1];
    513                 AT t2 = src[2] + dst[2];
    514 
    515                 dst[0] = t0; dst[1] = t1; dst[2] = t2;
    516             }
    517         }
    518     }
    519     else
    520     {
    521         for( ; i < len; i++, src += cn, dst += cn )
    522             if( mask[i] )
    523             {
    524                 for( int k = 0; k < cn; k++ )
    525                     dst[k] += src[k];
    526             }
    527     }
    528 }
    529 
    530 
    531 template<typename T, typename AT> void
    532 accSqr_( const T* src, AT* dst, const uchar* mask, int len, int cn )
    533 {
    534     int i = AccSqr_SIMD<T, AT>()(src, dst, mask, len, cn);
    535 
    536     if( !mask )
    537     {
    538         len *= cn;
    539          #if CV_ENABLE_UNROLLED
    540         for( ; i <= len - 4; i += 4 )
    541         {
    542             AT t0, t1;
    543             t0 = (AT)src[i]*src[i] + dst[i];
    544             t1 = (AT)src[i+1]*src[i+1] + dst[i+1];
    545             dst[i] = t0; dst[i+1] = t1;
    546 
    547             t0 = (AT)src[i+2]*src[i+2] + dst[i+2];
    548             t1 = (AT)src[i+3]*src[i+3] + dst[i+3];
    549             dst[i+2] = t0; dst[i+3] = t1;
    550         }
    551         #endif
    552         for( ; i < len; i++ )
    553             dst[i] += (AT)src[i]*src[i];
    554     }
    555     else if( cn == 1 )
    556     {
    557         for( ; i < len; i++ )
    558         {
    559             if( mask[i] )
    560                 dst[i] += (AT)src[i]*src[i];
    561         }
    562     }
    563     else if( cn == 3 )
    564     {
    565         for( ; i < len; i++, src += 3, dst += 3 )
    566         {
    567             if( mask[i] )
    568             {
    569                 AT t0 = (AT)src[0]*src[0] + dst[0];
    570                 AT t1 = (AT)src[1]*src[1] + dst[1];
    571                 AT t2 = (AT)src[2]*src[2] + dst[2];
    572 
    573                 dst[0] = t0; dst[1] = t1; dst[2] = t2;
    574             }
    575         }
    576     }
    577     else
    578     {
    579         for( ; i < len; i++, src += cn, dst += cn )
    580             if( mask[i] )
    581             {
    582                 for( int k = 0; k < cn; k++ )
    583                     dst[k] += (AT)src[k]*src[k];
    584             }
    585     }
    586 }
    587 
    588 
    589 template<typename T, typename AT> void
    590 accProd_( const T* src1, const T* src2, AT* dst, const uchar* mask, int len, int cn )
    591 {
    592     int i = AccProd_SIMD<T, AT>()(src1, src2, dst, mask, len, cn);
    593 
    594     if( !mask )
    595     {
    596         len *= cn;
    597         #if CV_ENABLE_UNROLLED
    598         for( ; i <= len - 4; i += 4 )
    599         {
    600             AT t0, t1;
    601             t0 = (AT)src1[i]*src2[i] + dst[i];
    602             t1 = (AT)src1[i+1]*src2[i+1] + dst[i+1];
    603             dst[i] = t0; dst[i+1] = t1;
    604 
    605             t0 = (AT)src1[i+2]*src2[i+2] + dst[i+2];
    606             t1 = (AT)src1[i+3]*src2[i+3] + dst[i+3];
    607             dst[i+2] = t0; dst[i+3] = t1;
    608         }
    609         #endif
    610         for( ; i < len; i++ )
    611             dst[i] += (AT)src1[i]*src2[i];
    612     }
    613     else if( cn == 1 )
    614     {
    615         for( ; i < len; i++ )
    616         {
    617             if( mask[i] )
    618                 dst[i] += (AT)src1[i]*src2[i];
    619         }
    620     }
    621     else if( cn == 3 )
    622     {
    623         for( ; i < len; i++, src1 += 3, src2 += 3, dst += 3 )
    624         {
    625             if( mask[i] )
    626             {
    627                 AT t0 = (AT)src1[0]*src2[0] + dst[0];
    628                 AT t1 = (AT)src1[1]*src2[1] + dst[1];
    629                 AT t2 = (AT)src1[2]*src2[2] + dst[2];
    630 
    631                 dst[0] = t0; dst[1] = t1; dst[2] = t2;
    632             }
    633         }
    634     }
    635     else
    636     {
    637         for( ; i < len; i++, src1 += cn, src2 += cn, dst += cn )
    638             if( mask[i] )
    639             {
    640                 for( int k = 0; k < cn; k++ )
    641                     dst[k] += (AT)src1[k]*src2[k];
    642             }
    643     }
    644 }
    645 
    646 
    647 template<typename T, typename AT> void
    648 accW_( const T* src, AT* dst, const uchar* mask, int len, int cn, double alpha )
    649 {
    650     AT a = (AT)alpha, b = 1 - a;
    651     int i = AccW_SIMD<T, AT>()(src, dst, mask, len, cn, a);
    652 
    653     if( !mask )
    654     {
    655         len *= cn;
    656         #if CV_ENABLE_UNROLLED
    657         for( ; i <= len - 4; i += 4 )
    658         {
    659             AT t0, t1;
    660             t0 = src[i]*a + dst[i]*b;
    661             t1 = src[i+1]*a + dst[i+1]*b;
    662             dst[i] = t0; dst[i+1] = t1;
    663 
    664             t0 = src[i+2]*a + dst[i+2]*b;
    665             t1 = src[i+3]*a + dst[i+3]*b;
    666             dst[i+2] = t0; dst[i+3] = t1;
    667         }
    668         #endif
    669         for( ; i < len; i++ )
    670             dst[i] = src[i]*a + dst[i]*b;
    671     }
    672     else if( cn == 1 )
    673     {
    674         for( ; i < len; i++ )
    675         {
    676             if( mask[i] )
    677                 dst[i] = src[i]*a + dst[i]*b;
    678         }
    679     }
    680     else if( cn == 3 )
    681     {
    682         for( ; i < len; i++, src += 3, dst += 3 )
    683         {
    684             if( mask[i] )
    685             {
    686                 AT t0 = src[0]*a + dst[0]*b;
    687                 AT t1 = src[1]*a + dst[1]*b;
    688                 AT t2 = src[2]*a + dst[2]*b;
    689 
    690                 dst[0] = t0; dst[1] = t1; dst[2] = t2;
    691             }
    692         }
    693     }
    694     else
    695     {
    696         for( ; i < len; i++, src += cn, dst += cn )
    697             if( mask[i] )
    698             {
    699                 for( int k = 0; k < cn; k++ )
    700                     dst[k] = src[k]*a + dst[k]*b;
    701             }
    702     }
    703 }
    704 
    705 
// Instantiates the four accumulation kernels (acc, accSqr, accProd, accW) for
// one (source type, accumulator type) pair. The suffix encodes the depth
// combination, e.g. 8u32f = uchar source accumulated into float. The generated
// wrappers are what the type-erased dispatch tables below point to.
#define DEF_ACC_FUNCS(suffix, type, acctype) \
static void acc_##suffix(const type* src, acctype* dst, \
                         const uchar* mask, int len, int cn) \
{ acc_(src, dst, mask, len, cn); } \
\
static void accSqr_##suffix(const type* src, acctype* dst, \
                            const uchar* mask, int len, int cn) \
{ accSqr_(src, dst, mask, len, cn); } \
\
static void accProd_##suffix(const type* src1, const type* src2, \
                             acctype* dst, const uchar* mask, int len, int cn) \
{ accProd_(src1, src2, dst, mask, len, cn); } \
\
static void accW_##suffix(const type* src, acctype* dst, \
                          const uchar* mask, int len, int cn, double alpha) \
{ accW_(src, dst, mask, len, cn, alpha); }
    722 
    723 
// The seven supported depth combinations; this order matches both the
// dispatch tables and getAccTabIdx() below.
DEF_ACC_FUNCS(8u32f, uchar, float)
DEF_ACC_FUNCS(8u64f, uchar, double)
DEF_ACC_FUNCS(16u32f, ushort, float)
DEF_ACC_FUNCS(16u64f, ushort, double)
DEF_ACC_FUNCS(32f, float, float)
DEF_ACC_FUNCS(32f64f, float, double)
DEF_ACC_FUNCS(64f, double, double)
    731 
    732 
// Type-erased function-pointer signatures: src/dst are passed as uchar* and
// cast back to the concrete element type inside each wrapper.
typedef void (*AccFunc)(const uchar*, uchar*, const uchar*, int, int);
typedef void (*AccProdFunc)(const uchar*, const uchar*, uchar*, const uchar*, int, int);
typedef void (*AccWFunc)(const uchar*, uchar*, const uchar*, int, int, double);

// Dispatch tables indexed by getAccTabIdx(sdepth, ddepth). The entry order
// (8u32f, 8u64f, 16u32f, 16u64f, 32f, 32f64f, 64f) must stay in sync with it.
static AccFunc accTab[] =
{
    (AccFunc)acc_8u32f, (AccFunc)acc_8u64f,
    (AccFunc)acc_16u32f, (AccFunc)acc_16u64f,
    (AccFunc)acc_32f, (AccFunc)acc_32f64f,
    (AccFunc)acc_64f
};

static AccFunc accSqrTab[] =
{
    (AccFunc)accSqr_8u32f, (AccFunc)accSqr_8u64f,
    (AccFunc)accSqr_16u32f, (AccFunc)accSqr_16u64f,
    (AccFunc)accSqr_32f, (AccFunc)accSqr_32f64f,
    (AccFunc)accSqr_64f
};

static AccProdFunc accProdTab[] =
{
    (AccProdFunc)accProd_8u32f, (AccProdFunc)accProd_8u64f,
    (AccProdFunc)accProd_16u32f, (AccProdFunc)accProd_16u64f,
    (AccProdFunc)accProd_32f, (AccProdFunc)accProd_32f64f,
    (AccProdFunc)accProd_64f
};

static AccWFunc accWTab[] =
{
    (AccWFunc)accW_8u32f, (AccWFunc)accW_8u64f,
    (AccWFunc)accW_16u32f, (AccWFunc)accW_16u64f,
    (AccWFunc)accW_32f, (AccWFunc)accW_32f64f,
    (AccWFunc)accW_64f
};
    768 
    769 inline int getAccTabIdx(int sdepth, int ddepth)
    770 {
    771     return sdepth == CV_8U && ddepth == CV_32F ? 0 :
    772            sdepth == CV_8U && ddepth == CV_64F ? 1 :
    773            sdepth == CV_16U && ddepth == CV_32F ? 2 :
    774            sdepth == CV_16U && ddepth == CV_64F ? 3 :
    775            sdepth == CV_32F && ddepth == CV_32F ? 4 :
    776            sdepth == CV_32F && ddepth == CV_64F ? 5 :
    777            sdepth == CV_64F && ddepth == CV_64F ? 6 : -1;
    778 }
    779 
    780 #ifdef HAVE_OPENCL
    781 
// Operation selector passed to ocl_accumulate().
enum
{
    ACCUMULATE = 0,          // dst += src
    ACCUMULATE_SQUARE = 1,   // dst += src*src
    ACCUMULATE_PRODUCT = 2,  // dst += src1*src2
    ACCUMULATE_WEIGHTED = 3  // dst = (1-alpha)*dst + alpha*src
};
    789 
// Runs one of the four accumulation operations on the OpenCL device.
// op_type selects the operation; _src2 is used only for ACCUMULATE_PRODUCT,
// and alpha only for ACCUMULATE_WEIGHTED. Returns false when the kernel
// cannot be built or run, in which case the caller falls back to the CPU path.
static bool ocl_accumulate( InputArray _src, InputArray _src2, InputOutputArray _dst, double alpha,
                            InputArray _mask, int op_type )
{
    CV_Assert(op_type == ACCUMULATE || op_type == ACCUMULATE_SQUARE ||
              op_type == ACCUMULATE_PRODUCT || op_type == ACCUMULATE_WEIGHTED);

    const ocl::Device & dev = ocl::Device::getDefault();
    bool haveMask = !_mask.empty(), doubleSupport = dev.doubleFPConfig() > 0;
    int stype = _src.type(), sdepth = CV_MAT_DEPTH(stype), cn = CV_MAT_CN(stype), ddepth = _dst.depth();
    // Without a mask the kernel may vectorize across pixels; with a mask it
    // processes one pixel (cn channels) per work-item.
    int kercn = haveMask ? cn : ocl::predictOptimalVectorWidthMax(_src, _src2, _dst), rowsPerWI = dev.isIntel() ? 4 : 1;

    // Device must support doubles when either depth is 64F
    if (!doubleSupport && (sdepth == CV_64F || ddepth == CV_64F))
        return false;

    const char * const opMap[4] = { "ACCUMULATE", "ACCUMULATE_SQUARE", "ACCUMULATE_PRODUCT",
                                   "ACCUMULATE_WEIGHTED" };

    // Compile the kernel with the operation and type configuration baked in
    // as preprocessor defines.
    char cvt[40];
    ocl::Kernel k("accumulate", ocl::imgproc::accumulate_oclsrc,
                  format("-D %s%s -D srcT1=%s -D cn=%d -D dstT1=%s%s -D rowsPerWI=%d -D convertToDT=%s",
                         opMap[op_type], haveMask ? " -D HAVE_MASK" : "",
                         ocl::typeToStr(sdepth), kercn, ocl::typeToStr(ddepth),
                         doubleSupport ? " -D DOUBLE_SUPPORT" : "", rowsPerWI,
                         ocl::convertTypeStr(sdepth, ddepth, 1, cvt)));
    if (k.empty())
        return false;

    UMat src = _src.getUMat(), src2 = _src2.getUMat(), dst = _dst.getUMat(), mask = _mask.getUMat();

    ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src),
            src2arg = ocl::KernelArg::ReadOnlyNoSize(src2),
            dstarg = ocl::KernelArg::ReadWrite(dst, cn, kercn),
            maskarg = ocl::KernelArg::ReadOnlyNoSize(mask);

    // Argument list depends on the operation: src [, src2] , dst [, alpha] [, mask]
    int argidx = k.set(0, srcarg);
    if (op_type == ACCUMULATE_PRODUCT)
        argidx = k.set(argidx, src2arg);
    argidx = k.set(argidx, dstarg);
    if (op_type == ACCUMULATE_WEIGHTED)
    {
        // Pass alpha with the precision the kernel expects for the accumulator
        if (ddepth == CV_32F)
            argidx = k.set(argidx, (float)alpha);
        else
            argidx = k.set(argidx, alpha);
    }
    if (haveMask)
        k.set(argidx, maskarg);

    size_t globalsize[2] = { src.cols * cn / kercn, (src.rows + rowsPerWI - 1) / rowsPerWI };
    return k.run(2, globalsize, NULL, false);
}
    841 
    842 #endif
    843 
    844 }
    845 
// Adds src to the accumulator dst (dst += src), optionally only where
// mask is non-zero. Tries the OpenCL and IPP fast paths before falling
// back to the generic table-dispatched CPU implementation.
void cv::accumulate( InputArray _src, InputOutputArray _dst, InputArray _mask )
{
    int stype = _src.type(), sdepth = CV_MAT_DEPTH(stype), scn = CV_MAT_CN(stype);
    int dtype = _dst.type(), ddepth = CV_MAT_DEPTH(dtype), dcn = CV_MAT_CN(dtype);

    // src/dst must match in size and channel count; mask, if given, is 8-bit 1-channel
    CV_Assert( _src.sameSize(_dst) && dcn == scn );
    CV_Assert( _mask.empty() || (_src.sameSize(_mask) && _mask.type() == CV_8U) );

    // OpenCL fast path (2D data with a UMat destination only)
    CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
               ocl_accumulate(_src, noArray(), _dst, 0.0, _mask, ACCUMULATE))

    Mat src = _src.getMat(), dst = _dst.getMat(), mask = _mask.getMat();

#if defined HAVE_IPP
    CV_IPP_CHECK()
    {
        // IPP handles 2D data, or higher-dimensional data when everything is continuous
        if (src.dims <= 2 || (src.isContinuous() && dst.isContinuous() && (mask.empty() || mask.isContinuous())))
        {
            typedef IppStatus (CV_STDCALL * ippiAdd)(const void * pSrc, int srcStep, Ipp32f * pSrcDst, int srcdstStep, IppiSize roiSize);
            typedef IppStatus (CV_STDCALL * ippiAddMask)(const void * pSrc, int srcStep, const Ipp8u * pMask, int maskStep, Ipp32f * pSrcDst,
                                                        int srcDstStep, IppiSize roiSize);
            ippiAdd ippFunc = 0;
            ippiAddMask ippFuncMask = 0;

            // IPP supports only 32f accumulators here; masked variants are 1-channel only
            if (mask.empty())
            {
                CV_SUPPRESS_DEPRECATED_START
                ippFunc = sdepth == CV_8U && ddepth == CV_32F ? (ippiAdd)ippiAdd_8u32f_C1IR :
                    sdepth == CV_16U && ddepth == CV_32F ? (ippiAdd)ippiAdd_16u32f_C1IR :
                    sdepth == CV_32F && ddepth == CV_32F ? (ippiAdd)ippiAdd_32f_C1IR : 0;
                CV_SUPPRESS_DEPRECATED_END
            }
            else if (scn == 1)
            {
                ippFuncMask = sdepth == CV_8U && ddepth == CV_32F ? (ippiAddMask)ippiAdd_8u32f_C1IMR :
                    sdepth == CV_16U && ddepth == CV_32F ? (ippiAddMask)ippiAdd_16u32f_C1IMR :
                    sdepth == CV_32F && ddepth == CV_32F ? (ippiAddMask)ippiAdd_32f_C1IMR : 0;
            }

            if (ippFunc || ippFuncMask)
            {
                IppStatus status = ippStsNoErr;

                Size size = src.size();
                int srcstep = (int)src.step, dststep = (int)dst.step, maskstep = (int)mask.step;
                // Continuous matrices can be processed as one long row
                if (src.isContinuous() && dst.isContinuous() && mask.isContinuous())
                {
                    srcstep = static_cast<int>(src.total() * src.elemSize());
                    dststep = static_cast<int>(dst.total() * dst.elemSize());
                    maskstep = static_cast<int>(mask.total() * mask.elemSize());
                    size.width = static_cast<int>(src.total());
                    size.height = 1;
                }
                size.width *= scn; // treat cn-channel rows as wider 1-channel rows

                if (mask.empty())
                    status = ippFunc(src.ptr(), srcstep, dst.ptr<Ipp32f>(), dststep, ippiSize(size.width, size.height));
                else
                    status = ippFuncMask(src.ptr(), srcstep, mask.ptr<Ipp8u>(), maskstep,
                                         dst.ptr<Ipp32f>(), dststep, ippiSize(size.width, size.height));

                if (status >= 0)
                {
                    CV_IMPL_ADD(CV_IMPL_IPP);
                    return;
                }
                setIppErrorStatus(); // IPP failed; fall through to the generic path
            }
        }
    }
#endif

    // Generic CPU path: dispatch on the depth pair and iterate plane by plane
    int fidx = getAccTabIdx(sdepth, ddepth);
    AccFunc func = fidx >= 0 ? accTab[fidx] : 0;
    CV_Assert( func != 0 );

    const Mat* arrays[] = {&src, &dst, &mask, 0};
    uchar* ptrs[3];
    NAryMatIterator it(arrays, ptrs);
    int len = (int)it.size;

    for( size_t i = 0; i < it.nplanes; i++, ++it )
        func(ptrs[0], ptrs[1], ptrs[2], len, scn);
}
    930 
// Adds the per-element square of src to the accumulator dst (dst += src*src),
// optionally only where mask is non-zero. Same fast-path structure as
// cv::accumulate: OpenCL, then IPP, then the generic CPU implementation.
void cv::accumulateSquare( InputArray _src, InputOutputArray _dst, InputArray _mask )
{
    int stype = _src.type(), sdepth = CV_MAT_DEPTH(stype), scn = CV_MAT_CN(stype);
    int dtype = _dst.type(), ddepth = CV_MAT_DEPTH(dtype), dcn = CV_MAT_CN(dtype);

    // src/dst must match in size and channel count; mask, if given, is 8-bit 1-channel
    CV_Assert( _src.sameSize(_dst) && dcn == scn );
    CV_Assert( _mask.empty() || (_src.sameSize(_mask) && _mask.type() == CV_8U) );

    // OpenCL fast path (2D data with a UMat destination only)
    CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
               ocl_accumulate(_src, noArray(), _dst, 0.0, _mask, ACCUMULATE_SQUARE))

    Mat src = _src.getMat(), dst = _dst.getMat(), mask = _mask.getMat();

#if defined(HAVE_IPP)
    CV_IPP_CHECK()
    {
        // IPP handles 2D data, or higher-dimensional data when everything is continuous
        if (src.dims <= 2 || (src.isContinuous() && dst.isContinuous() && (mask.empty() || mask.isContinuous())))
        {
            typedef IppStatus (CV_STDCALL * ippiAddSquare)(const void * pSrc, int srcStep, Ipp32f * pSrcDst, int srcdstStep, IppiSize roiSize);
            typedef IppStatus (CV_STDCALL * ippiAddSquareMask)(const void * pSrc, int srcStep, const Ipp8u * pMask, int maskStep, Ipp32f * pSrcDst,
                                                               int srcDstStep, IppiSize roiSize);
            ippiAddSquare ippFunc = 0;
            ippiAddSquareMask ippFuncMask = 0;

            // IPP supports only 32f accumulators; masked variants are 1-channel only
            if (mask.empty())
            {
                ippFunc = sdepth == CV_8U && ddepth == CV_32F ? (ippiAddSquare)ippiAddSquare_8u32f_C1IR :
                    sdepth == CV_16U && ddepth == CV_32F ? (ippiAddSquare)ippiAddSquare_16u32f_C1IR :
                    sdepth == CV_32F && ddepth == CV_32F ? (ippiAddSquare)ippiAddSquare_32f_C1IR : 0;
            }
            else if (scn == 1)
            {
                ippFuncMask = sdepth == CV_8U && ddepth == CV_32F ? (ippiAddSquareMask)ippiAddSquare_8u32f_C1IMR :
                    sdepth == CV_16U && ddepth == CV_32F ? (ippiAddSquareMask)ippiAddSquare_16u32f_C1IMR :
                    sdepth == CV_32F && ddepth == CV_32F ? (ippiAddSquareMask)ippiAddSquare_32f_C1IMR : 0;
            }

            if (ippFunc || ippFuncMask)
            {
                IppStatus status = ippStsNoErr;

                Size size = src.size();
                int srcstep = (int)src.step, dststep = (int)dst.step, maskstep = (int)mask.step;
                // Continuous matrices can be processed as one long row
                if (src.isContinuous() && dst.isContinuous() && mask.isContinuous())
                {
                    srcstep = static_cast<int>(src.total() * src.elemSize());
                    dststep = static_cast<int>(dst.total() * dst.elemSize());
                    maskstep = static_cast<int>(mask.total() * mask.elemSize());
                    size.width = static_cast<int>(src.total());
                    size.height = 1;
                }
                size.width *= scn; // treat cn-channel rows as wider 1-channel rows

                if (mask.empty())
                    status = ippFunc(src.ptr(), srcstep, dst.ptr<Ipp32f>(), dststep, ippiSize(size.width, size.height));
                else
                    status = ippFuncMask(src.ptr(), srcstep, mask.ptr<Ipp8u>(), maskstep,
                                         dst.ptr<Ipp32f>(), dststep, ippiSize(size.width, size.height));

                if (status >= 0)
                {
                    CV_IMPL_ADD(CV_IMPL_IPP);
                    return;
                }
                setIppErrorStatus(); // IPP failed; fall through to the generic path
            }
        }
    }
#endif

    // Generic CPU path: dispatch on the depth pair and iterate plane by plane
    int fidx = getAccTabIdx(sdepth, ddepth);
    AccFunc func = fidx >= 0 ? accSqrTab[fidx] : 0;
    CV_Assert( func != 0 );

    const Mat* arrays[] = {&src, &dst, &mask, 0};
    uchar* ptrs[3];
    NAryMatIterator it(arrays, ptrs);
    int len = (int)it.size;

    for( size_t i = 0; i < it.nplanes; i++, ++it )
        func(ptrs[0], ptrs[1], ptrs[2], len, scn);
}
   1013 
// Adds the per-element product of src1 and src2 to the accumulator dst
// (dst += src1*src2), optionally only where mask is non-zero. Same fast-path
// structure as cv::accumulate: OpenCL, then IPP, then the generic CPU path.
void cv::accumulateProduct( InputArray _src1, InputArray _src2,
                            InputOutputArray _dst, InputArray _mask )
{
    int stype = _src1.type(), sdepth = CV_MAT_DEPTH(stype), scn = CV_MAT_CN(stype);
    int dtype = _dst.type(), ddepth = CV_MAT_DEPTH(dtype), dcn = CV_MAT_CN(dtype);

    // Both sources must match each other and dst; mask, if given, is 8-bit 1-channel
    CV_Assert( _src1.sameSize(_src2) && stype == _src2.type() );
    CV_Assert( _src1.sameSize(_dst) && dcn == scn );
    CV_Assert( _mask.empty() || (_src1.sameSize(_mask) && _mask.type() == CV_8U) );

    // OpenCL fast path (2D data with a UMat destination only)
    CV_OCL_RUN(_src1.dims() <= 2 && _dst.isUMat(),
               ocl_accumulate(_src1, _src2, _dst, 0.0, _mask, ACCUMULATE_PRODUCT))

    Mat src1 = _src1.getMat(), src2 = _src2.getMat(), dst = _dst.getMat(), mask = _mask.getMat();

#if defined(HAVE_IPP)
    CV_IPP_CHECK()
    {
        // IPP handles 2D data, or higher-dimensional data when everything is continuous
        if (src1.dims <= 2 || (src1.isContinuous() && src2.isContinuous() && dst.isContinuous()))
        {
            typedef IppStatus (CV_STDCALL * ippiAddProduct)(const void * pSrc1, int src1Step, const void * pSrc2,
                                                            int src2Step, Ipp32f * pSrcDst, int srcDstStep, IppiSize roiSize);
            typedef IppStatus (CV_STDCALL * ippiAddProductMask)(const void * pSrc1, int src1Step, const void * pSrc2, int src2Step,
                                                                const Ipp8u * pMask, int maskStep, Ipp32f * pSrcDst, int srcDstStep, IppiSize roiSize);
            ippiAddProduct ippFunc = 0;
            ippiAddProductMask ippFuncMask = 0;

            // IPP supports only 32f accumulators; masked variants are 1-channel only
            if (mask.empty())
            {
                ippFunc = sdepth == CV_8U && ddepth == CV_32F ? (ippiAddProduct)ippiAddProduct_8u32f_C1IR :
                    sdepth == CV_16U && ddepth == CV_32F ? (ippiAddProduct)ippiAddProduct_16u32f_C1IR :
                    sdepth == CV_32F && ddepth == CV_32F ? (ippiAddProduct)ippiAddProduct_32f_C1IR : 0;
            }
            else if (scn == 1)
            {
                ippFuncMask = sdepth == CV_8U && ddepth == CV_32F ? (ippiAddProductMask)ippiAddProduct_8u32f_C1IMR :
                    sdepth == CV_16U && ddepth == CV_32F ? (ippiAddProductMask)ippiAddProduct_16u32f_C1IMR :
                    sdepth == CV_32F && ddepth == CV_32F ? (ippiAddProductMask)ippiAddProduct_32f_C1IMR : 0;
            }

            if (ippFunc || ippFuncMask)
            {
                IppStatus status = ippStsNoErr;

                Size size = src1.size();
                int src1step = (int)src1.step, src2step = (int)src2.step, dststep = (int)dst.step, maskstep = (int)mask.step;
                // Continuous matrices can be processed as one long row
                if (src1.isContinuous() && src2.isContinuous() && dst.isContinuous() && mask.isContinuous())
                {
                    src1step = static_cast<int>(src1.total() * src1.elemSize());
                    src2step = static_cast<int>(src2.total() * src2.elemSize());
                    dststep = static_cast<int>(dst.total() * dst.elemSize());
                    maskstep = static_cast<int>(mask.total() * mask.elemSize());
                    size.width = static_cast<int>(src1.total());
                    size.height = 1;
                }
                size.width *= scn; // treat cn-channel rows as wider 1-channel rows

                if (mask.empty())
                    status = ippFunc(src1.ptr(), src1step, src2.ptr(), src2step, dst.ptr<Ipp32f>(),
                                     dststep, ippiSize(size.width, size.height));
                else
                    status = ippFuncMask(src1.ptr(), src1step, src2.ptr(), src2step, mask.ptr<Ipp8u>(), maskstep,
                                         dst.ptr<Ipp32f>(), dststep, ippiSize(size.width, size.height));

                if (status >= 0)
                {
                    CV_IMPL_ADD(CV_IMPL_IPP);
                    return;
                }
                setIppErrorStatus(); // IPP failed; fall through to the generic path
            }
        }
    }
#endif

    // Generic CPU path: dispatch on the depth pair and iterate plane by plane
    int fidx = getAccTabIdx(sdepth, ddepth);
    AccProdFunc func = fidx >= 0 ? accProdTab[fidx] : 0;
    CV_Assert( func != 0 );

    const Mat* arrays[] = {&src1, &src2, &dst, &mask, 0};
    uchar* ptrs[4];
    NAryMatIterator it(arrays, ptrs);
    int len = (int)it.size;

    for( size_t i = 0; i < it.nplanes; i++, ++it )
        func(ptrs[0], ptrs[1], ptrs[2], ptrs[3], len, scn);
}
   1101 
   1102 void cv::accumulateWeighted( InputArray _src, InputOutputArray _dst,
   1103                              double alpha, InputArray _mask )
   1104 {
   1105     int stype = _src.type(), sdepth = CV_MAT_DEPTH(stype), scn = CV_MAT_CN(stype);
   1106     int dtype = _dst.type(), ddepth = CV_MAT_DEPTH(dtype), dcn = CV_MAT_CN(dtype);
   1107 
   1108     CV_Assert( _src.sameSize(_dst) && dcn == scn );
   1109     CV_Assert( _mask.empty() || (_src.sameSize(_mask) && _mask.type() == CV_8U) );
   1110 
   1111     CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
   1112                ocl_accumulate(_src, noArray(), _dst, alpha, _mask, ACCUMULATE_WEIGHTED))
   1113 
   1114     Mat src = _src.getMat(), dst = _dst.getMat(), mask = _mask.getMat();
   1115 
   1116 #if defined(HAVE_IPP)
   1117     CV_IPP_CHECK()
   1118     {
   1119         if (src.dims <= 2 || (src.isContinuous() && dst.isContinuous() && mask.isContinuous()))
   1120         {
   1121             typedef IppStatus (CV_STDCALL * ippiAddWeighted)(const void * pSrc, int srcStep, Ipp32f * pSrcDst, int srcdstStep,
   1122                                                              IppiSize roiSize, Ipp32f alpha);
   1123             typedef IppStatus (CV_STDCALL * ippiAddWeightedMask)(const void * pSrc, int srcStep, const Ipp8u * pMask,
   1124                                                                  int maskStep, Ipp32f * pSrcDst,
   1125                                                                  int srcDstStep, IppiSize roiSize, Ipp32f alpha);
   1126             ippiAddWeighted ippFunc = 0;
   1127             ippiAddWeightedMask ippFuncMask = 0;
   1128 
   1129             if (mask.empty())
   1130             {
   1131                 ippFunc = sdepth == CV_8U && ddepth == CV_32F ? (ippiAddWeighted)ippiAddWeighted_8u32f_C1IR :
   1132                     sdepth == CV_16U && ddepth == CV_32F ? (ippiAddWeighted)ippiAddWeighted_16u32f_C1IR :
   1133                     sdepth == CV_32F && ddepth == CV_32F ? (ippiAddWeighted)ippiAddWeighted_32f_C1IR : 0;
   1134             }
   1135             else if (scn == 1)
   1136             {
   1137                 ippFuncMask = sdepth == CV_8U && ddepth == CV_32F ? (ippiAddWeightedMask)ippiAddWeighted_8u32f_C1IMR :
   1138                     sdepth == CV_16U && ddepth == CV_32F ? (ippiAddWeightedMask)ippiAddWeighted_16u32f_C1IMR :
   1139                     sdepth == CV_32F && ddepth == CV_32F ? (ippiAddWeightedMask)ippiAddWeighted_32f_C1IMR : 0;
   1140             }
   1141 
   1142             if (ippFunc || ippFuncMask)
   1143             {
   1144                 IppStatus status = ippStsNoErr;
   1145 
   1146                 Size size = src.size();
   1147                 int srcstep = (int)src.step, dststep = (int)dst.step, maskstep = (int)mask.step;
   1148                 if (src.isContinuous() && dst.isContinuous() && mask.isContinuous())
   1149                 {
   1150                     srcstep = static_cast<int>(src.total() * src.elemSize());
   1151                     dststep = static_cast<int>(dst.total() * dst.elemSize());
   1152                     maskstep = static_cast<int>(mask.total() * mask.elemSize());
   1153                     size.width = static_cast<int>((int)src.total());
   1154                     size.height = 1;
   1155                 }
   1156                 size.width *= scn;
   1157 
   1158                 if (mask.empty())
   1159                     status = ippFunc(src.ptr(), srcstep, dst.ptr<Ipp32f>(), dststep, ippiSize(size.width, size.height), (Ipp32f)alpha);
   1160                 else
   1161                     status = ippFuncMask(src.ptr(), srcstep, mask.ptr<Ipp8u>(), maskstep,
   1162                                          dst.ptr<Ipp32f>(), dststep, ippiSize(size.width, size.height), (Ipp32f)alpha);
   1163 
   1164                 if (status >= 0)
   1165                 {
   1166                     CV_IMPL_ADD(CV_IMPL_IPP);
   1167                     return;
   1168                 }
   1169                 setIppErrorStatus();
   1170             }
   1171         }
   1172     }
   1173 #endif
   1174 
   1175     int fidx = getAccTabIdx(sdepth, ddepth);
   1176     AccWFunc func = fidx >= 0 ? accWTab[fidx] : 0;
   1177     CV_Assert( func != 0 );
   1178 
   1179     const Mat* arrays[] = {&src, &dst, &mask, 0};
   1180     uchar* ptrs[3];
   1181     NAryMatIterator it(arrays, ptrs);
   1182     int len = (int)it.size;
   1183 
   1184     for( size_t i = 0; i < it.nplanes; i++, ++it )
   1185         func(ptrs[0], ptrs[1], ptrs[2], len, scn, alpha);
   1186 }
   1187 
   1188 
   1189 CV_IMPL void
   1190 cvAcc( const void* arr, void* sumarr, const void* maskarr )
   1191 {
   1192     cv::Mat src = cv::cvarrToMat(arr), dst = cv::cvarrToMat(sumarr), mask;
   1193     if( maskarr )
   1194         mask = cv::cvarrToMat(maskarr);
   1195     cv::accumulate( src, dst, mask );
   1196 }
   1197 
   1198 CV_IMPL void
   1199 cvSquareAcc( const void* arr, void* sumarr, const void* maskarr )
   1200 {
   1201     cv::Mat src = cv::cvarrToMat(arr), dst = cv::cvarrToMat(sumarr), mask;
   1202     if( maskarr )
   1203         mask = cv::cvarrToMat(maskarr);
   1204     cv::accumulateSquare( src, dst, mask );
   1205 }
   1206 
   1207 CV_IMPL void
   1208 cvMultiplyAcc( const void* arr1, const void* arr2,
   1209                void* sumarr, const void* maskarr )
   1210 {
   1211     cv::Mat src1 = cv::cvarrToMat(arr1), src2 = cv::cvarrToMat(arr2);
   1212     cv::Mat dst = cv::cvarrToMat(sumarr), mask;
   1213     if( maskarr )
   1214         mask = cv::cvarrToMat(maskarr);
   1215     cv::accumulateProduct( src1, src2, dst, mask );
   1216 }
   1217 
   1218 CV_IMPL void
   1219 cvRunningAvg( const void* arr, void* sumarr, double alpha, const void* maskarr )
   1220 {
   1221     cv::Mat src = cv::cvarrToMat(arr), dst = cv::cvarrToMat(sumarr), mask;
   1222     if( maskarr )
   1223         mask = cv::cvarrToMat(maskarr);
   1224     cv::accumulateWeighted( src, dst, alpha, mask );
   1225 }
   1226 
   1227 /* End of file. */
   1228