/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2014, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#include "precomp.hpp"
#include "opencl_kernels_imgproc.hpp"

namespace cv
{

template <typename T, typename AT>
struct Acc_SIMD
{
    int operator() (const T *, AT *, const uchar *, int, int) const
    {
        return 0;
    }
};

template <typename T, typename AT>
struct AccSqr_SIMD
{
    int operator() (const T *, AT *, const uchar *, int, int) const
    {
        return 0;
    }
};

template <typename T, typename AT>
struct AccProd_SIMD
{
    int operator() (const T *, const T *, AT *, const uchar *, int, int) const
    {
        return 0;
    }
};

template <typename T, typename AT>
struct AccW_SIMD
{
    int operator() (const T *, AT *, const uchar *, int, int, AT) const
    {
        return 0;
    }
};

#if CV_NEON

template <>
struct Acc_SIMD<uchar, float>
{
    int operator() (const uchar * src, float * dst, const uchar * mask, int len, int cn) const
    {
        int x = 0;

        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 16; x += 16)
            {
                uint8x16_t v_src = vld1q_u8(src + x);
                uint16x8_t v_src0 = vmovl_u8(vget_low_u8(v_src)), v_src1 = vmovl_u8(vget_high_u8(v_src));

                vst1q_f32(dst + x, vaddq_f32(vld1q_f32(dst + x), vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0)))));
                vst1q_f32(dst + x + 4, vaddq_f32(vld1q_f32(dst + x + 4), vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0)))));
                vst1q_f32(dst + x + 8, vaddq_f32(vld1q_f32(dst + x + 8), vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1)))));
                vst1q_f32(dst + x + 12, vaddq_f32(vld1q_f32(dst + x + 12), vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1)))));
            }
        }
        else if (cn == 1)
        {
            uint8x16_t v_255 = vdupq_n_u8(255), v_0 = vdupq_n_u8(0);

            for ( ; x <= len - 16; x += 16)
            {
                uint8x16_t v_src = vandq_u8(vld1q_u8(src + x), veorq_u8(v_255, vceqq_u8(vld1q_u8(mask + x), v_0)));
                uint16x8_t v_src0 = vmovl_u8(vget_low_u8(v_src)), v_src1 = vmovl_u8(vget_high_u8(v_src));

                vst1q_f32(dst + x, vaddq_f32(vld1q_f32(dst + x), vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0)))));
                vst1q_f32(dst + x + 4, vaddq_f32(vld1q_f32(dst + x + 4), vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0)))));
                vst1q_f32(dst + x + 8, vaddq_f32(vld1q_f32(dst + x + 8), vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1)))));
                vst1q_f32(dst + x + 12, vaddq_f32(vld1q_f32(dst + x + 12), vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1)))));
            }
        }

        return x;
    }
};

template <>
struct Acc_SIMD<ushort, float>
{
    int operator() (const ushort * src, float * dst, const uchar * mask, int len, int cn) const
    {
        int x = 0;

        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 8; x += 8)
            {
                uint16x8_t v_src = vld1q_u16(src + x);
                uint32x4_t v_src0 = vmovl_u16(vget_low_u16(v_src)), v_src1 = vmovl_u16(vget_high_u16(v_src));

                vst1q_f32(dst + x, vaddq_f32(vld1q_f32(dst + x), vcvtq_f32_u32(v_src0)));
                vst1q_f32(dst + x + 4, vaddq_f32(vld1q_f32(dst + x + 4), vcvtq_f32_u32(v_src1)));
            }
        }

        return x;
    }
};

template <>
struct Acc_SIMD<float, float>
{
    int operator() (const float * src, float * dst, const uchar * mask, int len, int cn) const
    {
        int x = 0;

        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 8; x += 8)
            {
                vst1q_f32(dst + x, vaddq_f32(vld1q_f32(dst + x), vld1q_f32(src + x)));
                vst1q_f32(dst + x + 4, vaddq_f32(vld1q_f32(dst + x + 4), vld1q_f32(src + x + 4)));
            }
        }

        return x;
    }
};

template <>
struct AccSqr_SIMD<uchar, float>
{
    int operator() (const uchar * src, float * dst, const uchar * mask, int len, int cn) const
    {
        int x = 0;

        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 16; x += 16)
            {
                uint8x16_t v_src = vld1q_u8(src + x);
                uint8x8_t v_src_0 = vget_low_u8(v_src), v_src_1 = vget_high_u8(v_src);
                uint16x8_t v_src0 = vmull_u8(v_src_0, v_src_0), v_src1 = vmull_u8(v_src_1, v_src_1);

                vst1q_f32(dst + x, vaddq_f32(vld1q_f32(dst + x), vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0)))));
                vst1q_f32(dst + x + 4, vaddq_f32(vld1q_f32(dst + x + 4), vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0)))));
                vst1q_f32(dst + x + 8, vaddq_f32(vld1q_f32(dst + x + 8), vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1)))));
                vst1q_f32(dst + x + 12, vaddq_f32(vld1q_f32(dst + x + 12), vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1)))));
            }
        }
        else if (cn == 1)
        {
            uint8x16_t v_255 = vdupq_n_u8(255), v_0 = vdupq_n_u8(0);

            for ( ; x <= len - 16; x += 16)
            {
                uint8x16_t v_src = vandq_u8(vld1q_u8(src + x), veorq_u8(v_255, vceqq_u8(vld1q_u8(mask + x), v_0)));
                uint8x8_t v_src_0 = vget_low_u8(v_src), v_src_1 = vget_high_u8(v_src);
                uint16x8_t v_src0 = vmull_u8(v_src_0, v_src_0), v_src1 = vmull_u8(v_src_1, v_src_1);

                vst1q_f32(dst + x, vaddq_f32(vld1q_f32(dst + x), vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0)))));
                vst1q_f32(dst + x + 4, vaddq_f32(vld1q_f32(dst + x + 4), vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0)))));
                vst1q_f32(dst + x + 8, vaddq_f32(vld1q_f32(dst + x + 8), vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1)))));
                vst1q_f32(dst + x + 12, vaddq_f32(vld1q_f32(dst + x + 12), vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1)))));
            }
        }

        return x;
    }
};

template <>
struct AccSqr_SIMD<ushort, float>
{
    int operator() (const ushort * src, float * dst, const uchar * mask, int len, int cn) const
    {
        int x = 0;

        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 8; x += 8)
            {
                uint16x8_t v_src = vld1q_u16(src + x);
                uint16x4_t v_src_0 = vget_low_u16(v_src), v_src_1 = vget_high_u16(v_src);
                uint32x4_t v_src0 = vmull_u16(v_src_0, v_src_0), v_src1 = vmull_u16(v_src_1, v_src_1);

                vst1q_f32(dst + x, vaddq_f32(vld1q_f32(dst + x), vcvtq_f32_u32(v_src0)));
                vst1q_f32(dst + x + 4, vaddq_f32(vld1q_f32(dst + x + 4), vcvtq_f32_u32(v_src1)));
            }
        }
        else if (cn == 1)
        {
            uint8x8_t v_255 = vdup_n_u8(255), v_0 = vdup_n_u8(0);

            for ( ; x <= len - 8; x += 8)
            {
                uint8x8_t v_mask_src = veor_u8(v_255, vceq_u8(vld1_u8(mask + x), v_0));
                uint8x8x2_t v_mask_zp = vzip_u8(v_mask_src, v_mask_src);
                uint16x8_t v_mask = vreinterpretq_u16_u8(vcombine_u8(v_mask_zp.val[0], v_mask_zp.val[1])),
                           v_src = vandq_u16(vld1q_u16(src + x), v_mask);

                uint16x4_t v_src_0 = vget_low_u16(v_src), v_src_1 = vget_high_u16(v_src);
                uint32x4_t v_src0 = vmull_u16(v_src_0, v_src_0), v_src1 = vmull_u16(v_src_1, v_src_1);

                vst1q_f32(dst + x, vaddq_f32(vld1q_f32(dst + x), vcvtq_f32_u32(v_src0)));
                vst1q_f32(dst + x + 4, vaddq_f32(vld1q_f32(dst + x + 4), vcvtq_f32_u32(v_src1)));
            }
        }

        return x;
    }
};

template <>
struct AccSqr_SIMD<float, float>
{
    int operator() (const float * src, float * dst, const uchar * mask, int len, int cn) const
    {
        int x = 0;

        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 8; x += 8)
            {
                float32x4_t v_src = vld1q_f32(src + x);
                vst1q_f32(dst + x, vmlaq_f32(vld1q_f32(dst + x), v_src, v_src));

                v_src = vld1q_f32(src + x + 4);
                vst1q_f32(dst + x + 4, vmlaq_f32(vld1q_f32(dst + x + 4), v_src, v_src));
            }
        }

        return x;
    }
};

template <>
struct AccProd_SIMD<uchar, float>
{
    int operator() (const uchar * src1, const uchar * src2, float * dst, const uchar * mask, int len, int cn) const
    {
        int x = 0;

        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 16; x += 16)
            {
                uint8x16_t v_1src = vld1q_u8(src1 + x), v_2src = vld1q_u8(src2 + x);
                uint16x8_t v_src0 = vmull_u8(vget_low_u8(v_1src), vget_low_u8(v_2src)),
                           v_src1 = vmull_u8(vget_high_u8(v_1src), vget_high_u8(v_2src));

                vst1q_f32(dst + x, vaddq_f32(vld1q_f32(dst + x), vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0)))));
                vst1q_f32(dst + x + 4, vaddq_f32(vld1q_f32(dst + x + 4), vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0)))));
                vst1q_f32(dst + x + 8, vaddq_f32(vld1q_f32(dst + x + 8), vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1)))));
                vst1q_f32(dst + x + 12, vaddq_f32(vld1q_f32(dst + x + 12), vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1)))));
            }
        }
        else if (cn == 1)
        {
            uint8x16_t v_255 = vdupq_n_u8(255), v_0 = vdupq_n_u8(0);

            for ( ; x <= len - 16; x += 16)
            {
                uint8x16_t v_mask = veorq_u8(v_255, vceqq_u8(vld1q_u8(mask + x), v_0));
                uint8x16_t v_1src = vandq_u8(vld1q_u8(src1 + x), v_mask), v_2src = vandq_u8(vld1q_u8(src2 + x), v_mask);
                uint16x8_t v_src0 = vmull_u8(vget_low_u8(v_1src), vget_low_u8(v_2src)),
                           v_src1 = vmull_u8(vget_high_u8(v_1src), vget_high_u8(v_2src));

                vst1q_f32(dst + x, vaddq_f32(vld1q_f32(dst + x), vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0)))));
                vst1q_f32(dst + x + 4, vaddq_f32(vld1q_f32(dst + x + 4), vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0)))));
                vst1q_f32(dst + x + 8, vaddq_f32(vld1q_f32(dst + x + 8), vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1)))));
                vst1q_f32(dst + x + 12, vaddq_f32(vld1q_f32(dst + x + 12), vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1)))));
            }
        }

        return x;
    }
};

template <>
struct AccProd_SIMD<ushort, float>
{
    int operator() (const ushort * src1, const ushort * src2, float * dst, const uchar * mask, int len, int cn) const
    {
        int x = 0;

        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 8; x += 8)
            {
                uint16x8_t v_1src = vld1q_u16(src1 + x), v_2src = vld1q_u16(src2 + x);
                uint32x4_t v_src0 = vmull_u16(vget_low_u16(v_1src), vget_low_u16(v_2src)),
                           v_src1 = vmull_u16(vget_high_u16(v_1src), vget_high_u16(v_2src));

                vst1q_f32(dst + x, vaddq_f32(vld1q_f32(dst + x), vcvtq_f32_u32(v_src0)));
                vst1q_f32(dst + x + 4, vaddq_f32(vld1q_f32(dst + x + 4), vcvtq_f32_u32(v_src1)));
            }
        }
        else if (cn == 1)
        {
            uint8x8_t v_255 = vdup_n_u8(255), v_0 = vdup_n_u8(0);

            for ( ; x <= len - 8; x += 8)
            {
                uint8x8_t v_mask_src = veor_u8(v_255, vceq_u8(vld1_u8(mask + x), v_0));
                uint8x8x2_t v_mask_zp = vzip_u8(v_mask_src, v_mask_src);
                uint16x8_t v_mask = vreinterpretq_u16_u8(vcombine_u8(v_mask_zp.val[0], v_mask_zp.val[1])),
                           v_1src = vandq_u16(vld1q_u16(src1 + x), v_mask),
                           v_2src = vandq_u16(vld1q_u16(src2 + x), v_mask);

                uint32x4_t v_src0 = vmull_u16(vget_low_u16(v_1src), vget_low_u16(v_2src)),
                           v_src1 = vmull_u16(vget_high_u16(v_1src), vget_high_u16(v_2src));

                vst1q_f32(dst + x, vaddq_f32(vld1q_f32(dst + x), vcvtq_f32_u32(v_src0)));
                vst1q_f32(dst + x + 4, vaddq_f32(vld1q_f32(dst + x + 4), vcvtq_f32_u32(v_src1)));
            }
        }

        return x;
    }
};

template <>
struct AccProd_SIMD<float, float>
{
    int operator() (const float * src1, const float * src2, float * dst, const uchar * mask, int len, int cn) const
    {
        int x = 0;

        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 8; x += 8)
            {
                vst1q_f32(dst + x, vmlaq_f32(vld1q_f32(dst + x), vld1q_f32(src1 + x), vld1q_f32(src2 + x)));
                vst1q_f32(dst + x + 4, vmlaq_f32(vld1q_f32(dst + x + 4), vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4)));
            }
        }

        return x;
    }
};

template <>
struct AccW_SIMD<uchar, float>
{
    int operator() (const uchar * src, float * dst, const uchar * mask, int len, int cn, float alpha) const
    {
        int x = 0;
        float32x4_t v_alpha = vdupq_n_f32(alpha), v_beta = vdupq_n_f32(1.0f - alpha);

        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 16; x += 16)
            {
                uint8x16_t v_src = vld1q_u8(src + x);
                uint16x8_t v_src0 = vmovl_u8(vget_low_u8(v_src)), v_src1 = vmovl_u8(vget_high_u8(v_src));

                vst1q_f32(dst + x, vmlaq_f32(vmulq_f32(vld1q_f32(dst + x), v_beta),
                                             vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0))), v_alpha));
                vst1q_f32(dst + x + 4, vmlaq_f32(vmulq_f32(vld1q_f32(dst + x + 4), v_beta),
                                                 vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0))), v_alpha));
                vst1q_f32(dst + x + 8, vmlaq_f32(vmulq_f32(vld1q_f32(dst + x + 8), v_beta),
                                                 vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), v_alpha));
                vst1q_f32(dst + x + 12, vmlaq_f32(vmulq_f32(vld1q_f32(dst + x + 12), v_beta),
                                                  vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), v_alpha));
            }
        }

        return x;
    }
};

template <>
struct AccW_SIMD<ushort, float>
{
    int operator() (const ushort * src, float * dst, const uchar * mask, int len, int cn, float alpha) const
    {
        int x = 0;
        float32x4_t v_alpha = vdupq_n_f32(alpha), v_beta = vdupq_n_f32(1.0f - alpha);

        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 8; x += 8)
            {
                uint16x8_t v_src = vld1q_u16(src + x);
                uint32x4_t v_src0 = vmovl_u16(vget_low_u16(v_src)), v_src1 = vmovl_u16(vget_high_u16(v_src));

                vst1q_f32(dst + x, vmlaq_f32(vmulq_f32(vld1q_f32(dst + x), v_beta), vcvtq_f32_u32(v_src0), v_alpha));
                vst1q_f32(dst + x + 4, vmlaq_f32(vmulq_f32(vld1q_f32(dst + x + 4), v_beta), vcvtq_f32_u32(v_src1), v_alpha));
            }
        }

        return x;
    }
};

template <>
struct AccW_SIMD<float, float>
{
    int operator() (const float * src, float * dst, const uchar * mask, int len, int cn, float alpha) const
    {
        int x = 0;
        float32x4_t v_alpha = vdupq_n_f32(alpha), v_beta = vdupq_n_f32(1.0f - alpha);

        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 8; x += 8)
            {
                vst1q_f32(dst + x, vmlaq_f32(vmulq_f32(vld1q_f32(dst + x), v_beta), vld1q_f32(src + x), v_alpha));
                vst1q_f32(dst + x + 4, vmlaq_f32(vmulq_f32(vld1q_f32(dst + x + 4), v_beta), vld1q_f32(src + x + 4), v_alpha));
            }
        }

        return x;
    }
};

#endif
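// Scalar implementations of the four accumulation kernels. Each function first
// lets the matching *_SIMD functor (a no-op returning 0 unless CV_NEON is
// enabled above) consume as many leading elements as it can, then finishes the
// rest here. 'len' is the number of pixels in the current plane and 'cn' the
// number of channels; the unmasked path treats the data as len*cn flat
// elements, while the masked paths test one mask byte per pixel.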
template<typename T, typename AT> void
acc_( const T* src, AT* dst, const uchar* mask, int len, int cn )
{
    int i = Acc_SIMD<T, AT>()(src, dst, mask, len, cn);

    if( !mask )
    {
        len *= cn;
        #if CV_ENABLE_UNROLLED
        for( ; i <= len - 4; i += 4 )
        {
            AT t0, t1;
            t0 = src[i] + dst[i];
            t1 = src[i+1] + dst[i+1];
            dst[i] = t0; dst[i+1] = t1;

            t0 = src[i+2] + dst[i+2];
            t1 = src[i+3] + dst[i+3];
            dst[i+2] = t0; dst[i+3] = t1;
        }
        #endif
        for( ; i < len; i++ )
            dst[i] += src[i];
    }
    else if( cn == 1 )
    {
        for( ; i < len; i++ )
        {
            if( mask[i] )
                dst[i] += src[i];
        }
    }
    else if( cn == 3 )
    {
        for( ; i < len; i++, src += 3, dst += 3 )
        {
            if( mask[i] )
            {
                AT t0 = src[0] + dst[0];
                AT t1 = src[1] + dst[1];
                AT t2 = src[2] + dst[2];

                dst[0] = t0; dst[1] = t1; dst[2] = t2;
            }
        }
    }
    else
    {
        for( ; i < len; i++, src += cn, dst += cn )
            if( mask[i] )
            {
                for( int k = 0; k < cn; k++ )
                    dst[k] += src[k];
            }
    }
}


template<typename T, typename AT> void
accSqr_( const T* src, AT* dst, const uchar* mask, int len, int cn )
{
    int i = AccSqr_SIMD<T, AT>()(src, dst, mask, len, cn);

    if( !mask )
    {
        len *= cn;
        #if CV_ENABLE_UNROLLED
        for( ; i <= len - 4; i += 4 )
        {
            AT t0, t1;
            t0 = (AT)src[i]*src[i] + dst[i];
            t1 = (AT)src[i+1]*src[i+1] + dst[i+1];
            dst[i] = t0; dst[i+1] = t1;

            t0 = (AT)src[i+2]*src[i+2] + dst[i+2];
            t1 = (AT)src[i+3]*src[i+3] + dst[i+3];
            dst[i+2] = t0; dst[i+3] = t1;
        }
        #endif
        for( ; i < len; i++ )
            dst[i] += (AT)src[i]*src[i];
    }
    else if( cn == 1 )
    {
        for( ; i < len; i++ )
        {
            if( mask[i] )
                dst[i] += (AT)src[i]*src[i];
        }
    }
    else if( cn == 3 )
    {
        for( ; i < len; i++, src += 3, dst += 3 )
        {
            if( mask[i] )
            {
                AT t0 = (AT)src[0]*src[0] + dst[0];
                AT t1 = (AT)src[1]*src[1] + dst[1];
                AT t2 = (AT)src[2]*src[2] + dst[2];

                dst[0] = t0; dst[1] = t1; dst[2] = t2;
            }
        }
    }
    else
    {
        for( ; i < len; i++, src += cn, dst += cn )
            if( mask[i] )
            {
                for( int k = 0; k < cn; k++ )
                    dst[k] += (AT)src[k]*src[k];
            }
    }
}


template<typename T, typename AT> void
accProd_( const T* src1, const T* src2, AT* dst, const uchar* mask, int len, int cn )
{
    int i = AccProd_SIMD<T, AT>()(src1, src2, dst, mask, len, cn);

    if( !mask )
    {
        len *= cn;
        #if CV_ENABLE_UNROLLED
        for( ; i <= len - 4; i += 4 )
        {
            AT t0, t1;
            t0 = (AT)src1[i]*src2[i] + dst[i];
            t1 = (AT)src1[i+1]*src2[i+1] + dst[i+1];
            dst[i] = t0; dst[i+1] = t1;

            t0 = (AT)src1[i+2]*src2[i+2] + dst[i+2];
            t1 = (AT)src1[i+3]*src2[i+3] + dst[i+3];
            dst[i+2] = t0; dst[i+3] = t1;
        }
        #endif
        for( ; i < len; i++ )
            dst[i] += (AT)src1[i]*src2[i];
    }
    else if( cn == 1 )
    {
        for( ; i < len; i++ )
        {
            if( mask[i] )
                dst[i] += (AT)src1[i]*src2[i];
        }
    }
    else if( cn == 3 )
    {
        for( ; i < len; i++, src1 += 3, src2 += 3, dst += 3 )
        {
            if( mask[i] )
            {
                AT t0 = (AT)src1[0]*src2[0] + dst[0];
                AT t1 = (AT)src1[1]*src2[1] + dst[1];
                AT t2 = (AT)src1[2]*src2[2] + dst[2];

                dst[0] = t0; dst[1] = t1; dst[2] = t2;
            }
        }
    }
    else
    {
        for( ; i < len; i++, src1 += cn, src2 += cn, dst += cn )
            if( mask[i] )
            {
                for( int k = 0; k < cn; k++ )
                    dst[k] += (AT)src1[k]*src2[k];
            }
    }
}


template<typename T, typename AT> void
accW_( const T* src, AT* dst, const uchar* mask, int len, int cn, double alpha )
{
    AT a = (AT)alpha, b = 1 - a;
    int i = AccW_SIMD<T, AT>()(src, dst, mask, len, cn, a);

    if( !mask )
    {
        len *= cn;
        #if CV_ENABLE_UNROLLED
        for( ; i <= len - 4; i += 4 )
        {
            AT t0, t1;
            t0 = src[i]*a + dst[i]*b;
            t1 = src[i+1]*a + dst[i+1]*b;
            dst[i] = t0; dst[i+1] = t1;

            t0 = src[i+2]*a + dst[i+2]*b;
            t1 = src[i+3]*a + dst[i+3]*b;
            dst[i+2] = t0; dst[i+3] = t1;
        }
        #endif
        for( ; i < len; i++ )
            dst[i] = src[i]*a + dst[i]*b;
    }
    else if( cn == 1 )
    {
        for( ; i < len; i++ )
        {
            if( mask[i] )
                dst[i] = src[i]*a + dst[i]*b;
        }
    }
    else if( cn == 3 )
    {
        for( ; i < len; i++, src += 3, dst += 3 )
        {
            if( mask[i] )
            {
                AT t0 = src[0]*a + dst[0]*b;
                AT t1 = src[1]*a + dst[1]*b;
                AT t2 = src[2]*a + dst[2]*b;

                dst[0] = t0; dst[1] = t1; dst[2] = t2;
            }
        }
    }
    else
    {
        for( ; i < len; i++, src += cn, dst += cn )
            if( mask[i] )
            {
                for( int k = 0; k < cn; k++ )
                    dst[k] = src[k]*a + dst[k]*b;
            }
    }
}


#define DEF_ACC_FUNCS(suffix, type, acctype) \
static void acc_##suffix(const type* src, acctype* dst, \
                         const uchar* mask, int len, int cn) \
{ acc_(src, dst, mask, len, cn); } \
\
static void accSqr_##suffix(const type* src, acctype* dst, \
                            const uchar* mask, int len, int cn) \
{ accSqr_(src, dst, mask, len, cn); } \
\
static void accProd_##suffix(const type* src1, const type* src2, \
                             acctype* dst, const uchar* mask, int len, int cn) \
{ accProd_(src1, src2, dst, mask, len, cn); } \
\
static void accW_##suffix(const type* src, acctype* dst, \
                          const uchar* mask, int len, int cn, double alpha) \
{ accW_(src, dst, mask, len, cn, alpha); }


DEF_ACC_FUNCS(8u32f, uchar, float)
DEF_ACC_FUNCS(8u64f, uchar, double)
DEF_ACC_FUNCS(16u32f, ushort, float)
DEF_ACC_FUNCS(16u64f, ushort, double)
DEF_ACC_FUNCS(32f, float, float)
DEF_ACC_FUNCS(32f64f, float, double)
DEF_ACC_FUNCS(64f, double, double)


typedef void (*AccFunc)(const uchar*, uchar*, const uchar*, int, int);
typedef void (*AccProdFunc)(const uchar*, const uchar*, uchar*, const uchar*, int, int);
typedef void (*AccWFunc)(const uchar*, uchar*, const uchar*, int, int, double);

static AccFunc accTab[] =
{
    (AccFunc)acc_8u32f, (AccFunc)acc_8u64f,
    (AccFunc)acc_16u32f, (AccFunc)acc_16u64f,
    (AccFunc)acc_32f, (AccFunc)acc_32f64f,
    (AccFunc)acc_64f
};

static AccFunc accSqrTab[] =
{
    (AccFunc)accSqr_8u32f, (AccFunc)accSqr_8u64f,
    (AccFunc)accSqr_16u32f, (AccFunc)accSqr_16u64f,
    (AccFunc)accSqr_32f, (AccFunc)accSqr_32f64f,
    (AccFunc)accSqr_64f
};

static AccProdFunc accProdTab[] =
{
    (AccProdFunc)accProd_8u32f, (AccProdFunc)accProd_8u64f,
    (AccProdFunc)accProd_16u32f, (AccProdFunc)accProd_16u64f,
    (AccProdFunc)accProd_32f, (AccProdFunc)accProd_32f64f,
    (AccProdFunc)accProd_64f
};

static AccWFunc accWTab[] =
{
    (AccWFunc)accW_8u32f, (AccWFunc)accW_8u64f,
    (AccWFunc)accW_16u32f, (AccWFunc)accW_16u64f,
    (AccWFunc)accW_32f, (AccWFunc)accW_32f64f,
    (AccWFunc)accW_64f
};

inline int getAccTabIdx(int sdepth, int ddepth)
{
    return sdepth == CV_8U && ddepth == CV_32F ? 0 :
           sdepth == CV_8U && ddepth == CV_64F ? 1 :
           sdepth == CV_16U && ddepth == CV_32F ? 2 :
           sdepth == CV_16U && ddepth == CV_64F ? 3 :
           sdepth == CV_32F && ddepth == CV_32F ? 4 :
           sdepth == CV_32F && ddepth == CV_64F ? 5 :
           sdepth == CV_64F && ddepth == CV_64F ? 6 : -1;
}

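// OpenCL path: ocl_accumulate() builds the "accumulate" kernel from
// opencl_kernels_imgproc.hpp with the requested operation (ACCUMULATE,
// ACCUMULATE_SQUARE, ACCUMULATE_PRODUCT or ACCUMULATE_WEIGHTED) selected via
// -D defines. The public functions below try it first through CV_OCL_RUN when
// the destination is a UMat with at most two dimensions.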
#ifdef HAVE_OPENCL

enum
{
    ACCUMULATE = 0,
    ACCUMULATE_SQUARE = 1,
    ACCUMULATE_PRODUCT = 2,
    ACCUMULATE_WEIGHTED = 3
};

static bool ocl_accumulate( InputArray _src, InputArray _src2, InputOutputArray _dst, double alpha,
                            InputArray _mask, int op_type )
{
    CV_Assert(op_type == ACCUMULATE || op_type == ACCUMULATE_SQUARE ||
              op_type == ACCUMULATE_PRODUCT || op_type == ACCUMULATE_WEIGHTED);

    const ocl::Device & dev = ocl::Device::getDefault();
    bool haveMask = !_mask.empty(), doubleSupport = dev.doubleFPConfig() > 0;
    int stype = _src.type(), sdepth = CV_MAT_DEPTH(stype), cn = CV_MAT_CN(stype), ddepth = _dst.depth();
    int kercn = haveMask ? cn : ocl::predictOptimalVectorWidthMax(_src, _src2, _dst), rowsPerWI = dev.isIntel() ? 4 : 1;

    if (!doubleSupport && (sdepth == CV_64F || ddepth == CV_64F))
        return false;

    const char * const opMap[4] = { "ACCUMULATE", "ACCUMULATE_SQUARE", "ACCUMULATE_PRODUCT",
                                    "ACCUMULATE_WEIGHTED" };

    char cvt[40];
    ocl::Kernel k("accumulate", ocl::imgproc::accumulate_oclsrc,
                  format("-D %s%s -D srcT1=%s -D cn=%d -D dstT1=%s%s -D rowsPerWI=%d -D convertToDT=%s",
                         opMap[op_type], haveMask ? " -D HAVE_MASK" : "",
                         ocl::typeToStr(sdepth), kercn, ocl::typeToStr(ddepth),
                         doubleSupport ? " -D DOUBLE_SUPPORT" : "", rowsPerWI,
                         ocl::convertTypeStr(sdepth, ddepth, 1, cvt)));
    if (k.empty())
        return false;

    UMat src = _src.getUMat(), src2 = _src2.getUMat(), dst = _dst.getUMat(), mask = _mask.getUMat();

    ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src),
            src2arg = ocl::KernelArg::ReadOnlyNoSize(src2),
            dstarg = ocl::KernelArg::ReadWrite(dst, cn, kercn),
            maskarg = ocl::KernelArg::ReadOnlyNoSize(mask);

    int argidx = k.set(0, srcarg);
    if (op_type == ACCUMULATE_PRODUCT)
        argidx = k.set(argidx, src2arg);
    argidx = k.set(argidx, dstarg);
    if (op_type == ACCUMULATE_WEIGHTED)
    {
        if (ddepth == CV_32F)
            argidx = k.set(argidx, (float)alpha);
        else
            argidx = k.set(argidx, alpha);
    }
    if (haveMask)
        k.set(argidx, maskarg);

    size_t globalsize[2] = { src.cols * cn / kercn, (src.rows + rowsPerWI - 1) / rowsPerWI };
    return k.run(2, globalsize, NULL, false);
}

#endif

}

void cv::accumulate( InputArray _src, InputOutputArray _dst, InputArray _mask )
{
    int stype = _src.type(), sdepth = CV_MAT_DEPTH(stype), scn = CV_MAT_CN(stype);
    int dtype = _dst.type(), ddepth = CV_MAT_DEPTH(dtype), dcn = CV_MAT_CN(dtype);

    CV_Assert( _src.sameSize(_dst) && dcn == scn );
    CV_Assert( _mask.empty() || (_src.sameSize(_mask) && _mask.type() == CV_8U) );

    CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
               ocl_accumulate(_src, noArray(), _dst, 0.0, _mask, ACCUMULATE))

    Mat src = _src.getMat(), dst = _dst.getMat(), mask = _mask.getMat();

#if defined HAVE_IPP
    CV_IPP_CHECK()
    {
        if (src.dims <= 2 || (src.isContinuous() && dst.isContinuous() && (mask.empty() || mask.isContinuous())))
        {
            typedef IppStatus (CV_STDCALL * ippiAdd)(const void * pSrc, int srcStep, Ipp32f * pSrcDst, int srcdstStep, IppiSize roiSize);
            typedef IppStatus (CV_STDCALL * ippiAddMask)(const void * pSrc, int srcStep, const Ipp8u * pMask, int maskStep, Ipp32f * pSrcDst,
                                                         int srcDstStep, IppiSize roiSize);
            ippiAdd ippFunc = 0;
            ippiAddMask ippFuncMask = 0;

            if (mask.empty())
            {
                CV_SUPPRESS_DEPRECATED_START
                ippFunc = sdepth == CV_8U && ddepth == CV_32F ? (ippiAdd)ippiAdd_8u32f_C1IR :
                          sdepth == CV_16U && ddepth == CV_32F ? (ippiAdd)ippiAdd_16u32f_C1IR :
                          sdepth == CV_32F && ddepth == CV_32F ? (ippiAdd)ippiAdd_32f_C1IR : 0;
                CV_SUPPRESS_DEPRECATED_END
            }
            else if (scn == 1)
            {
                ippFuncMask = sdepth == CV_8U && ddepth == CV_32F ? (ippiAddMask)ippiAdd_8u32f_C1IMR :
                              sdepth == CV_16U && ddepth == CV_32F ? (ippiAddMask)ippiAdd_16u32f_C1IMR :
                              sdepth == CV_32F && ddepth == CV_32F ? (ippiAddMask)ippiAdd_32f_C1IMR : 0;
            }

            if (ippFunc || ippFuncMask)
            {
                IppStatus status = ippStsNoErr;

                Size size = src.size();
                int srcstep = (int)src.step, dststep = (int)dst.step, maskstep = (int)mask.step;
                if (src.isContinuous() && dst.isContinuous() && mask.isContinuous())
                {
                    srcstep = static_cast<int>(src.total() * src.elemSize());
                    dststep = static_cast<int>(dst.total() * dst.elemSize());
                    maskstep = static_cast<int>(mask.total() * mask.elemSize());
                    size.width = static_cast<int>(src.total());
                    size.height = 1;
                }
                size.width *= scn;

                if (mask.empty())
                    status = ippFunc(src.ptr(), srcstep, dst.ptr<Ipp32f>(), dststep, ippiSize(size.width, size.height));
                else
                    status = ippFuncMask(src.ptr(), srcstep, mask.ptr<Ipp8u>(), maskstep,
                                         dst.ptr<Ipp32f>(), dststep, ippiSize(size.width, size.height));

                if (status >= 0)
                {
                    CV_IMPL_ADD(CV_IMPL_IPP);
                    return;
                }
                setIppErrorStatus();
            }
        }
    }
#endif

    int fidx = getAccTabIdx(sdepth, ddepth);
    AccFunc func = fidx >= 0 ? accTab[fidx] : 0;
    CV_Assert( func != 0 );

    const Mat* arrays[] = {&src, &dst, &mask, 0};
    uchar* ptrs[3];
    NAryMatIterator it(arrays, ptrs);
    int len = (int)it.size;

    for( size_t i = 0; i < it.nplanes; i++, ++it )
        func(ptrs[0], ptrs[1], ptrs[2], len, scn);
}

void cv::accumulateSquare( InputArray _src, InputOutputArray _dst, InputArray _mask )
{
    int stype = _src.type(), sdepth = CV_MAT_DEPTH(stype), scn = CV_MAT_CN(stype);
    int dtype = _dst.type(), ddepth = CV_MAT_DEPTH(dtype), dcn = CV_MAT_CN(dtype);

    CV_Assert( _src.sameSize(_dst) && dcn == scn );
    CV_Assert( _mask.empty() || (_src.sameSize(_mask) && _mask.type() == CV_8U) );

    CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
               ocl_accumulate(_src, noArray(), _dst, 0.0, _mask, ACCUMULATE_SQUARE))

    Mat src = _src.getMat(), dst = _dst.getMat(), mask = _mask.getMat();

#if defined(HAVE_IPP)
    CV_IPP_CHECK()
    {
        if (src.dims <= 2 || (src.isContinuous() && dst.isContinuous() && (mask.empty() || mask.isContinuous())))
        {
            typedef IppStatus (CV_STDCALL * ippiAddSquare)(const void * pSrc, int srcStep, Ipp32f * pSrcDst, int srcdstStep, IppiSize roiSize);
            typedef IppStatus (CV_STDCALL * ippiAddSquareMask)(const void * pSrc, int srcStep, const Ipp8u * pMask, int maskStep, Ipp32f * pSrcDst,
                                                               int srcDstStep, IppiSize roiSize);
            ippiAddSquare ippFunc = 0;
            ippiAddSquareMask ippFuncMask = 0;

            if (mask.empty())
            {
                ippFunc = sdepth == CV_8U && ddepth == CV_32F ? (ippiAddSquare)ippiAddSquare_8u32f_C1IR :
                          sdepth == CV_16U && ddepth == CV_32F ? (ippiAddSquare)ippiAddSquare_16u32f_C1IR :
                          sdepth == CV_32F && ddepth == CV_32F ? (ippiAddSquare)ippiAddSquare_32f_C1IR : 0;
            }
            else if (scn == 1)
            {
                ippFuncMask = sdepth == CV_8U && ddepth == CV_32F ? (ippiAddSquareMask)ippiAddSquare_8u32f_C1IMR :
                              sdepth == CV_16U && ddepth == CV_32F ? (ippiAddSquareMask)ippiAddSquare_16u32f_C1IMR :
                              sdepth == CV_32F && ddepth == CV_32F ? (ippiAddSquareMask)ippiAddSquare_32f_C1IMR : 0;
            }

            if (ippFunc || ippFuncMask)
            {
                IppStatus status = ippStsNoErr;

                Size size = src.size();
                int srcstep = (int)src.step, dststep = (int)dst.step, maskstep = (int)mask.step;
                if (src.isContinuous() && dst.isContinuous() && mask.isContinuous())
                {
                    srcstep = static_cast<int>(src.total() * src.elemSize());
                    dststep = static_cast<int>(dst.total() * dst.elemSize());
                    maskstep = static_cast<int>(mask.total() * mask.elemSize());
                    size.width = static_cast<int>(src.total());
                    size.height = 1;
                }
                size.width *= scn;

                if (mask.empty())
                    status = ippFunc(src.ptr(), srcstep, dst.ptr<Ipp32f>(), dststep, ippiSize(size.width, size.height));
                else
                    status = ippFuncMask(src.ptr(), srcstep, mask.ptr<Ipp8u>(), maskstep,
                                         dst.ptr<Ipp32f>(), dststep, ippiSize(size.width, size.height));

                if (status >= 0)
                {
                    CV_IMPL_ADD(CV_IMPL_IPP);
                    return;
                }
                setIppErrorStatus();
            }
        }
    }
#endif

    int fidx = getAccTabIdx(sdepth, ddepth);
    AccFunc func = fidx >= 0 ? accSqrTab[fidx] : 0;
    CV_Assert( func != 0 );

    const Mat* arrays[] = {&src, &dst, &mask, 0};
    uchar* ptrs[3];
    NAryMatIterator it(arrays, ptrs);
    int len = (int)it.size;

    for( size_t i = 0; i < it.nplanes; i++, ++it )
        func(ptrs[0], ptrs[1], ptrs[2], len, scn);
}

void cv::accumulateProduct( InputArray _src1, InputArray _src2,
                            InputOutputArray _dst, InputArray _mask )
{
    int stype = _src1.type(), sdepth = CV_MAT_DEPTH(stype), scn = CV_MAT_CN(stype);
    int dtype = _dst.type(), ddepth = CV_MAT_DEPTH(dtype), dcn = CV_MAT_CN(dtype);

    CV_Assert( _src1.sameSize(_src2) && stype == _src2.type() );
    CV_Assert( _src1.sameSize(_dst) && dcn == scn );
    CV_Assert( _mask.empty() || (_src1.sameSize(_mask) && _mask.type() == CV_8U) );

    CV_OCL_RUN(_src1.dims() <= 2 && _dst.isUMat(),
               ocl_accumulate(_src1, _src2, _dst, 0.0, _mask, ACCUMULATE_PRODUCT))

    Mat src1 = _src1.getMat(), src2 = _src2.getMat(), dst = _dst.getMat(), mask = _mask.getMat();

#if defined(HAVE_IPP)
    CV_IPP_CHECK()
    {
        if (src1.dims <= 2 || (src1.isContinuous() && src2.isContinuous() && dst.isContinuous()))
        {
            typedef IppStatus (CV_STDCALL * ippiAddProduct)(const void * pSrc1, int src1Step, const void * pSrc2,
                                                            int src2Step, Ipp32f * pSrcDst, int srcDstStep, IppiSize roiSize);
            typedef IppStatus (CV_STDCALL * ippiAddProductMask)(const void * pSrc1, int src1Step, const void * pSrc2, int src2Step,
                                                                const Ipp8u * pMask, int maskStep, Ipp32f * pSrcDst, int srcDstStep, IppiSize roiSize);
            ippiAddProduct ippFunc = 0;
            ippiAddProductMask ippFuncMask = 0;

            if (mask.empty())
            {
                ippFunc = sdepth == CV_8U && ddepth == CV_32F ? (ippiAddProduct)ippiAddProduct_8u32f_C1IR :
                          sdepth == CV_16U && ddepth == CV_32F ? (ippiAddProduct)ippiAddProduct_16u32f_C1IR :
                          sdepth == CV_32F && ddepth == CV_32F ? (ippiAddProduct)ippiAddProduct_32f_C1IR : 0;
            }
            else if (scn == 1)
            {
                ippFuncMask = sdepth == CV_8U && ddepth == CV_32F ? (ippiAddProductMask)ippiAddProduct_8u32f_C1IMR :
                              sdepth == CV_16U && ddepth == CV_32F ? (ippiAddProductMask)ippiAddProduct_16u32f_C1IMR :
                              sdepth == CV_32F && ddepth == CV_32F ? (ippiAddProductMask)ippiAddProduct_32f_C1IMR : 0;
            }

            if (ippFunc || ippFuncMask)
            {
                IppStatus status = ippStsNoErr;

                Size size = src1.size();
                int src1step = (int)src1.step, src2step = (int)src2.step, dststep = (int)dst.step, maskstep = (int)mask.step;
                if (src1.isContinuous() && src2.isContinuous() && dst.isContinuous() && mask.isContinuous())
                {
                    src1step = static_cast<int>(src1.total() * src1.elemSize());
                    src2step = static_cast<int>(src2.total() * src2.elemSize());
                    dststep = static_cast<int>(dst.total() * dst.elemSize());
                    maskstep = static_cast<int>(mask.total() * mask.elemSize());
                    size.width = static_cast<int>(src1.total());
                    size.height = 1;
                }
                size.width *= scn;

                if (mask.empty())
                    status = ippFunc(src1.ptr(), src1step, src2.ptr(), src2step, dst.ptr<Ipp32f>(),
                                     dststep, ippiSize(size.width, size.height));
                else
                    status = ippFuncMask(src1.ptr(), src1step, src2.ptr(), src2step, mask.ptr<Ipp8u>(), maskstep,
                                         dst.ptr<Ipp32f>(), dststep, ippiSize(size.width, size.height));

                if (status >= 0)
                {
                    CV_IMPL_ADD(CV_IMPL_IPP);
                    return;
                }
                setIppErrorStatus();
            }
        }
    }
#endif

    int fidx = getAccTabIdx(sdepth, ddepth);
    AccProdFunc func = fidx >= 0 ? accProdTab[fidx] : 0;
    CV_Assert( func != 0 );

    const Mat* arrays[] = {&src1, &src2, &dst, &mask, 0};
    uchar* ptrs[4];
    NAryMatIterator it(arrays, ptrs);
    int len = (int)it.size;

    for( size_t i = 0; i < it.nplanes; i++, ++it )
        func(ptrs[0], ptrs[1], ptrs[2], ptrs[3], len, scn);
}

void cv::accumulateWeighted( InputArray _src, InputOutputArray _dst,
                             double alpha, InputArray _mask )
{
    int stype = _src.type(), sdepth = CV_MAT_DEPTH(stype), scn = CV_MAT_CN(stype);
    int dtype = _dst.type(), ddepth = CV_MAT_DEPTH(dtype), dcn = CV_MAT_CN(dtype);

    CV_Assert( _src.sameSize(_dst) && dcn == scn );
    CV_Assert( _mask.empty() || (_src.sameSize(_mask) && _mask.type() == CV_8U) );

    CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
               ocl_accumulate(_src, noArray(), _dst, alpha, _mask, ACCUMULATE_WEIGHTED))

    Mat src = _src.getMat(), dst = _dst.getMat(), mask = _mask.getMat();

#if defined(HAVE_IPP)
    CV_IPP_CHECK()
    {
        if (src.dims <= 2 || (src.isContinuous() && dst.isContinuous() && mask.isContinuous()))
        {
            typedef IppStatus (CV_STDCALL * ippiAddWeighted)(const void * pSrc, int srcStep, Ipp32f * pSrcDst, int srcdstStep,
                                                             IppiSize roiSize, Ipp32f alpha);
            typedef IppStatus (CV_STDCALL * ippiAddWeightedMask)(const void * pSrc, int srcStep, const Ipp8u * pMask,
                                                                 int maskStep, Ipp32f * pSrcDst,
                                                                 int srcDstStep, IppiSize roiSize, Ipp32f alpha);
            ippiAddWeighted ippFunc = 0;
            ippiAddWeightedMask ippFuncMask = 0;

            if (mask.empty())
            {
                ippFunc = sdepth == CV_8U && ddepth == CV_32F ? (ippiAddWeighted)ippiAddWeighted_8u32f_C1IR :
                          sdepth == CV_16U && ddepth == CV_32F ? (ippiAddWeighted)ippiAddWeighted_16u32f_C1IR :
                          sdepth == CV_32F && ddepth == CV_32F ? (ippiAddWeighted)ippiAddWeighted_32f_C1IR : 0;
            }
            else if (scn == 1)
            {
                ippFuncMask = sdepth == CV_8U && ddepth == CV_32F ? (ippiAddWeightedMask)ippiAddWeighted_8u32f_C1IMR :
                              sdepth == CV_16U && ddepth == CV_32F ? (ippiAddWeightedMask)ippiAddWeighted_16u32f_C1IMR :
                              sdepth == CV_32F && ddepth == CV_32F ? (ippiAddWeightedMask)ippiAddWeighted_32f_C1IMR : 0;
            }

            if (ippFunc || ippFuncMask)
            {
                IppStatus status = ippStsNoErr;

                Size size = src.size();
                int srcstep = (int)src.step, dststep = (int)dst.step, maskstep = (int)mask.step;
                if (src.isContinuous() && dst.isContinuous() && mask.isContinuous())
                {
                    srcstep = static_cast<int>(src.total() * src.elemSize());
                    dststep = static_cast<int>(dst.total() * dst.elemSize());
                    maskstep = static_cast<int>(mask.total() * mask.elemSize());
                    size.width = static_cast<int>(src.total());
                    size.height = 1;
                }
                size.width *= scn;

                if (mask.empty())
                    status = ippFunc(src.ptr(), srcstep, dst.ptr<Ipp32f>(), dststep, ippiSize(size.width, size.height), (Ipp32f)alpha);
                else
                    status = ippFuncMask(src.ptr(), srcstep, mask.ptr<Ipp8u>(), maskstep,
                                         dst.ptr<Ipp32f>(), dststep, ippiSize(size.width, size.height), (Ipp32f)alpha);

                if (status >= 0)
                {
                    CV_IMPL_ADD(CV_IMPL_IPP);
                    return;
                }
                setIppErrorStatus();
            }
        }
    }
#endif

    int fidx = getAccTabIdx(sdepth, ddepth);
    AccWFunc func = fidx >= 0 ? accWTab[fidx] : 0;
    CV_Assert( func != 0 );

    const Mat* arrays[] = {&src, &dst, &mask, 0};
    uchar* ptrs[3];
    NAryMatIterator it(arrays, ptrs);
    int len = (int)it.size;

    for( size_t i = 0; i < it.nplanes; i++, ++it )
        func(ptrs[0], ptrs[1], ptrs[2], len, scn, alpha);
}


CV_IMPL void
cvAcc( const void* arr, void* sumarr, const void* maskarr )
{
    cv::Mat src = cv::cvarrToMat(arr), dst = cv::cvarrToMat(sumarr), mask;
    if( maskarr )
        mask = cv::cvarrToMat(maskarr);
    cv::accumulate( src, dst, mask );
}

CV_IMPL void
cvSquareAcc( const void* arr, void* sumarr, const void* maskarr )
{
    cv::Mat src = cv::cvarrToMat(arr), dst = cv::cvarrToMat(sumarr), mask;
    if( maskarr )
        mask = cv::cvarrToMat(maskarr);
    cv::accumulateSquare( src, dst, mask );
}

CV_IMPL void
cvMultiplyAcc( const void* arr1, const void* arr2,
               void* sumarr, const void* maskarr )
{
    cv::Mat src1 = cv::cvarrToMat(arr1), src2 = cv::cvarrToMat(arr2);
    cv::Mat dst = cv::cvarrToMat(sumarr), mask;
    if( maskarr )
        mask = cv::cvarrToMat(maskarr);
    cv::accumulateProduct( src1, src2, dst, mask );
}

CV_IMPL void
cvRunningAvg( const void* arr, void* sumarr, double alpha, const void* maskarr )
{
    cv::Mat src = cv::cvarrToMat(arr), dst = cv::cvarrToMat(sumarr), mask;
    if( maskarr )
        mask = cv::cvarrToMat(maskarr);
    cv::accumulateWeighted( src, dst, alpha, mask );
}

/* End of file. */