1 /*M/////////////////////////////////////////////////////////////////////////////////////// 2 // 3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. 4 // 5 // By downloading, copying, installing or using the software you agree to this license. 6 // If you do not agree to this license, do not download, install, 7 // copy or use the software. 8 // 9 // 10 // License Agreement 11 // For Open Source Computer Vision Library 12 // 13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. 14 // Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved. 15 // Copyright (C) 2014-2015, Itseez Inc., all rights reserved. 16 // Third party copyrights are property of their respective owners. 17 // 18 // Redistribution and use in source and binary forms, with or without modification, 19 // are permitted provided that the following conditions are met: 20 // 21 // * Redistribution's of source code must retain the above copyright notice, 22 // this list of conditions and the following disclaimer. 23 // 24 // * Redistribution's in binary form must reproduce the above copyright notice, 25 // this list of conditions and the following disclaimer in the documentation 26 // and/or other materials provided with the distribution. 27 // 28 // * The name of the copyright holders may not be used to endorse or promote products 29 // derived from this software without specific prior written permission. 30 // 31 // This software is provided by the copyright holders and contributors "as is" and 32 // any express or implied warranties, including, but not limited to, the implied 33 // warranties of merchantability and fitness for a particular purpose are disclaimed. 34 // In no event shall the Intel Corporation or contributors be liable for any direct, 35 // indirect, incidental, special, exemplary, or consequential damages 36 // (including, but not limited to, procurement of substitute goods or services; 37 // loss of use, data, or profits; or business interruption) however caused 38 // and on any theory of liability, whether in contract, strict liability, 39 // or tort (including negligence or otherwise) arising in any way out of 40 // the use of this software, even if advised of the possibility of such damage. 41 // 42 //M*/ 43 44 /* //////////////////////////////////////////////////////////////////// 45 // 46 // Arithmetic and logical operations: +, -, *, /, &, |, ^, ~, abs ... 
47 // 48 // */ 49 50 #include "precomp.hpp" 51 #include "opencl_kernels_core.hpp" 52 53 namespace cv 54 { 55 56 struct NOP {}; 57 58 #if CV_SSE2 || CV_NEON 59 60 #define FUNCTOR_TEMPLATE(name) \ 61 template<typename T> struct name {} 62 63 FUNCTOR_TEMPLATE(VLoadStore128); 64 #if CV_SSE2 65 FUNCTOR_TEMPLATE(VLoadStore64); 66 FUNCTOR_TEMPLATE(VLoadStore128Aligned); 67 #if CV_AVX2 68 FUNCTOR_TEMPLATE(VLoadStore256); 69 FUNCTOR_TEMPLATE(VLoadStore256Aligned); 70 #endif 71 #endif 72 73 #endif 74 75 template<typename T, class Op, class VOp> 76 void vBinOp(const T* src1, size_t step1, const T* src2, size_t step2, T* dst, size_t step, Size sz) 77 { 78 #if CV_SSE2 || CV_NEON 79 VOp vop; 80 #endif 81 Op op; 82 83 for( ; sz.height--; src1 = (const T *)((const uchar *)src1 + step1), 84 src2 = (const T *)((const uchar *)src2 + step2), 85 dst = (T *)((uchar *)dst + step) ) 86 { 87 int x = 0; 88 89 #if CV_NEON || CV_SSE2 90 #if CV_AVX2 91 if( USE_AVX2 ) 92 { 93 for( ; x <= sz.width - 32/(int)sizeof(T); x += 32/sizeof(T) ) 94 { 95 typename VLoadStore256<T>::reg_type r0 = VLoadStore256<T>::load(src1 + x); 96 r0 = vop(r0, VLoadStore256<T>::load(src2 + x)); 97 VLoadStore256<T>::store(dst + x, r0); 98 } 99 } 100 #else 101 #if CV_SSE2 102 if( USE_SSE2 ) 103 { 104 #endif // CV_SSE2 105 for( ; x <= sz.width - 32/(int)sizeof(T); x += 32/sizeof(T) ) 106 { 107 typename VLoadStore128<T>::reg_type r0 = VLoadStore128<T>::load(src1 + x ); 108 typename VLoadStore128<T>::reg_type r1 = VLoadStore128<T>::load(src1 + x + 16/sizeof(T)); 109 r0 = vop(r0, VLoadStore128<T>::load(src2 + x )); 110 r1 = vop(r1, VLoadStore128<T>::load(src2 + x + 16/sizeof(T))); 111 VLoadStore128<T>::store(dst + x , r0); 112 VLoadStore128<T>::store(dst + x + 16/sizeof(T), r1); 113 } 114 #if CV_SSE2 115 } 116 #endif // CV_SSE2 117 #endif // CV_AVX2 118 #endif // CV_NEON || CV_SSE2 119 120 #if CV_AVX2 121 // nothing 122 #elif CV_SSE2 123 if( USE_SSE2 ) 124 { 125 for( ; x <= sz.width - 8/(int)sizeof(T); x += 8/sizeof(T) ) 126 { 127 typename VLoadStore64<T>::reg_type r = VLoadStore64<T>::load(src1 + x); 128 r = vop(r, VLoadStore64<T>::load(src2 + x)); 129 VLoadStore64<T>::store(dst + x, r); 130 } 131 } 132 #endif 133 134 #if CV_ENABLE_UNROLLED 135 for( ; x <= sz.width - 4; x += 4 ) 136 { 137 T v0 = op(src1[x], src2[x]); 138 T v1 = op(src1[x+1], src2[x+1]); 139 dst[x] = v0; dst[x+1] = v1; 140 v0 = op(src1[x+2], src2[x+2]); 141 v1 = op(src1[x+3], src2[x+3]); 142 dst[x+2] = v0; dst[x+3] = v1; 143 } 144 #endif 145 146 for( ; x < sz.width; x++ ) 147 dst[x] = op(src1[x], src2[x]); 148 } 149 } 150 151 template<typename T, class Op, class Op32> 152 void vBinOp32(const T* src1, size_t step1, const T* src2, size_t step2, 153 T* dst, size_t step, Size sz) 154 { 155 #if CV_SSE2 || CV_NEON 156 Op32 op32; 157 #endif 158 Op op; 159 160 for( ; sz.height--; src1 = (const T *)((const uchar *)src1 + step1), 161 src2 = (const T *)((const uchar *)src2 + step2), 162 dst = (T *)((uchar *)dst + step) ) 163 { 164 int x = 0; 165 166 #if CV_AVX2 167 if( USE_AVX2 ) 168 { 169 if( (((size_t)src1|(size_t)src2|(size_t)dst)&31) == 0 ) 170 { 171 for( ; x <= sz.width - 8; x += 8 ) 172 { 173 typename VLoadStore256Aligned<T>::reg_type r0 = VLoadStore256Aligned<T>::load(src1 + x); 174 r0 = op32(r0, VLoadStore256Aligned<T>::load(src2 + x)); 175 VLoadStore256Aligned<T>::store(dst + x, r0); 176 } 177 } 178 } 179 #elif CV_SSE2 180 if( USE_SSE2 ) 181 { 182 if( (((size_t)src1|(size_t)src2|(size_t)dst)&15) == 0 ) 183 { 184 for( ; x <= sz.width - 8; x += 8 ) 185 { 186 typename 
VLoadStore128Aligned<T>::reg_type r0 = VLoadStore128Aligned<T>::load(src1 + x ); 187 typename VLoadStore128Aligned<T>::reg_type r1 = VLoadStore128Aligned<T>::load(src1 + x + 4); 188 r0 = op32(r0, VLoadStore128Aligned<T>::load(src2 + x )); 189 r1 = op32(r1, VLoadStore128Aligned<T>::load(src2 + x + 4)); 190 VLoadStore128Aligned<T>::store(dst + x , r0); 191 VLoadStore128Aligned<T>::store(dst + x + 4, r1); 192 } 193 } 194 } 195 #endif // CV_AVX2 196 197 #if CV_NEON || CV_SSE2 198 #if CV_AVX2 199 if( USE_AVX2 ) 200 { 201 for( ; x <= sz.width - 8; x += 8 ) 202 { 203 typename VLoadStore256<T>::reg_type r0 = VLoadStore256<T>::load(src1 + x); 204 r0 = op32(r0, VLoadStore256<T>::load(src2 + x)); 205 VLoadStore256<T>::store(dst + x, r0); 206 } 207 } 208 #else 209 #if CV_SSE2 210 if( USE_SSE2 ) 211 { 212 #endif // CV_SSE2 213 for( ; x <= sz.width - 8; x += 8 ) 214 { 215 typename VLoadStore128<T>::reg_type r0 = VLoadStore128<T>::load(src1 + x ); 216 typename VLoadStore128<T>::reg_type r1 = VLoadStore128<T>::load(src1 + x + 4); 217 r0 = op32(r0, VLoadStore128<T>::load(src2 + x )); 218 r1 = op32(r1, VLoadStore128<T>::load(src2 + x + 4)); 219 VLoadStore128<T>::store(dst + x , r0); 220 VLoadStore128<T>::store(dst + x + 4, r1); 221 } 222 #if CV_SSE2 223 } 224 #endif // CV_SSE2 225 #endif // CV_AVX2 226 #endif // CV_NEON || CV_SSE2 227 228 #if CV_ENABLE_UNROLLED 229 for( ; x <= sz.width - 4; x += 4 ) 230 { 231 T v0 = op(src1[x], src2[x]); 232 T v1 = op(src1[x+1], src2[x+1]); 233 dst[x] = v0; dst[x+1] = v1; 234 v0 = op(src1[x+2], src2[x+2]); 235 v1 = op(src1[x+3], src2[x+3]); 236 dst[x+2] = v0; dst[x+3] = v1; 237 } 238 #endif 239 240 for( ; x < sz.width; x++ ) 241 dst[x] = op(src1[x], src2[x]); 242 } 243 } 244 245 246 template<typename T, class Op, class Op64> 247 void vBinOp64(const T* src1, size_t step1, const T* src2, size_t step2, 248 T* dst, size_t step, Size sz) 249 { 250 #if CV_SSE2 251 Op64 op64; 252 #endif 253 Op op; 254 255 for( ; sz.height--; src1 = (const T *)((const uchar *)src1 + step1), 256 src2 = (const T *)((const uchar *)src2 + step2), 257 dst = (T *)((uchar *)dst + step) ) 258 { 259 int x = 0; 260 261 #if CV_AVX2 262 if( USE_AVX2 ) 263 { 264 if( (((size_t)src1|(size_t)src2|(size_t)dst)&31) == 0 ) 265 { 266 for( ; x <= sz.width - 4; x += 4 ) 267 { 268 typename VLoadStore256Aligned<T>::reg_type r0 = VLoadStore256Aligned<T>::load(src1 + x); 269 r0 = op64(r0, VLoadStore256Aligned<T>::load(src2 + x)); 270 VLoadStore256Aligned<T>::store(dst + x, r0); 271 } 272 } 273 } 274 #elif CV_SSE2 275 if( USE_SSE2 ) 276 { 277 if( (((size_t)src1|(size_t)src2|(size_t)dst)&15) == 0 ) 278 { 279 for( ; x <= sz.width - 4; x += 4 ) 280 { 281 typename VLoadStore128Aligned<T>::reg_type r0 = VLoadStore128Aligned<T>::load(src1 + x ); 282 typename VLoadStore128Aligned<T>::reg_type r1 = VLoadStore128Aligned<T>::load(src1 + x + 2); 283 r0 = op64(r0, VLoadStore128Aligned<T>::load(src2 + x )); 284 r1 = op64(r1, VLoadStore128Aligned<T>::load(src2 + x + 2)); 285 VLoadStore128Aligned<T>::store(dst + x , r0); 286 VLoadStore128Aligned<T>::store(dst + x + 2, r1); 287 } 288 } 289 } 290 #endif 291 292 for( ; x <= sz.width - 4; x += 4 ) 293 { 294 T v0 = op(src1[x], src2[x]); 295 T v1 = op(src1[x+1], src2[x+1]); 296 dst[x] = v0; dst[x+1] = v1; 297 v0 = op(src1[x+2], src2[x+2]); 298 v1 = op(src1[x+3], src2[x+3]); 299 dst[x+2] = v0; dst[x+3] = v1; 300 } 301 302 for( ; x < sz.width; x++ ) 303 dst[x] = op(src1[x], src2[x]); 304 } 305 } 306 307 #if CV_AVX2 308 309 #define FUNCTOR_LOADSTORE_CAST(name, template_arg, register_type, 
load_body, store_body) \ 310 template <> \ 311 struct name<template_arg>{ \ 312 typedef register_type reg_type; \ 313 static reg_type load(const template_arg * p) { return load_body ((const reg_type *)p); } \ 314 static void store(template_arg * p, reg_type v) { store_body ((reg_type *)p, v); } \ 315 } 316 317 #define FUNCTOR_LOADSTORE(name, template_arg, register_type, load_body, store_body) \ 318 template <> \ 319 struct name<template_arg>{ \ 320 typedef register_type reg_type; \ 321 static reg_type load(const template_arg * p) { return load_body (p); } \ 322 static void store(template_arg * p, reg_type v) { store_body (p, v); } \ 323 } 324 325 #define FUNCTOR_CLOSURE_2arg(name, template_arg, body) \ 326 template<> \ 327 struct name<template_arg> \ 328 { \ 329 VLoadStore256<template_arg>::reg_type operator()( \ 330 const VLoadStore256<template_arg>::reg_type & a, \ 331 const VLoadStore256<template_arg>::reg_type & b) const \ 332 { \ 333 body; \ 334 } \ 335 } 336 337 #define FUNCTOR_CLOSURE_1arg(name, template_arg, body) \ 338 template<> \ 339 struct name<template_arg> \ 340 { \ 341 VLoadStore256<template_arg>::reg_type operator()( \ 342 const VLoadStore256<template_arg>::reg_type & a, \ 343 const VLoadStore256<template_arg>::reg_type & ) const \ 344 { \ 345 body; \ 346 } \ 347 } 348 349 FUNCTOR_LOADSTORE_CAST(VLoadStore256, uchar, __m256i, _mm256_loadu_si256, _mm256_storeu_si256); 350 FUNCTOR_LOADSTORE_CAST(VLoadStore256, schar, __m256i, _mm256_loadu_si256, _mm256_storeu_si256); 351 FUNCTOR_LOADSTORE_CAST(VLoadStore256, ushort, __m256i, _mm256_loadu_si256, _mm256_storeu_si256); 352 FUNCTOR_LOADSTORE_CAST(VLoadStore256, short, __m256i, _mm256_loadu_si256, _mm256_storeu_si256); 353 FUNCTOR_LOADSTORE_CAST(VLoadStore256, int, __m256i, _mm256_loadu_si256, _mm256_storeu_si256); 354 FUNCTOR_LOADSTORE( VLoadStore256, float, __m256 , _mm256_loadu_ps , _mm256_storeu_ps ); 355 FUNCTOR_LOADSTORE( VLoadStore256, double, __m256d, _mm256_loadu_pd , _mm256_storeu_pd ); 356 357 FUNCTOR_LOADSTORE_CAST(VLoadStore256Aligned, int, __m256i, _mm256_load_si256, _mm256_store_si256); 358 FUNCTOR_LOADSTORE( VLoadStore256Aligned, float, __m256 , _mm256_load_ps , _mm256_store_ps ); 359 FUNCTOR_LOADSTORE( VLoadStore256Aligned, double, __m256d, _mm256_load_pd , _mm256_store_pd ); 360 361 FUNCTOR_TEMPLATE(VAdd); 362 FUNCTOR_CLOSURE_2arg(VAdd, uchar, return _mm256_adds_epu8 (a, b)); 363 FUNCTOR_CLOSURE_2arg(VAdd, schar, return _mm256_adds_epi8 (a, b)); 364 FUNCTOR_CLOSURE_2arg(VAdd, ushort, return _mm256_adds_epu16(a, b)); 365 FUNCTOR_CLOSURE_2arg(VAdd, short, return _mm256_adds_epi16(a, b)); 366 FUNCTOR_CLOSURE_2arg(VAdd, int, return _mm256_add_epi32 (a, b)); 367 FUNCTOR_CLOSURE_2arg(VAdd, float, return _mm256_add_ps (a, b)); 368 FUNCTOR_CLOSURE_2arg(VAdd, double, return _mm256_add_pd (a, b)); 369 370 FUNCTOR_TEMPLATE(VSub); 371 FUNCTOR_CLOSURE_2arg(VSub, uchar, return _mm256_subs_epu8 (a, b)); 372 FUNCTOR_CLOSURE_2arg(VSub, schar, return _mm256_subs_epi8 (a, b)); 373 FUNCTOR_CLOSURE_2arg(VSub, ushort, return _mm256_subs_epu16(a, b)); 374 FUNCTOR_CLOSURE_2arg(VSub, short, return _mm256_subs_epi16(a, b)); 375 FUNCTOR_CLOSURE_2arg(VSub, int, return _mm256_sub_epi32 (a, b)); 376 FUNCTOR_CLOSURE_2arg(VSub, float, return _mm256_sub_ps (a, b)); 377 FUNCTOR_CLOSURE_2arg(VSub, double, return _mm256_sub_pd (a, b)); 378 379 FUNCTOR_TEMPLATE(VMin); 380 FUNCTOR_CLOSURE_2arg(VMin, uchar, return _mm256_min_epu8 (a, b)); 381 FUNCTOR_CLOSURE_2arg(VMin, schar, return _mm256_min_epi8 (a, b)); 382 FUNCTOR_CLOSURE_2arg(VMin, ushort, 
return _mm256_min_epi16(a, b)); 383 FUNCTOR_CLOSURE_2arg(VMin, short, return _mm256_min_epi16(a, b)); 384 FUNCTOR_CLOSURE_2arg(VMin, int, return _mm256_min_epi32(a, b)); 385 FUNCTOR_CLOSURE_2arg(VMin, float, return _mm256_min_ps (a, b)); 386 FUNCTOR_CLOSURE_2arg(VMin, double, return _mm256_min_pd (a, b)); 387 388 FUNCTOR_TEMPLATE(VMax); 389 FUNCTOR_CLOSURE_2arg(VMax, uchar, return _mm256_max_epu8 (a, b)); 390 FUNCTOR_CLOSURE_2arg(VMax, schar, return _mm256_max_epi8 (a, b)); 391 FUNCTOR_CLOSURE_2arg(VMax, ushort, return _mm256_max_epu16(a, b)); 392 FUNCTOR_CLOSURE_2arg(VMax, short, return _mm256_max_epi16(a, b)); 393 FUNCTOR_CLOSURE_2arg(VMax, int, return _mm256_max_epi32(a, b)); 394 FUNCTOR_CLOSURE_2arg(VMax, float, return _mm256_max_ps (a, b)); 395 FUNCTOR_CLOSURE_2arg(VMax, double, return _mm256_max_pd (a, b)); 396 397 398 static unsigned int CV_DECL_ALIGNED(32) v32f_absmask[] = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 399 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff }; 400 static unsigned int CV_DECL_ALIGNED(32) v64f_absmask[] = { 0xffffffff, 0x7fffffff, 0xffffffff, 0x7fffffff, 401 0xffffffff, 0x7fffffff, 0xffffffff, 0x7fffffff }; 402 403 FUNCTOR_TEMPLATE(VAbsDiff); 404 FUNCTOR_CLOSURE_2arg(VAbsDiff, uchar, 405 return _mm256_add_epi8(_mm256_subs_epu8(a, b), _mm256_subs_epu8(b, a)); 406 ); 407 FUNCTOR_CLOSURE_2arg(VAbsDiff, schar, 408 __m256i d = _mm256_subs_epi8(a, b); 409 __m256i m = _mm256_cmpgt_epi8(b, a); 410 return _mm256_subs_epi8(_mm256_xor_si256(d, m), m); 411 ); 412 FUNCTOR_CLOSURE_2arg(VAbsDiff, ushort, 413 return _mm256_add_epi16(_mm256_subs_epu16(a, b), _mm256_subs_epu16(b, a)); 414 ); 415 FUNCTOR_CLOSURE_2arg(VAbsDiff, short, 416 __m256i M = _mm256_max_epi16(a, b); 417 __m256i m = _mm256_min_epi16(a, b); 418 return _mm256_subs_epi16(M, m); 419 ); 420 FUNCTOR_CLOSURE_2arg(VAbsDiff, int, 421 __m256i d = _mm256_sub_epi32(a, b); 422 __m256i m = _mm256_cmpgt_epi32(b, a); 423 return _mm256_sub_epi32(_mm256_xor_si256(d, m), m); 424 ); 425 FUNCTOR_CLOSURE_2arg(VAbsDiff, float, 426 return _mm256_and_ps(_mm256_sub_ps(a, b), *(const __m256*)v32f_absmask); 427 ); 428 FUNCTOR_CLOSURE_2arg(VAbsDiff, double, 429 return _mm256_and_pd(_mm256_sub_pd(a, b), *(const __m256d*)v64f_absmask); 430 ); 431 432 FUNCTOR_TEMPLATE(VAnd); 433 FUNCTOR_CLOSURE_2arg(VAnd, uchar, return _mm256_and_si256(a, b)); 434 FUNCTOR_TEMPLATE(VOr); 435 FUNCTOR_CLOSURE_2arg(VOr , uchar, return _mm256_or_si256 (a, b)); 436 FUNCTOR_TEMPLATE(VXor); 437 FUNCTOR_CLOSURE_2arg(VXor, uchar, return _mm256_xor_si256(a, b)); 438 FUNCTOR_TEMPLATE(VNot); 439 FUNCTOR_CLOSURE_1arg(VNot, uchar, return _mm256_xor_si256(_mm256_set1_epi32(-1), a)); 440 441 #elif CV_SSE2 442 443 #define FUNCTOR_LOADSTORE_CAST(name, template_arg, register_type, load_body, store_body)\ 444 template <> \ 445 struct name<template_arg>{ \ 446 typedef register_type reg_type; \ 447 static reg_type load(const template_arg * p) { return load_body ((const reg_type *)p); } \ 448 static void store(template_arg * p, reg_type v) { store_body ((reg_type *)p, v); } \ 449 } 450 451 #define FUNCTOR_LOADSTORE(name, template_arg, register_type, load_body, store_body)\ 452 template <> \ 453 struct name<template_arg>{ \ 454 typedef register_type reg_type; \ 455 static reg_type load(const template_arg * p) { return load_body (p); } \ 456 static void store(template_arg * p, reg_type v) { store_body (p, v); } \ 457 } 458 459 #define FUNCTOR_CLOSURE_2arg(name, template_arg, body)\ 460 template<> \ 461 struct name<template_arg> \ 462 { \ 463 
VLoadStore128<template_arg>::reg_type operator()( \ 464 const VLoadStore128<template_arg>::reg_type & a, \ 465 const VLoadStore128<template_arg>::reg_type & b) const \ 466 { \ 467 body; \ 468 } \ 469 } 470 471 #define FUNCTOR_CLOSURE_1arg(name, template_arg, body)\ 472 template<> \ 473 struct name<template_arg> \ 474 { \ 475 VLoadStore128<template_arg>::reg_type operator()( \ 476 const VLoadStore128<template_arg>::reg_type & a, \ 477 const VLoadStore128<template_arg>::reg_type & ) const \ 478 { \ 479 body; \ 480 } \ 481 } 482 483 FUNCTOR_LOADSTORE_CAST(VLoadStore128, uchar, __m128i, _mm_loadu_si128, _mm_storeu_si128); 484 FUNCTOR_LOADSTORE_CAST(VLoadStore128, schar, __m128i, _mm_loadu_si128, _mm_storeu_si128); 485 FUNCTOR_LOADSTORE_CAST(VLoadStore128, ushort, __m128i, _mm_loadu_si128, _mm_storeu_si128); 486 FUNCTOR_LOADSTORE_CAST(VLoadStore128, short, __m128i, _mm_loadu_si128, _mm_storeu_si128); 487 FUNCTOR_LOADSTORE_CAST(VLoadStore128, int, __m128i, _mm_loadu_si128, _mm_storeu_si128); 488 FUNCTOR_LOADSTORE( VLoadStore128, float, __m128 , _mm_loadu_ps , _mm_storeu_ps ); 489 FUNCTOR_LOADSTORE( VLoadStore128, double, __m128d, _mm_loadu_pd , _mm_storeu_pd ); 490 491 FUNCTOR_LOADSTORE_CAST(VLoadStore64, uchar, __m128i, _mm_loadl_epi64, _mm_storel_epi64); 492 FUNCTOR_LOADSTORE_CAST(VLoadStore64, schar, __m128i, _mm_loadl_epi64, _mm_storel_epi64); 493 FUNCTOR_LOADSTORE_CAST(VLoadStore64, ushort, __m128i, _mm_loadl_epi64, _mm_storel_epi64); 494 FUNCTOR_LOADSTORE_CAST(VLoadStore64, short, __m128i, _mm_loadl_epi64, _mm_storel_epi64); 495 496 FUNCTOR_LOADSTORE_CAST(VLoadStore128Aligned, int, __m128i, _mm_load_si128, _mm_store_si128); 497 FUNCTOR_LOADSTORE( VLoadStore128Aligned, float, __m128 , _mm_load_ps , _mm_store_ps ); 498 FUNCTOR_LOADSTORE( VLoadStore128Aligned, double, __m128d, _mm_load_pd , _mm_store_pd ); 499 500 FUNCTOR_TEMPLATE(VAdd); 501 FUNCTOR_CLOSURE_2arg(VAdd, uchar, return _mm_adds_epu8 (a, b)); 502 FUNCTOR_CLOSURE_2arg(VAdd, schar, return _mm_adds_epi8 (a, b)); 503 FUNCTOR_CLOSURE_2arg(VAdd, ushort, return _mm_adds_epu16(a, b)); 504 FUNCTOR_CLOSURE_2arg(VAdd, short, return _mm_adds_epi16(a, b)); 505 FUNCTOR_CLOSURE_2arg(VAdd, int, return _mm_add_epi32 (a, b)); 506 FUNCTOR_CLOSURE_2arg(VAdd, float, return _mm_add_ps (a, b)); 507 FUNCTOR_CLOSURE_2arg(VAdd, double, return _mm_add_pd (a, b)); 508 509 FUNCTOR_TEMPLATE(VSub); 510 FUNCTOR_CLOSURE_2arg(VSub, uchar, return _mm_subs_epu8 (a, b)); 511 FUNCTOR_CLOSURE_2arg(VSub, schar, return _mm_subs_epi8 (a, b)); 512 FUNCTOR_CLOSURE_2arg(VSub, ushort, return _mm_subs_epu16(a, b)); 513 FUNCTOR_CLOSURE_2arg(VSub, short, return _mm_subs_epi16(a, b)); 514 FUNCTOR_CLOSURE_2arg(VSub, int, return _mm_sub_epi32 (a, b)); 515 FUNCTOR_CLOSURE_2arg(VSub, float, return _mm_sub_ps (a, b)); 516 FUNCTOR_CLOSURE_2arg(VSub, double, return _mm_sub_pd (a, b)); 517 518 FUNCTOR_TEMPLATE(VMin); 519 FUNCTOR_CLOSURE_2arg(VMin, uchar, return _mm_min_epu8(a, b)); 520 FUNCTOR_CLOSURE_2arg(VMin, schar, 521 __m128i m = _mm_cmpgt_epi8(a, b); 522 return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m)); 523 ); 524 FUNCTOR_CLOSURE_2arg(VMin, ushort, return _mm_subs_epu16(a, _mm_subs_epu16(a, b))); 525 FUNCTOR_CLOSURE_2arg(VMin, short, return _mm_min_epi16(a, b)); 526 FUNCTOR_CLOSURE_2arg(VMin, int, 527 __m128i m = _mm_cmpgt_epi32(a, b); 528 return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m)); 529 ); 530 FUNCTOR_CLOSURE_2arg(VMin, float, return _mm_min_ps(a, b)); 531 FUNCTOR_CLOSURE_2arg(VMin, double, return _mm_min_pd(a, b)); 532 533 FUNCTOR_TEMPLATE(VMax); 
534 FUNCTOR_CLOSURE_2arg(VMax, uchar, return _mm_max_epu8(a, b)); 535 FUNCTOR_CLOSURE_2arg(VMax, schar, 536 __m128i m = _mm_cmpgt_epi8(b, a); 537 return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m)); 538 ); 539 FUNCTOR_CLOSURE_2arg(VMax, ushort, return _mm_adds_epu16(_mm_subs_epu16(a, b), b)); 540 FUNCTOR_CLOSURE_2arg(VMax, short, return _mm_max_epi16(a, b)); 541 FUNCTOR_CLOSURE_2arg(VMax, int, 542 __m128i m = _mm_cmpgt_epi32(b, a); 543 return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m)); 544 ); 545 FUNCTOR_CLOSURE_2arg(VMax, float, return _mm_max_ps(a, b)); 546 FUNCTOR_CLOSURE_2arg(VMax, double, return _mm_max_pd(a, b)); 547 548 549 static unsigned int CV_DECL_ALIGNED(16) v32f_absmask[] = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff }; 550 static unsigned int CV_DECL_ALIGNED(16) v64f_absmask[] = { 0xffffffff, 0x7fffffff, 0xffffffff, 0x7fffffff }; 551 552 FUNCTOR_TEMPLATE(VAbsDiff); 553 FUNCTOR_CLOSURE_2arg(VAbsDiff, uchar, 554 return _mm_add_epi8(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a)); 555 ); 556 FUNCTOR_CLOSURE_2arg(VAbsDiff, schar, 557 __m128i d = _mm_subs_epi8(a, b); 558 __m128i m = _mm_cmpgt_epi8(b, a); 559 return _mm_subs_epi8(_mm_xor_si128(d, m), m); 560 ); 561 FUNCTOR_CLOSURE_2arg(VAbsDiff, ushort, 562 return _mm_add_epi16(_mm_subs_epu16(a, b), _mm_subs_epu16(b, a)); 563 ); 564 FUNCTOR_CLOSURE_2arg(VAbsDiff, short, 565 __m128i M = _mm_max_epi16(a, b); 566 __m128i m = _mm_min_epi16(a, b); 567 return _mm_subs_epi16(M, m); 568 ); 569 FUNCTOR_CLOSURE_2arg(VAbsDiff, int, 570 __m128i d = _mm_sub_epi32(a, b); 571 __m128i m = _mm_cmpgt_epi32(b, a); 572 return _mm_sub_epi32(_mm_xor_si128(d, m), m); 573 ); 574 FUNCTOR_CLOSURE_2arg(VAbsDiff, float, 575 return _mm_and_ps(_mm_sub_ps(a,b), *(const __m128*)v32f_absmask); 576 ); 577 FUNCTOR_CLOSURE_2arg(VAbsDiff, double, 578 return _mm_and_pd(_mm_sub_pd(a,b), *(const __m128d*)v64f_absmask); 579 ); 580 581 FUNCTOR_TEMPLATE(VAnd); 582 FUNCTOR_CLOSURE_2arg(VAnd, uchar, return _mm_and_si128(a, b)); 583 FUNCTOR_TEMPLATE(VOr); 584 FUNCTOR_CLOSURE_2arg(VOr , uchar, return _mm_or_si128 (a, b)); 585 FUNCTOR_TEMPLATE(VXor); 586 FUNCTOR_CLOSURE_2arg(VXor, uchar, return _mm_xor_si128(a, b)); 587 FUNCTOR_TEMPLATE(VNot); 588 FUNCTOR_CLOSURE_1arg(VNot, uchar, return _mm_xor_si128(_mm_set1_epi32(-1), a)); 589 #endif 590 591 #if CV_NEON 592 593 #define FUNCTOR_LOADSTORE(name, template_arg, register_type, load_body, store_body)\ 594 template <> \ 595 struct name<template_arg>{ \ 596 typedef register_type reg_type; \ 597 static reg_type load(const template_arg * p) { return load_body (p);}; \ 598 static void store(template_arg * p, reg_type v) { store_body (p, v);}; \ 599 } 600 601 #define FUNCTOR_CLOSURE_2arg(name, template_arg, body)\ 602 template<> \ 603 struct name<template_arg> \ 604 { \ 605 VLoadStore128<template_arg>::reg_type operator()( \ 606 VLoadStore128<template_arg>::reg_type a, \ 607 VLoadStore128<template_arg>::reg_type b) const \ 608 { \ 609 return body; \ 610 }; \ 611 } 612 613 #define FUNCTOR_CLOSURE_1arg(name, template_arg, body)\ 614 template<> \ 615 struct name<template_arg> \ 616 { \ 617 VLoadStore128<template_arg>::reg_type operator()( \ 618 VLoadStore128<template_arg>::reg_type a, \ 619 VLoadStore128<template_arg>::reg_type ) const \ 620 { \ 621 return body; \ 622 }; \ 623 } 624 625 FUNCTOR_LOADSTORE(VLoadStore128, uchar, uint8x16_t, vld1q_u8 , vst1q_u8 ); 626 FUNCTOR_LOADSTORE(VLoadStore128, schar, int8x16_t, vld1q_s8 , vst1q_s8 ); 627 FUNCTOR_LOADSTORE(VLoadStore128, ushort, uint16x8_t, vld1q_u16, vst1q_u16); 628 
FUNCTOR_LOADSTORE(VLoadStore128, short, int16x8_t, vld1q_s16, vst1q_s16); 629 FUNCTOR_LOADSTORE(VLoadStore128, int, int32x4_t, vld1q_s32, vst1q_s32); 630 FUNCTOR_LOADSTORE(VLoadStore128, float, float32x4_t, vld1q_f32, vst1q_f32); 631 632 FUNCTOR_TEMPLATE(VAdd); 633 FUNCTOR_CLOSURE_2arg(VAdd, uchar, vqaddq_u8 (a, b)); 634 FUNCTOR_CLOSURE_2arg(VAdd, schar, vqaddq_s8 (a, b)); 635 FUNCTOR_CLOSURE_2arg(VAdd, ushort, vqaddq_u16(a, b)); 636 FUNCTOR_CLOSURE_2arg(VAdd, short, vqaddq_s16(a, b)); 637 FUNCTOR_CLOSURE_2arg(VAdd, int, vaddq_s32 (a, b)); 638 FUNCTOR_CLOSURE_2arg(VAdd, float, vaddq_f32 (a, b)); 639 640 FUNCTOR_TEMPLATE(VSub); 641 FUNCTOR_CLOSURE_2arg(VSub, uchar, vqsubq_u8 (a, b)); 642 FUNCTOR_CLOSURE_2arg(VSub, schar, vqsubq_s8 (a, b)); 643 FUNCTOR_CLOSURE_2arg(VSub, ushort, vqsubq_u16(a, b)); 644 FUNCTOR_CLOSURE_2arg(VSub, short, vqsubq_s16(a, b)); 645 FUNCTOR_CLOSURE_2arg(VSub, int, vsubq_s32 (a, b)); 646 FUNCTOR_CLOSURE_2arg(VSub, float, vsubq_f32 (a, b)); 647 648 FUNCTOR_TEMPLATE(VMin); 649 FUNCTOR_CLOSURE_2arg(VMin, uchar, vminq_u8 (a, b)); 650 FUNCTOR_CLOSURE_2arg(VMin, schar, vminq_s8 (a, b)); 651 FUNCTOR_CLOSURE_2arg(VMin, ushort, vminq_u16(a, b)); 652 FUNCTOR_CLOSURE_2arg(VMin, short, vminq_s16(a, b)); 653 FUNCTOR_CLOSURE_2arg(VMin, int, vminq_s32(a, b)); 654 FUNCTOR_CLOSURE_2arg(VMin, float, vminq_f32(a, b)); 655 656 FUNCTOR_TEMPLATE(VMax); 657 FUNCTOR_CLOSURE_2arg(VMax, uchar, vmaxq_u8 (a, b)); 658 FUNCTOR_CLOSURE_2arg(VMax, schar, vmaxq_s8 (a, b)); 659 FUNCTOR_CLOSURE_2arg(VMax, ushort, vmaxq_u16(a, b)); 660 FUNCTOR_CLOSURE_2arg(VMax, short, vmaxq_s16(a, b)); 661 FUNCTOR_CLOSURE_2arg(VMax, int, vmaxq_s32(a, b)); 662 FUNCTOR_CLOSURE_2arg(VMax, float, vmaxq_f32(a, b)); 663 664 FUNCTOR_TEMPLATE(VAbsDiff); 665 FUNCTOR_CLOSURE_2arg(VAbsDiff, uchar, vabdq_u8 (a, b)); 666 FUNCTOR_CLOSURE_2arg(VAbsDiff, schar, vqabsq_s8 (vqsubq_s8(a, b))); 667 FUNCTOR_CLOSURE_2arg(VAbsDiff, ushort, vabdq_u16 (a, b)); 668 FUNCTOR_CLOSURE_2arg(VAbsDiff, short, vqabsq_s16(vqsubq_s16(a, b))); 669 FUNCTOR_CLOSURE_2arg(VAbsDiff, int, vabdq_s32 (a, b)); 670 FUNCTOR_CLOSURE_2arg(VAbsDiff, float, vabdq_f32 (a, b)); 671 672 FUNCTOR_TEMPLATE(VAnd); 673 FUNCTOR_CLOSURE_2arg(VAnd, uchar, vandq_u8(a, b)); 674 FUNCTOR_TEMPLATE(VOr); 675 FUNCTOR_CLOSURE_2arg(VOr , uchar, vorrq_u8(a, b)); 676 FUNCTOR_TEMPLATE(VXor); 677 FUNCTOR_CLOSURE_2arg(VXor, uchar, veorq_u8(a, b)); 678 FUNCTOR_TEMPLATE(VNot); 679 FUNCTOR_CLOSURE_1arg(VNot, uchar, vmvnq_u8(a )); 680 #endif 681 682 #if CV_SSE2 || CV_NEON 683 #define IF_SIMD(op) op 684 #else 685 #define IF_SIMD(op) NOP 686 #endif 687 688 template<> inline uchar OpAdd<uchar>::operator ()(uchar a, uchar b) const 689 { return CV_FAST_CAST_8U(a + b); } 690 template<> inline uchar OpSub<uchar>::operator ()(uchar a, uchar b) const 691 { return CV_FAST_CAST_8U(a - b); } 692 693 template<typename T> struct OpAbsDiff 694 { 695 typedef T type1; 696 typedef T type2; 697 typedef T rtype; 698 T operator()(T a, T b) const { return (T)std::abs(a - b); } 699 }; 700 701 template<> inline short OpAbsDiff<short>::operator ()(short a, short b) const 702 { return saturate_cast<short>(std::abs(a - b)); } 703 704 template<> inline schar OpAbsDiff<schar>::operator ()(schar a, schar b) const 705 { return saturate_cast<schar>(std::abs(a - b)); } 706 707 template<typename T, typename WT=T> struct OpAbsDiffS 708 { 709 typedef T type1; 710 typedef WT type2; 711 typedef T rtype; 712 T operator()(T a, WT b) const { return saturate_cast<T>(std::abs(a - b)); } 713 }; 714 715 template<typename T> struct OpAnd 716 { 
717 typedef T type1; 718 typedef T type2; 719 typedef T rtype; 720 T operator()( T a, T b ) const { return a & b; } 721 }; 722 723 template<typename T> struct OpOr 724 { 725 typedef T type1; 726 typedef T type2; 727 typedef T rtype; 728 T operator()( T a, T b ) const { return a | b; } 729 }; 730 731 template<typename T> struct OpXor 732 { 733 typedef T type1; 734 typedef T type2; 735 typedef T rtype; 736 T operator()( T a, T b ) const { return a ^ b; } 737 }; 738 739 template<typename T> struct OpNot 740 { 741 typedef T type1; 742 typedef T type2; 743 typedef T rtype; 744 T operator()( T a, T ) const { return ~a; } 745 }; 746 747 #if (ARITHM_USE_IPP == 1) 748 static inline void fixSteps(Size sz, size_t elemSize, size_t& step1, size_t& step2, size_t& step) 749 { 750 if( sz.height == 1 ) 751 step1 = step2 = step = sz.width*elemSize; 752 } 753 #endif 754 755 static void add8u( const uchar* src1, size_t step1, 756 const uchar* src2, size_t step2, 757 uchar* dst, size_t step, Size sz, void* ) 758 { 759 #if (ARITHM_USE_IPP == 1) 760 CV_IPP_CHECK() 761 { 762 fixSteps(sz, sizeof(dst[0]), step1, step2, step); 763 if (0 <= ippiAdd_8u_C1RSfs(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz), 0)) 764 { 765 CV_IMPL_ADD(CV_IMPL_IPP); 766 return; 767 } 768 setIppErrorStatus(); 769 } 770 #endif 771 (vBinOp<uchar, OpAdd<uchar>, IF_SIMD(VAdd<uchar>)>(src1, step1, src2, step2, dst, step, sz)); 772 } 773 774 static void add8s( const schar* src1, size_t step1, 775 const schar* src2, size_t step2, 776 schar* dst, size_t step, Size sz, void* ) 777 { 778 vBinOp<schar, OpAdd<schar>, IF_SIMD(VAdd<schar>)>(src1, step1, src2, step2, dst, step, sz); 779 } 780 781 static void add16u( const ushort* src1, size_t step1, 782 const ushort* src2, size_t step2, 783 ushort* dst, size_t step, Size sz, void* ) 784 { 785 #if (ARITHM_USE_IPP == 1) 786 CV_IPP_CHECK() 787 { 788 fixSteps(sz, sizeof(dst[0]), step1, step2, step); 789 if (0 <= ippiAdd_16u_C1RSfs(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz), 0)) 790 { 791 CV_IMPL_ADD(CV_IMPL_IPP); 792 return; 793 } 794 setIppErrorStatus(); 795 } 796 #endif 797 (vBinOp<ushort, OpAdd<ushort>, IF_SIMD(VAdd<ushort>)>(src1, step1, src2, step2, dst, step, sz)); 798 } 799 800 static void add16s( const short* src1, size_t step1, 801 const short* src2, size_t step2, 802 short* dst, size_t step, Size sz, void* ) 803 { 804 #if (ARITHM_USE_IPP == 1) 805 CV_IPP_CHECK() 806 { 807 fixSteps(sz, sizeof(dst[0]), step1, step2, step); 808 if (0 <= ippiAdd_16s_C1RSfs(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz), 0)) 809 { 810 CV_IMPL_ADD(CV_IMPL_IPP); 811 return; 812 } 813 setIppErrorStatus(); 814 } 815 #endif 816 (vBinOp<short, OpAdd<short>, IF_SIMD(VAdd<short>)>(src1, step1, src2, step2, dst, step, sz)); 817 } 818 819 static void add32s( const int* src1, size_t step1, 820 const int* src2, size_t step2, 821 int* dst, size_t step, Size sz, void* ) 822 { 823 vBinOp32<int, OpAdd<int>, IF_SIMD(VAdd<int>)>(src1, step1, src2, step2, dst, step, sz); 824 } 825 826 static void add32f( const float* src1, size_t step1, 827 const float* src2, size_t step2, 828 float* dst, size_t step, Size sz, void* ) 829 { 830 #if (ARITHM_USE_IPP == 1) 831 CV_IPP_CHECK() 832 { 833 fixSteps(sz, sizeof(dst[0]), step1, step2, step); 834 if (0 <= ippiAdd_32f_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz))) 835 { 836 CV_IMPL_ADD(CV_IMPL_IPP); 837 return; 838 } 839 setIppErrorStatus(); 840 } 841 #endif 842 (vBinOp32<float, OpAdd<float>, IF_SIMD(VAdd<float>)>(src1, 
step1, src2, step2, dst, step, sz)); 843 } 844 845 static void add64f( const double* src1, size_t step1, 846 const double* src2, size_t step2, 847 double* dst, size_t step, Size sz, void* ) 848 { 849 vBinOp64<double, OpAdd<double>, IF_SIMD(VAdd<double>)>(src1, step1, src2, step2, dst, step, sz); 850 } 851 852 static void sub8u( const uchar* src1, size_t step1, 853 const uchar* src2, size_t step2, 854 uchar* dst, size_t step, Size sz, void* ) 855 { 856 #if (ARITHM_USE_IPP == 1) 857 CV_IPP_CHECK() 858 { 859 fixSteps(sz, sizeof(dst[0]), step1, step2, step); 860 if (0 <= ippiSub_8u_C1RSfs(src2, (int)step2, src1, (int)step1, dst, (int)step, ippiSize(sz), 0)) 861 { 862 CV_IMPL_ADD(CV_IMPL_IPP); 863 return; 864 } 865 setIppErrorStatus(); 866 } 867 #endif 868 (vBinOp<uchar, OpSub<uchar>, IF_SIMD(VSub<uchar>)>(src1, step1, src2, step2, dst, step, sz)); 869 } 870 871 static void sub8s( const schar* src1, size_t step1, 872 const schar* src2, size_t step2, 873 schar* dst, size_t step, Size sz, void* ) 874 { 875 vBinOp<schar, OpSub<schar>, IF_SIMD(VSub<schar>)>(src1, step1, src2, step2, dst, step, sz); 876 } 877 878 static void sub16u( const ushort* src1, size_t step1, 879 const ushort* src2, size_t step2, 880 ushort* dst, size_t step, Size sz, void* ) 881 { 882 #if (ARITHM_USE_IPP == 1) 883 CV_IPP_CHECK() 884 { 885 fixSteps(sz, sizeof(dst[0]), step1, step2, step); 886 if (0 <= ippiSub_16u_C1RSfs(src2, (int)step2, src1, (int)step1, dst, (int)step, ippiSize(sz), 0)) 887 { 888 CV_IMPL_ADD(CV_IMPL_IPP); 889 return; 890 } 891 setIppErrorStatus(); 892 } 893 #endif 894 (vBinOp<ushort, OpSub<ushort>, IF_SIMD(VSub<ushort>)>(src1, step1, src2, step2, dst, step, sz)); 895 } 896 897 static void sub16s( const short* src1, size_t step1, 898 const short* src2, size_t step2, 899 short* dst, size_t step, Size sz, void* ) 900 { 901 #if (ARITHM_USE_IPP == 1) 902 CV_IPP_CHECK() 903 { 904 fixSteps(sz, sizeof(dst[0]), step1, step2, step); 905 if (0 <= ippiSub_16s_C1RSfs(src2, (int)step2, src1, (int)step1, dst, (int)step, ippiSize(sz), 0)) 906 { 907 CV_IMPL_ADD(CV_IMPL_IPP); 908 return; 909 } 910 setIppErrorStatus(); 911 } 912 #endif 913 (vBinOp<short, OpSub<short>, IF_SIMD(VSub<short>)>(src1, step1, src2, step2, dst, step, sz)); 914 } 915 916 static void sub32s( const int* src1, size_t step1, 917 const int* src2, size_t step2, 918 int* dst, size_t step, Size sz, void* ) 919 { 920 vBinOp32<int, OpSub<int>, IF_SIMD(VSub<int>)>(src1, step1, src2, step2, dst, step, sz); 921 } 922 923 static void sub32f( const float* src1, size_t step1, 924 const float* src2, size_t step2, 925 float* dst, size_t step, Size sz, void* ) 926 { 927 #if (ARITHM_USE_IPP == 1) 928 CV_IPP_CHECK() 929 { 930 fixSteps(sz, sizeof(dst[0]), step1, step2, step); 931 if (0 <= ippiSub_32f_C1R(src2, (int)step2, src1, (int)step1, dst, (int)step, ippiSize(sz))) 932 { 933 CV_IMPL_ADD(CV_IMPL_IPP); 934 return; 935 } 936 setIppErrorStatus(); 937 } 938 #endif 939 (vBinOp32<float, OpSub<float>, IF_SIMD(VSub<float>)>(src1, step1, src2, step2, dst, step, sz)); 940 } 941 942 static void sub64f( const double* src1, size_t step1, 943 const double* src2, size_t step2, 944 double* dst, size_t step, Size sz, void* ) 945 { 946 vBinOp64<double, OpSub<double>, IF_SIMD(VSub<double>)>(src1, step1, src2, step2, dst, step, sz); 947 } 948 949 template<> inline uchar OpMin<uchar>::operator ()(uchar a, uchar b) const { return CV_MIN_8U(a, b); } 950 template<> inline uchar OpMax<uchar>::operator ()(uchar a, uchar b) const { return CV_MAX_8U(a, b); } 951 952 static void max8u( const 
uchar* src1, size_t step1, 953 const uchar* src2, size_t step2, 954 uchar* dst, size_t step, Size sz, void* ) 955 { 956 #if (ARITHM_USE_IPP == 1) 957 CV_IPP_CHECK() 958 { 959 uchar* s1 = (uchar*)src1; 960 uchar* s2 = (uchar*)src2; 961 uchar* d = dst; 962 fixSteps(sz, sizeof(dst[0]), step1, step2, step); 963 int i = 0; 964 for(; i < sz.height; i++) 965 { 966 if (0 > ippsMaxEvery_8u(s1, s2, d, sz.width)) 967 break; 968 s1 += step1; 969 s2 += step2; 970 d += step; 971 } 972 if (i == sz.height) 973 { 974 CV_IMPL_ADD(CV_IMPL_IPP); 975 return; 976 } 977 setIppErrorStatus(); 978 } 979 #endif 980 vBinOp<uchar, OpMax<uchar>, IF_SIMD(VMax<uchar>)>(src1, step1, src2, step2, dst, step, sz); 981 } 982 983 static void max8s( const schar* src1, size_t step1, 984 const schar* src2, size_t step2, 985 schar* dst, size_t step, Size sz, void* ) 986 { 987 vBinOp<schar, OpMax<schar>, IF_SIMD(VMax<schar>)>(src1, step1, src2, step2, dst, step, sz); 988 } 989 990 static void max16u( const ushort* src1, size_t step1, 991 const ushort* src2, size_t step2, 992 ushort* dst, size_t step, Size sz, void* ) 993 { 994 #if (ARITHM_USE_IPP == 1) 995 CV_IPP_CHECK() 996 { 997 ushort* s1 = (ushort*)src1; 998 ushort* s2 = (ushort*)src2; 999 ushort* d = dst; 1000 fixSteps(sz, sizeof(dst[0]), step1, step2, step); 1001 int i = 0; 1002 for(; i < sz.height; i++) 1003 { 1004 if (0 > ippsMaxEvery_16u(s1, s2, d, sz.width)) 1005 break; 1006 s1 = (ushort*)((uchar*)s1 + step1); 1007 s2 = (ushort*)((uchar*)s2 + step2); 1008 d = (ushort*)((uchar*)d + step); 1009 } 1010 if (i == sz.height) 1011 { 1012 CV_IMPL_ADD(CV_IMPL_IPP); 1013 return; 1014 } 1015 setIppErrorStatus(); 1016 } 1017 #endif 1018 vBinOp<ushort, OpMax<ushort>, IF_SIMD(VMax<ushort>)>(src1, step1, src2, step2, dst, step, sz); 1019 } 1020 1021 static void max16s( const short* src1, size_t step1, 1022 const short* src2, size_t step2, 1023 short* dst, size_t step, Size sz, void* ) 1024 { 1025 vBinOp<short, OpMax<short>, IF_SIMD(VMax<short>)>(src1, step1, src2, step2, dst, step, sz); 1026 } 1027 1028 static void max32s( const int* src1, size_t step1, 1029 const int* src2, size_t step2, 1030 int* dst, size_t step, Size sz, void* ) 1031 { 1032 vBinOp32<int, OpMax<int>, IF_SIMD(VMax<int>)>(src1, step1, src2, step2, dst, step, sz); 1033 } 1034 1035 static void max32f( const float* src1, size_t step1, 1036 const float* src2, size_t step2, 1037 float* dst, size_t step, Size sz, void* ) 1038 { 1039 #if (ARITHM_USE_IPP == 1) 1040 CV_IPP_CHECK() 1041 { 1042 float* s1 = (float*)src1; 1043 float* s2 = (float*)src2; 1044 float* d = dst; 1045 fixSteps(sz, sizeof(dst[0]), step1, step2, step); 1046 int i = 0; 1047 for(; i < sz.height; i++) 1048 { 1049 if (0 > ippsMaxEvery_32f(s1, s2, d, sz.width)) 1050 break; 1051 s1 = (float*)((uchar*)s1 + step1); 1052 s2 = (float*)((uchar*)s2 + step2); 1053 d = (float*)((uchar*)d + step); 1054 } 1055 if (i == sz.height) 1056 { 1057 CV_IMPL_ADD(CV_IMPL_IPP); 1058 return; 1059 } 1060 setIppErrorStatus(); 1061 } 1062 #endif 1063 vBinOp32<float, OpMax<float>, IF_SIMD(VMax<float>)>(src1, step1, src2, step2, dst, step, sz); 1064 } 1065 1066 static void max64f( const double* src1, size_t step1, 1067 const double* src2, size_t step2, 1068 double* dst, size_t step, Size sz, void* ) 1069 { 1070 #if ARITHM_USE_IPP == 1 1071 CV_IPP_CHECK() 1072 { 1073 double* s1 = (double*)src1; 1074 double* s2 = (double*)src2; 1075 double* d = dst; 1076 fixSteps(sz, sizeof(dst[0]), step1, step2, step); 1077 int i = 0; 1078 for(; i < sz.height; i++) 1079 { 1080 if (0 > ippsMaxEvery_64f(s1, 
s2, d, sz.width)) 1081 break; 1082 s1 = (double*)((uchar*)s1 + step1); 1083 s2 = (double*)((uchar*)s2 + step2); 1084 d = (double*)((uchar*)d + step); 1085 } 1086 if (i == sz.height) 1087 { 1088 CV_IMPL_ADD(CV_IMPL_IPP); 1089 return; 1090 } 1091 setIppErrorStatus(); 1092 } 1093 #endif 1094 vBinOp64<double, OpMax<double>, IF_SIMD(VMax<double>)>(src1, step1, src2, step2, dst, step, sz); 1095 } 1096 1097 static void min8u( const uchar* src1, size_t step1, 1098 const uchar* src2, size_t step2, 1099 uchar* dst, size_t step, Size sz, void* ) 1100 { 1101 #if (ARITHM_USE_IPP == 1) 1102 CV_IPP_CHECK() 1103 { 1104 uchar* s1 = (uchar*)src1; 1105 uchar* s2 = (uchar*)src2; 1106 uchar* d = dst; 1107 fixSteps(sz, sizeof(dst[0]), step1, step2, step); 1108 int i = 0; 1109 for(; i < sz.height; i++) 1110 { 1111 if (0 > ippsMinEvery_8u(s1, s2, d, sz.width)) 1112 break; 1113 s1 += step1; 1114 s2 += step2; 1115 d += step; 1116 } 1117 if (i == sz.height) 1118 { 1119 CV_IMPL_ADD(CV_IMPL_IPP); 1120 return; 1121 } 1122 setIppErrorStatus(); 1123 } 1124 #endif 1125 vBinOp<uchar, OpMin<uchar>, IF_SIMD(VMin<uchar>)>(src1, step1, src2, step2, dst, step, sz); 1126 } 1127 1128 static void min8s( const schar* src1, size_t step1, 1129 const schar* src2, size_t step2, 1130 schar* dst, size_t step, Size sz, void* ) 1131 { 1132 vBinOp<schar, OpMin<schar>, IF_SIMD(VMin<schar>)>(src1, step1, src2, step2, dst, step, sz); 1133 } 1134 1135 static void min16u( const ushort* src1, size_t step1, 1136 const ushort* src2, size_t step2, 1137 ushort* dst, size_t step, Size sz, void* ) 1138 { 1139 #if (ARITHM_USE_IPP == 1) 1140 CV_IPP_CHECK() 1141 { 1142 ushort* s1 = (ushort*)src1; 1143 ushort* s2 = (ushort*)src2; 1144 ushort* d = dst; 1145 fixSteps(sz, sizeof(dst[0]), step1, step2, step); 1146 int i = 0; 1147 for(; i < sz.height; i++) 1148 { 1149 if (0 > ippsMinEvery_16u(s1, s2, d, sz.width)) 1150 break; 1151 s1 = (ushort*)((uchar*)s1 + step1); 1152 s2 = (ushort*)((uchar*)s2 + step2); 1153 d = (ushort*)((uchar*)d + step); 1154 } 1155 if (i == sz.height) 1156 { 1157 CV_IMPL_ADD(CV_IMPL_IPP); 1158 return; 1159 } 1160 setIppErrorStatus(); 1161 } 1162 #endif 1163 vBinOp<ushort, OpMin<ushort>, IF_SIMD(VMin<ushort>)>(src1, step1, src2, step2, dst, step, sz); 1164 } 1165 1166 static void min16s( const short* src1, size_t step1, 1167 const short* src2, size_t step2, 1168 short* dst, size_t step, Size sz, void* ) 1169 { 1170 vBinOp<short, OpMin<short>, IF_SIMD(VMin<short>)>(src1, step1, src2, step2, dst, step, sz); 1171 } 1172 1173 static void min32s( const int* src1, size_t step1, 1174 const int* src2, size_t step2, 1175 int* dst, size_t step, Size sz, void* ) 1176 { 1177 vBinOp32<int, OpMin<int>, IF_SIMD(VMin<int>)>(src1, step1, src2, step2, dst, step, sz); 1178 } 1179 1180 static void min32f( const float* src1, size_t step1, 1181 const float* src2, size_t step2, 1182 float* dst, size_t step, Size sz, void* ) 1183 { 1184 #if (ARITHM_USE_IPP == 1) 1185 CV_IPP_CHECK() 1186 { 1187 float* s1 = (float*)src1; 1188 float* s2 = (float*)src2; 1189 float* d = dst; 1190 fixSteps(sz, sizeof(dst[0]), step1, step2, step); 1191 int i = 0; 1192 for(; i < sz.height; i++) 1193 { 1194 if (0 > ippsMinEvery_32f(s1, s2, d, sz.width)) 1195 break; 1196 s1 = (float*)((uchar*)s1 + step1); 1197 s2 = (float*)((uchar*)s2 + step2); 1198 d = (float*)((uchar*)d + step); 1199 } 1200 if (i == sz.height) 1201 { 1202 CV_IMPL_ADD(CV_IMPL_IPP); 1203 return; 1204 } 1205 setIppErrorStatus(); 1206 } 1207 #endif 1208 vBinOp32<float, OpMin<float>, IF_SIMD(VMin<float>)>(src1, step1, src2, 
step2, dst, step, sz); 1209 } 1210 1211 static void min64f( const double* src1, size_t step1, 1212 const double* src2, size_t step2, 1213 double* dst, size_t step, Size sz, void* ) 1214 { 1215 #if ARITHM_USE_IPP == 1 1216 CV_IPP_CHECK() 1217 { 1218 double* s1 = (double*)src1; 1219 double* s2 = (double*)src2; 1220 double* d = dst; 1221 fixSteps(sz, sizeof(dst[0]), step1, step2, step); 1222 int i = 0; 1223 for(; i < sz.height; i++) 1224 { 1225 if (0 > ippsMinEvery_64f(s1, s2, d, sz.width)) 1226 break; 1227 s1 = (double*)((uchar*)s1 + step1); 1228 s2 = (double*)((uchar*)s2 + step2); 1229 d = (double*)((uchar*)d + step); 1230 } 1231 if (i == sz.height) 1232 { 1233 CV_IMPL_ADD(CV_IMPL_IPP); 1234 return; 1235 } 1236 setIppErrorStatus(); 1237 } 1238 #endif 1239 vBinOp64<double, OpMin<double>, IF_SIMD(VMin<double>)>(src1, step1, src2, step2, dst, step, sz); 1240 } 1241 1242 static void absdiff8u( const uchar* src1, size_t step1, 1243 const uchar* src2, size_t step2, 1244 uchar* dst, size_t step, Size sz, void* ) 1245 { 1246 #if (ARITHM_USE_IPP == 1) 1247 CV_IPP_CHECK() 1248 { 1249 fixSteps(sz, sizeof(dst[0]), step1, step2, step); 1250 if (0 <= ippiAbsDiff_8u_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz))) 1251 { 1252 CV_IMPL_ADD(CV_IMPL_IPP); 1253 return; 1254 } 1255 setIppErrorStatus(); 1256 } 1257 #endif 1258 (vBinOp<uchar, OpAbsDiff<uchar>, IF_SIMD(VAbsDiff<uchar>)>(src1, step1, src2, step2, dst, step, sz)); 1259 } 1260 1261 static void absdiff8s( const schar* src1, size_t step1, 1262 const schar* src2, size_t step2, 1263 schar* dst, size_t step, Size sz, void* ) 1264 { 1265 vBinOp<schar, OpAbsDiff<schar>, IF_SIMD(VAbsDiff<schar>)>(src1, step1, src2, step2, dst, step, sz); 1266 } 1267 1268 static void absdiff16u( const ushort* src1, size_t step1, 1269 const ushort* src2, size_t step2, 1270 ushort* dst, size_t step, Size sz, void* ) 1271 { 1272 #if (ARITHM_USE_IPP == 1) 1273 CV_IPP_CHECK() 1274 { 1275 fixSteps(sz, sizeof(dst[0]), step1, step2, step); 1276 if (0 <= ippiAbsDiff_16u_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz))) 1277 { 1278 CV_IMPL_ADD(CV_IMPL_IPP); 1279 return; 1280 } 1281 setIppErrorStatus(); 1282 } 1283 #endif 1284 (vBinOp<ushort, OpAbsDiff<ushort>, IF_SIMD(VAbsDiff<ushort>)>(src1, step1, src2, step2, dst, step, sz)); 1285 } 1286 1287 static void absdiff16s( const short* src1, size_t step1, 1288 const short* src2, size_t step2, 1289 short* dst, size_t step, Size sz, void* ) 1290 { 1291 vBinOp<short, OpAbsDiff<short>, IF_SIMD(VAbsDiff<short>)>(src1, step1, src2, step2, dst, step, sz); 1292 } 1293 1294 static void absdiff32s( const int* src1, size_t step1, 1295 const int* src2, size_t step2, 1296 int* dst, size_t step, Size sz, void* ) 1297 { 1298 vBinOp32<int, OpAbsDiff<int>, IF_SIMD(VAbsDiff<int>)>(src1, step1, src2, step2, dst, step, sz); 1299 } 1300 1301 static void absdiff32f( const float* src1, size_t step1, 1302 const float* src2, size_t step2, 1303 float* dst, size_t step, Size sz, void* ) 1304 { 1305 #if (ARITHM_USE_IPP == 1) 1306 CV_IPP_CHECK() 1307 { 1308 fixSteps(sz, sizeof(dst[0]), step1, step2, step); 1309 if (0 <= ippiAbsDiff_32f_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz))) 1310 { 1311 CV_IMPL_ADD(CV_IMPL_IPP); 1312 return; 1313 } 1314 setIppErrorStatus(); 1315 } 1316 #endif 1317 (vBinOp32<float, OpAbsDiff<float>, IF_SIMD(VAbsDiff<float>)>(src1, step1, src2, step2, dst, step, sz)); 1318 } 1319 1320 static void absdiff64f( const double* src1, size_t step1, 1321 const double* src2, size_t step2, 
1322 double* dst, size_t step, Size sz, void* ) 1323 { 1324 vBinOp64<double, OpAbsDiff<double>, IF_SIMD(VAbsDiff<double>)>(src1, step1, src2, step2, dst, step, sz); 1325 } 1326 1327 1328 static void and8u( const uchar* src1, size_t step1, 1329 const uchar* src2, size_t step2, 1330 uchar* dst, size_t step, Size sz, void* ) 1331 { 1332 #if (ARITHM_USE_IPP == 1) 1333 CV_IPP_CHECK() 1334 { 1335 fixSteps(sz, sizeof(dst[0]), step1, step2, step); 1336 if (0 <= ippiAnd_8u_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz))) 1337 { 1338 CV_IMPL_ADD(CV_IMPL_IPP); 1339 return; 1340 } 1341 setIppErrorStatus(); 1342 } 1343 #endif 1344 (vBinOp<uchar, OpAnd<uchar>, IF_SIMD(VAnd<uchar>)>(src1, step1, src2, step2, dst, step, sz)); 1345 } 1346 1347 static void or8u( const uchar* src1, size_t step1, 1348 const uchar* src2, size_t step2, 1349 uchar* dst, size_t step, Size sz, void* ) 1350 { 1351 #if (ARITHM_USE_IPP == 1) 1352 CV_IPP_CHECK() 1353 { 1354 fixSteps(sz, sizeof(dst[0]), step1, step2, step); 1355 if (0 <= ippiOr_8u_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz))) 1356 { 1357 CV_IMPL_ADD(CV_IMPL_IPP); 1358 return; 1359 } 1360 setIppErrorStatus(); 1361 } 1362 #endif 1363 (vBinOp<uchar, OpOr<uchar>, IF_SIMD(VOr<uchar>)>(src1, step1, src2, step2, dst, step, sz)); 1364 } 1365 1366 static void xor8u( const uchar* src1, size_t step1, 1367 const uchar* src2, size_t step2, 1368 uchar* dst, size_t step, Size sz, void* ) 1369 { 1370 #if (ARITHM_USE_IPP == 1) 1371 CV_IPP_CHECK() 1372 { 1373 fixSteps(sz, sizeof(dst[0]), step1, step2, step); 1374 if (0 <= ippiXor_8u_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz))) 1375 { 1376 CV_IMPL_ADD(CV_IMPL_IPP); 1377 return; 1378 } 1379 setIppErrorStatus(); 1380 } 1381 #endif 1382 (vBinOp<uchar, OpXor<uchar>, IF_SIMD(VXor<uchar>)>(src1, step1, src2, step2, dst, step, sz)); 1383 } 1384 1385 static void not8u( const uchar* src1, size_t step1, 1386 const uchar* src2, size_t step2, 1387 uchar* dst, size_t step, Size sz, void* ) 1388 { 1389 #if (ARITHM_USE_IPP == 1) 1390 CV_IPP_CHECK() 1391 { 1392 fixSteps(sz, sizeof(dst[0]), step1, step2, step); (void)src2; 1393 if (0 <= ippiNot_8u_C1R(src1, (int)step1, dst, (int)step, ippiSize(sz))) 1394 { 1395 CV_IMPL_ADD(CV_IMPL_IPP); 1396 return; 1397 } 1398 setIppErrorStatus(); 1399 } 1400 #endif 1401 (vBinOp<uchar, OpNot<uchar>, IF_SIMD(VNot<uchar>)>(src1, step1, src2, step2, dst, step, sz)); 1402 } 1403 1404 /****************************************************************************************\ 1405 * logical operations * 1406 \****************************************************************************************/ 1407 1408 void convertAndUnrollScalar( const Mat& sc, int buftype, uchar* scbuf, size_t blocksize ) 1409 { 1410 int scn = (int)sc.total(), cn = CV_MAT_CN(buftype); 1411 size_t esz = CV_ELEM_SIZE(buftype); 1412 getConvertFunc(sc.depth(), buftype)(sc.ptr(), 1, 0, 1, scbuf, 1, Size(std::min(cn, scn), 1), 0); 1413 // unroll the scalar 1414 if( scn < cn ) 1415 { 1416 CV_Assert( scn == 1 ); 1417 size_t esz1 = CV_ELEM_SIZE1(buftype); 1418 for( size_t i = esz1; i < esz; i++ ) 1419 scbuf[i] = scbuf[i - esz1]; 1420 } 1421 for( size_t i = esz; i < blocksize*esz; i++ ) 1422 scbuf[i] = scbuf[i - esz]; 1423 } 1424 1425 1426 enum { OCL_OP_ADD=0, OCL_OP_SUB=1, OCL_OP_RSUB=2, OCL_OP_ABSDIFF=3, OCL_OP_MUL=4, 1427 OCL_OP_MUL_SCALE=5, OCL_OP_DIV_SCALE=6, OCL_OP_RECIP_SCALE=7, OCL_OP_ADDW=8, 1428 OCL_OP_AND=9, OCL_OP_OR=10, OCL_OP_XOR=11, OCL_OP_NOT=12, OCL_OP_MIN=13, OCL_OP_MAX=14, 
1429 OCL_OP_RDIV_SCALE=15 }; 1430 1431 #ifdef HAVE_OPENCL 1432 1433 static const char* oclop2str[] = { "OP_ADD", "OP_SUB", "OP_RSUB", "OP_ABSDIFF", 1434 "OP_MUL", "OP_MUL_SCALE", "OP_DIV_SCALE", "OP_RECIP_SCALE", 1435 "OP_ADDW", "OP_AND", "OP_OR", "OP_XOR", "OP_NOT", "OP_MIN", "OP_MAX", "OP_RDIV_SCALE", 0 }; 1436 1437 static bool ocl_binary_op(InputArray _src1, InputArray _src2, OutputArray _dst, 1438 InputArray _mask, bool bitwise, int oclop, bool haveScalar ) 1439 { 1440 bool haveMask = !_mask.empty(); 1441 int srctype = _src1.type(); 1442 int srcdepth = CV_MAT_DEPTH(srctype); 1443 int cn = CV_MAT_CN(srctype); 1444 1445 const ocl::Device d = ocl::Device::getDefault(); 1446 bool doubleSupport = d.doubleFPConfig() > 0; 1447 if( oclop < 0 || ((haveMask || haveScalar) && cn > 4) || 1448 (!doubleSupport && srcdepth == CV_64F && !bitwise)) 1449 return false; 1450 1451 char opts[1024]; 1452 int kercn = haveMask || haveScalar ? cn : ocl::predictOptimalVectorWidth(_src1, _src2, _dst); 1453 int scalarcn = kercn == 3 ? 4 : kercn; 1454 int rowsPerWI = d.isIntel() ? 4 : 1; 1455 1456 sprintf(opts, "-D %s%s -D %s -D dstT=%s%s -D dstT_C1=%s -D workST=%s -D cn=%d -D rowsPerWI=%d", 1457 haveMask ? "MASK_" : "", haveScalar ? "UNARY_OP" : "BINARY_OP", oclop2str[oclop], 1458 bitwise ? ocl::memopTypeToStr(CV_MAKETYPE(srcdepth, kercn)) : 1459 ocl::typeToStr(CV_MAKETYPE(srcdepth, kercn)), doubleSupport ? " -D DOUBLE_SUPPORT" : "", 1460 bitwise ? ocl::memopTypeToStr(CV_MAKETYPE(srcdepth, 1)) : 1461 ocl::typeToStr(CV_MAKETYPE(srcdepth, 1)), 1462 bitwise ? ocl::memopTypeToStr(CV_MAKETYPE(srcdepth, scalarcn)) : 1463 ocl::typeToStr(CV_MAKETYPE(srcdepth, scalarcn)), 1464 kercn, rowsPerWI); 1465 1466 ocl::Kernel k("KF", ocl::core::arithm_oclsrc, opts); 1467 if (k.empty()) 1468 return false; 1469 1470 UMat src1 = _src1.getUMat(), src2; 1471 UMat dst = _dst.getUMat(), mask = _mask.getUMat(); 1472 1473 ocl::KernelArg src1arg = ocl::KernelArg::ReadOnlyNoSize(src1, cn, kercn); 1474 ocl::KernelArg dstarg = haveMask ? 
ocl::KernelArg::ReadWrite(dst, cn, kercn) : 1475 ocl::KernelArg::WriteOnly(dst, cn, kercn); 1476 ocl::KernelArg maskarg = ocl::KernelArg::ReadOnlyNoSize(mask, 1); 1477 1478 if( haveScalar ) 1479 { 1480 size_t esz = CV_ELEM_SIZE1(srctype)*scalarcn; 1481 double buf[4] = {0,0,0,0}; 1482 1483 if( oclop != OCL_OP_NOT ) 1484 { 1485 Mat src2sc = _src2.getMat(); 1486 convertAndUnrollScalar(src2sc, srctype, (uchar*)buf, 1); 1487 } 1488 1489 ocl::KernelArg scalararg = ocl::KernelArg(0, 0, 0, 0, buf, esz); 1490 1491 if( !haveMask ) 1492 k.args(src1arg, dstarg, scalararg); 1493 else 1494 k.args(src1arg, maskarg, dstarg, scalararg); 1495 } 1496 else 1497 { 1498 src2 = _src2.getUMat(); 1499 ocl::KernelArg src2arg = ocl::KernelArg::ReadOnlyNoSize(src2, cn, kercn); 1500 1501 if( !haveMask ) 1502 k.args(src1arg, src2arg, dstarg); 1503 else 1504 k.args(src1arg, src2arg, maskarg, dstarg); 1505 } 1506 1507 size_t globalsize[] = { src1.cols * cn / kercn, (src1.rows + rowsPerWI - 1) / rowsPerWI }; 1508 return k.run(2, globalsize, 0, false); 1509 } 1510 1511 #endif 1512 1513 static void binary_op( InputArray _src1, InputArray _src2, OutputArray _dst, 1514 InputArray _mask, const BinaryFunc* tab, 1515 bool bitwise, int oclop ) 1516 { 1517 const _InputArray *psrc1 = &_src1, *psrc2 = &_src2; 1518 int kind1 = psrc1->kind(), kind2 = psrc2->kind(); 1519 int type1 = psrc1->type(), depth1 = CV_MAT_DEPTH(type1), cn = CV_MAT_CN(type1); 1520 int type2 = psrc2->type(), depth2 = CV_MAT_DEPTH(type2), cn2 = CV_MAT_CN(type2); 1521 int dims1 = psrc1->dims(), dims2 = psrc2->dims(); 1522 Size sz1 = dims1 <= 2 ? psrc1->size() : Size(); 1523 Size sz2 = dims2 <= 2 ? psrc2->size() : Size(); 1524 #ifdef HAVE_OPENCL 1525 bool use_opencl = (kind1 == _InputArray::UMAT || kind2 == _InputArray::UMAT) && 1526 dims1 <= 2 && dims2 <= 2; 1527 #endif 1528 bool haveMask = !_mask.empty(), haveScalar = false; 1529 BinaryFunc func; 1530 1531 if( dims1 <= 2 && dims2 <= 2 && kind1 == kind2 && sz1 == sz2 && type1 == type2 && !haveMask ) 1532 { 1533 _dst.create(sz1, type1); 1534 CV_OCL_RUN(use_opencl, 1535 ocl_binary_op(*psrc1, *psrc2, _dst, _mask, bitwise, oclop, false)) 1536 1537 if( bitwise ) 1538 { 1539 func = *tab; 1540 cn = (int)CV_ELEM_SIZE(type1); 1541 } 1542 else 1543 func = tab[depth1]; 1544 1545 Mat src1 = psrc1->getMat(), src2 = psrc2->getMat(), dst = _dst.getMat(); 1546 Size sz = getContinuousSize(src1, src2, dst); 1547 size_t len = sz.width*(size_t)cn; 1548 if( len == (size_t)(int)len ) 1549 { 1550 sz.width = (int)len; 1551 func(src1.ptr(), src1.step, src2.ptr(), src2.step, dst.ptr(), dst.step, sz, 0); 1552 return; 1553 } 1554 } 1555 1556 if( oclop == OCL_OP_NOT ) 1557 haveScalar = true; 1558 else if( (kind1 == _InputArray::MATX) + (kind2 == _InputArray::MATX) == 1 || 1559 !psrc1->sameSize(*psrc2) || type1 != type2 ) 1560 { 1561 if( checkScalar(*psrc1, type2, kind1, kind2) ) 1562 { 1563 // src1 is a scalar; swap it with src2 1564 swap(psrc1, psrc2); 1565 swap(type1, type2); 1566 swap(depth1, depth2); 1567 swap(cn, cn2); 1568 swap(sz1, sz2); 1569 } 1570 else if( !checkScalar(*psrc2, type1, kind2, kind1) ) 1571 CV_Error( CV_StsUnmatchedSizes, 1572 "The operation is neither 'array op array' (where arrays have the same size and type), " 1573 "nor 'array op scalar', nor 'scalar op array'" ); 1574 haveScalar = true; 1575 } 1576 else 1577 { 1578 CV_Assert( psrc1->sameSize(*psrc2) && type1 == type2 ); 1579 } 1580 1581 size_t esz = CV_ELEM_SIZE(type1); 1582 size_t blocksize0 = (BLOCK_SIZE + esz-1)/esz; 1583 BinaryFunc copymask = 0; 1584 bool 
reallocate = false; 1585 1586 if( haveMask ) 1587 { 1588 int mtype = _mask.type(); 1589 CV_Assert( (mtype == CV_8U || mtype == CV_8S) && _mask.sameSize(*psrc1)); 1590 copymask = getCopyMaskFunc(esz); 1591 reallocate = !_dst.sameSize(*psrc1) || _dst.type() != type1; 1592 } 1593 1594 AutoBuffer<uchar> _buf; 1595 uchar *scbuf = 0, *maskbuf = 0; 1596 1597 _dst.createSameSize(*psrc1, type1); 1598 // if this is mask operation and dst has been reallocated, 1599 // we have to clear the destination 1600 if( haveMask && reallocate ) 1601 _dst.setTo(0.); 1602 1603 CV_OCL_RUN(use_opencl, 1604 ocl_binary_op(*psrc1, *psrc2, _dst, _mask, bitwise, oclop, haveScalar)) 1605 1606 1607 Mat src1 = psrc1->getMat(), src2 = psrc2->getMat(); 1608 Mat dst = _dst.getMat(), mask = _mask.getMat(); 1609 1610 if( bitwise ) 1611 { 1612 func = *tab; 1613 cn = (int)esz; 1614 } 1615 else 1616 func = tab[depth1]; 1617 1618 if( !haveScalar ) 1619 { 1620 const Mat* arrays[] = { &src1, &src2, &dst, &mask, 0 }; 1621 uchar* ptrs[4]; 1622 1623 NAryMatIterator it(arrays, ptrs); 1624 size_t total = it.size, blocksize = total; 1625 1626 if( blocksize*cn > INT_MAX ) 1627 blocksize = INT_MAX/cn; 1628 1629 if( haveMask ) 1630 { 1631 blocksize = std::min(blocksize, blocksize0); 1632 _buf.allocate(blocksize*esz); 1633 maskbuf = _buf; 1634 } 1635 1636 for( size_t i = 0; i < it.nplanes; i++, ++it ) 1637 { 1638 for( size_t j = 0; j < total; j += blocksize ) 1639 { 1640 int bsz = (int)MIN(total - j, blocksize); 1641 1642 func( ptrs[0], 0, ptrs[1], 0, haveMask ? maskbuf : ptrs[2], 0, Size(bsz*cn, 1), 0 ); 1643 if( haveMask ) 1644 { 1645 copymask( maskbuf, 0, ptrs[3], 0, ptrs[2], 0, Size(bsz, 1), &esz ); 1646 ptrs[3] += bsz; 1647 } 1648 1649 bsz *= (int)esz; 1650 ptrs[0] += bsz; ptrs[1] += bsz; ptrs[2] += bsz; 1651 } 1652 } 1653 } 1654 else 1655 { 1656 const Mat* arrays[] = { &src1, &dst, &mask, 0 }; 1657 uchar* ptrs[3]; 1658 1659 NAryMatIterator it(arrays, ptrs); 1660 size_t total = it.size, blocksize = std::min(total, blocksize0); 1661 1662 _buf.allocate(blocksize*(haveMask ? 2 : 1)*esz + 32); 1663 scbuf = _buf; 1664 maskbuf = alignPtr(scbuf + blocksize*esz, 16); 1665 1666 convertAndUnrollScalar( src2, src1.type(), scbuf, blocksize); 1667 1668 for( size_t i = 0; i < it.nplanes; i++, ++it ) 1669 { 1670 for( size_t j = 0; j < total; j += blocksize ) 1671 { 1672 int bsz = (int)MIN(total - j, blocksize); 1673 1674 func( ptrs[0], 0, scbuf, 0, haveMask ? 
maskbuf : ptrs[1], 0, Size(bsz*cn, 1), 0 ); 1675 if( haveMask ) 1676 { 1677 copymask( maskbuf, 0, ptrs[2], 0, ptrs[1], 0, Size(bsz, 1), &esz ); 1678 ptrs[2] += bsz; 1679 } 1680 1681 bsz *= (int)esz; 1682 ptrs[0] += bsz; ptrs[1] += bsz; 1683 } 1684 } 1685 } 1686 } 1687 1688 static BinaryFunc* getMaxTab() 1689 { 1690 static BinaryFunc maxTab[] = 1691 { 1692 (BinaryFunc)GET_OPTIMIZED(max8u), (BinaryFunc)GET_OPTIMIZED(max8s), 1693 (BinaryFunc)GET_OPTIMIZED(max16u), (BinaryFunc)GET_OPTIMIZED(max16s), 1694 (BinaryFunc)GET_OPTIMIZED(max32s), 1695 (BinaryFunc)GET_OPTIMIZED(max32f), (BinaryFunc)max64f, 1696 0 1697 }; 1698 1699 return maxTab; 1700 } 1701 1702 static BinaryFunc* getMinTab() 1703 { 1704 static BinaryFunc minTab[] = 1705 { 1706 (BinaryFunc)GET_OPTIMIZED(min8u), (BinaryFunc)GET_OPTIMIZED(min8s), 1707 (BinaryFunc)GET_OPTIMIZED(min16u), (BinaryFunc)GET_OPTIMIZED(min16s), 1708 (BinaryFunc)GET_OPTIMIZED(min32s), 1709 (BinaryFunc)GET_OPTIMIZED(min32f), (BinaryFunc)min64f, 1710 0 1711 }; 1712 1713 return minTab; 1714 } 1715 1716 } 1717 1718 void cv::bitwise_and(InputArray a, InputArray b, OutputArray c, InputArray mask) 1719 { 1720 BinaryFunc f = (BinaryFunc)GET_OPTIMIZED(and8u); 1721 binary_op(a, b, c, mask, &f, true, OCL_OP_AND); 1722 } 1723 1724 void cv::bitwise_or(InputArray a, InputArray b, OutputArray c, InputArray mask) 1725 { 1726 BinaryFunc f = (BinaryFunc)GET_OPTIMIZED(or8u); 1727 binary_op(a, b, c, mask, &f, true, OCL_OP_OR); 1728 } 1729 1730 void cv::bitwise_xor(InputArray a, InputArray b, OutputArray c, InputArray mask) 1731 { 1732 BinaryFunc f = (BinaryFunc)GET_OPTIMIZED(xor8u); 1733 binary_op(a, b, c, mask, &f, true, OCL_OP_XOR); 1734 } 1735 1736 void cv::bitwise_not(InputArray a, OutputArray c, InputArray mask) 1737 { 1738 BinaryFunc f = (BinaryFunc)GET_OPTIMIZED(not8u); 1739 binary_op(a, a, c, mask, &f, true, OCL_OP_NOT); 1740 } 1741 1742 void cv::max( InputArray src1, InputArray src2, OutputArray dst ) 1743 { 1744 binary_op(src1, src2, dst, noArray(), getMaxTab(), false, OCL_OP_MAX ); 1745 } 1746 1747 void cv::min( InputArray src1, InputArray src2, OutputArray dst ) 1748 { 1749 binary_op(src1, src2, dst, noArray(), getMinTab(), false, OCL_OP_MIN ); 1750 } 1751 1752 void cv::max(const Mat& src1, const Mat& src2, Mat& dst) 1753 { 1754 OutputArray _dst(dst); 1755 binary_op(src1, src2, _dst, noArray(), getMaxTab(), false, OCL_OP_MAX ); 1756 } 1757 1758 void cv::min(const Mat& src1, const Mat& src2, Mat& dst) 1759 { 1760 OutputArray _dst(dst); 1761 binary_op(src1, src2, _dst, noArray(), getMinTab(), false, OCL_OP_MIN ); 1762 } 1763 1764 void cv::max(const UMat& src1, const UMat& src2, UMat& dst) 1765 { 1766 OutputArray _dst(dst); 1767 binary_op(src1, src2, _dst, noArray(), getMaxTab(), false, OCL_OP_MAX ); 1768 } 1769 1770 void cv::min(const UMat& src1, const UMat& src2, UMat& dst) 1771 { 1772 OutputArray _dst(dst); 1773 binary_op(src1, src2, _dst, noArray(), getMinTab(), false, OCL_OP_MIN ); 1774 } 1775 1776 1777 /****************************************************************************************\ 1778 * add/subtract * 1779 \****************************************************************************************/ 1780 1781 namespace cv 1782 { 1783 1784 static int actualScalarDepth(const double* data, int len) 1785 { 1786 int i = 0, minval = INT_MAX, maxval = INT_MIN; 1787 for(; i < len; ++i) 1788 { 1789 int ival = cvRound(data[i]); 1790 if( ival != data[i] ) 1791 break; 1792 minval = MIN(minval, ival); 1793 maxval = MAX(maxval, ival); 1794 } 1795 return i < len ? 
CV_64F : 1796 minval >= 0 && maxval <= (int)UCHAR_MAX ? CV_8U : 1797 minval >= (int)SCHAR_MIN && maxval <= (int)SCHAR_MAX ? CV_8S : 1798 minval >= 0 && maxval <= (int)USHRT_MAX ? CV_16U : 1799 minval >= (int)SHRT_MIN && maxval <= (int)SHRT_MAX ? CV_16S : 1800 CV_32S; 1801 } 1802 1803 #ifdef HAVE_OPENCL 1804 1805 static bool ocl_arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst, 1806 InputArray _mask, int wtype, 1807 void* usrdata, int oclop, 1808 bool haveScalar ) 1809 { 1810 const ocl::Device d = ocl::Device::getDefault(); 1811 bool doubleSupport = d.doubleFPConfig() > 0; 1812 int type1 = _src1.type(), depth1 = CV_MAT_DEPTH(type1), cn = CV_MAT_CN(type1); 1813 bool haveMask = !_mask.empty(); 1814 1815 if ( (haveMask || haveScalar) && cn > 4 ) 1816 return false; 1817 1818 int dtype = _dst.type(), ddepth = CV_MAT_DEPTH(dtype), wdepth = std::max(CV_32S, CV_MAT_DEPTH(wtype)); 1819 if (!doubleSupport) 1820 wdepth = std::min(wdepth, CV_32F); 1821 1822 wtype = CV_MAKETYPE(wdepth, cn); 1823 int type2 = haveScalar ? wtype : _src2.type(), depth2 = CV_MAT_DEPTH(type2); 1824 if (!doubleSupport && (depth2 == CV_64F || depth1 == CV_64F)) 1825 return false; 1826 1827 int kercn = haveMask || haveScalar ? cn : ocl::predictOptimalVectorWidth(_src1, _src2, _dst); 1828 int scalarcn = kercn == 3 ? 4 : kercn, rowsPerWI = d.isIntel() ? 4 : 1; 1829 1830 char cvtstr[4][32], opts[1024]; 1831 sprintf(opts, "-D %s%s -D %s -D srcT1=%s -D srcT1_C1=%s -D srcT2=%s -D srcT2_C1=%s " 1832 "-D dstT=%s -D dstT_C1=%s -D workT=%s -D workST=%s -D scaleT=%s -D wdepth=%d -D convertToWT1=%s " 1833 "-D convertToWT2=%s -D convertToDT=%s%s -D cn=%d -D rowsPerWI=%d -D convertFromU=%s", 1834 (haveMask ? "MASK_" : ""), (haveScalar ? "UNARY_OP" : "BINARY_OP"), 1835 oclop2str[oclop], ocl::typeToStr(CV_MAKETYPE(depth1, kercn)), 1836 ocl::typeToStr(depth1), ocl::typeToStr(CV_MAKETYPE(depth2, kercn)), 1837 ocl::typeToStr(depth2), ocl::typeToStr(CV_MAKETYPE(ddepth, kercn)), 1838 ocl::typeToStr(ddepth), ocl::typeToStr(CV_MAKETYPE(wdepth, kercn)), 1839 ocl::typeToStr(CV_MAKETYPE(wdepth, scalarcn)), 1840 ocl::typeToStr(wdepth), wdepth, 1841 ocl::convertTypeStr(depth1, wdepth, kercn, cvtstr[0]), 1842 ocl::convertTypeStr(depth2, wdepth, kercn, cvtstr[1]), 1843 ocl::convertTypeStr(wdepth, ddepth, kercn, cvtstr[2]), 1844 doubleSupport ? " -D DOUBLE_SUPPORT" : "", kercn, rowsPerWI, 1845 oclop == OCL_OP_ABSDIFF && wdepth == CV_32S && ddepth == wdepth ? 1846 ocl::convertTypeStr(CV_8U, ddepth, kercn, cvtstr[3]) : "noconvert"); 1847 1848 size_t usrdata_esz = CV_ELEM_SIZE(wdepth); 1849 const uchar* usrdata_p = (const uchar*)usrdata; 1850 const double* usrdata_d = (const double*)usrdata; 1851 float usrdata_f[3]; 1852 int i, n = oclop == OCL_OP_MUL_SCALE || oclop == OCL_OP_DIV_SCALE || 1853 oclop == OCL_OP_RDIV_SCALE || oclop == OCL_OP_RECIP_SCALE ? 1 : oclop == OCL_OP_ADDW ? 3 : 0; 1854 if( n > 0 && wdepth == CV_32F ) 1855 { 1856 for( i = 0; i < n; i++ ) 1857 usrdata_f[i] = (float)usrdata_d[i]; 1858 usrdata_p = (const uchar*)usrdata_f; 1859 } 1860 1861 ocl::Kernel k("KF", ocl::core::arithm_oclsrc, opts); 1862 if (k.empty()) 1863 return false; 1864 1865 UMat src1 = _src1.getUMat(), src2; 1866 UMat dst = _dst.getUMat(), mask = _mask.getUMat(); 1867 1868 ocl::KernelArg src1arg = ocl::KernelArg::ReadOnlyNoSize(src1, cn, kercn); 1869 ocl::KernelArg dstarg = haveMask ? 
ocl::KernelArg::ReadWrite(dst, cn, kercn) : 1870 ocl::KernelArg::WriteOnly(dst, cn, kercn); 1871 ocl::KernelArg maskarg = ocl::KernelArg::ReadOnlyNoSize(mask, 1); 1872 1873 if( haveScalar ) 1874 { 1875 size_t esz = CV_ELEM_SIZE1(wtype)*scalarcn; 1876 double buf[4]={0,0,0,0}; 1877 Mat src2sc = _src2.getMat(); 1878 1879 if( !src2sc.empty() ) 1880 convertAndUnrollScalar(src2sc, wtype, (uchar*)buf, 1); 1881 ocl::KernelArg scalararg = ocl::KernelArg(0, 0, 0, 0, buf, esz); 1882 1883 if( !haveMask ) 1884 { 1885 if(n == 0) 1886 k.args(src1arg, dstarg, scalararg); 1887 else if(n == 1) 1888 k.args(src1arg, dstarg, scalararg, 1889 ocl::KernelArg(0, 0, 0, 0, usrdata_p, usrdata_esz)); 1890 else 1891 CV_Error(Error::StsNotImplemented, "unsupported number of extra parameters"); 1892 } 1893 else 1894 k.args(src1arg, maskarg, dstarg, scalararg); 1895 } 1896 else 1897 { 1898 src2 = _src2.getUMat(); 1899 ocl::KernelArg src2arg = ocl::KernelArg::ReadOnlyNoSize(src2, cn, kercn); 1900 1901 if( !haveMask ) 1902 { 1903 if (n == 0) 1904 k.args(src1arg, src2arg, dstarg); 1905 else if (n == 1) 1906 k.args(src1arg, src2arg, dstarg, 1907 ocl::KernelArg(0, 0, 0, 0, usrdata_p, usrdata_esz)); 1908 else if (n == 3) 1909 k.args(src1arg, src2arg, dstarg, 1910 ocl::KernelArg(0, 0, 0, 0, usrdata_p, usrdata_esz), 1911 ocl::KernelArg(0, 0, 0, 0, usrdata_p + usrdata_esz, usrdata_esz), 1912 ocl::KernelArg(0, 0, 0, 0, usrdata_p + usrdata_esz*2, usrdata_esz)); 1913 else 1914 CV_Error(Error::StsNotImplemented, "unsupported number of extra parameters"); 1915 } 1916 else 1917 k.args(src1arg, src2arg, maskarg, dstarg); 1918 } 1919 1920 size_t globalsize[] = { src1.cols * cn / kercn, (src1.rows + rowsPerWI - 1) / rowsPerWI }; 1921 return k.run(2, globalsize, NULL, false); 1922 } 1923 1924 #endif 1925 1926 static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst, 1927 InputArray _mask, int dtype, BinaryFunc* tab, bool muldiv=false, 1928 void* usrdata=0, int oclop=-1 ) 1929 { 1930 const _InputArray *psrc1 = &_src1, *psrc2 = &_src2; 1931 int kind1 = psrc1->kind(), kind2 = psrc2->kind(); 1932 bool haveMask = !_mask.empty(); 1933 bool reallocate = false; 1934 int type1 = psrc1->type(), depth1 = CV_MAT_DEPTH(type1), cn = CV_MAT_CN(type1); 1935 int type2 = psrc2->type(), depth2 = CV_MAT_DEPTH(type2), cn2 = CV_MAT_CN(type2); 1936 int wtype, dims1 = psrc1->dims(), dims2 = psrc2->dims(); 1937 Size sz1 = dims1 <= 2 ? psrc1->size() : Size(); 1938 Size sz2 = dims2 <= 2 ? psrc2->size() : Size(); 1939 #ifdef HAVE_OPENCL 1940 bool use_opencl = OCL_PERFORMANCE_CHECK(_dst.isUMat()) && dims1 <= 2 && dims2 <= 2; 1941 #endif 1942 bool src1Scalar = checkScalar(*psrc1, type2, kind1, kind2); 1943 bool src2Scalar = checkScalar(*psrc2, type1, kind2, kind1); 1944 1945 if( (kind1 == kind2 || cn == 1) && sz1 == sz2 && dims1 <= 2 && dims2 <= 2 && type1 == type2 && 1946 !haveMask && ((!_dst.fixedType() && (dtype < 0 || CV_MAT_DEPTH(dtype) == depth1)) || 1947 (_dst.fixedType() && _dst.type() == type1)) && 1948 ((src1Scalar && src2Scalar) || (!src1Scalar && !src2Scalar)) ) 1949 { 1950 _dst.createSameSize(*psrc1, type1); 1951 CV_OCL_RUN(use_opencl, 1952 ocl_arithm_op(*psrc1, *psrc2, _dst, _mask, 1953 (!usrdata ? 
type1 : std::max(depth1, CV_32F)), 1954 usrdata, oclop, false)) 1955 1956 Mat src1 = psrc1->getMat(), src2 = psrc2->getMat(), dst = _dst.getMat(); 1957 Size sz = getContinuousSize(src1, src2, dst, src1.channels()); 1958 tab[depth1](src1.ptr(), src1.step, src2.ptr(), src2.step, dst.ptr(), dst.step, sz, usrdata); 1959 return; 1960 } 1961 1962 bool haveScalar = false, swapped12 = false; 1963 1964 if( dims1 != dims2 || sz1 != sz2 || cn != cn2 || 1965 (kind1 == _InputArray::MATX && (sz1 == Size(1,4) || sz1 == Size(1,1))) || 1966 (kind2 == _InputArray::MATX && (sz2 == Size(1,4) || sz2 == Size(1,1))) ) 1967 { 1968 if( checkScalar(*psrc1, type2, kind1, kind2) ) 1969 { 1970 // src1 is a scalar; swap it with src2 1971 swap(psrc1, psrc2); 1972 swap(sz1, sz2); 1973 swap(type1, type2); 1974 swap(depth1, depth2); 1975 swap(cn, cn2); 1976 swap(dims1, dims2); 1977 swapped12 = true; 1978 if( oclop == OCL_OP_SUB ) 1979 oclop = OCL_OP_RSUB; 1980 if ( oclop == OCL_OP_DIV_SCALE ) 1981 oclop = OCL_OP_RDIV_SCALE; 1982 } 1983 else if( !checkScalar(*psrc2, type1, kind2, kind1) ) 1984 CV_Error( CV_StsUnmatchedSizes, 1985 "The operation is neither 'array op array' " 1986 "(where arrays have the same size and the same number of channels), " 1987 "nor 'array op scalar', nor 'scalar op array'" ); 1988 haveScalar = true; 1989 CV_Assert(type2 == CV_64F && (sz2.height == 1 || sz2.height == 4)); 1990 1991 if (!muldiv) 1992 { 1993 Mat sc = psrc2->getMat(); 1994 depth2 = actualScalarDepth(sc.ptr<double>(), cn); 1995 if( depth2 == CV_64F && (depth1 < CV_32S || depth1 == CV_32F) ) 1996 depth2 = CV_32F; 1997 } 1998 else 1999 depth2 = CV_64F; 2000 } 2001 2002 if( dtype < 0 ) 2003 { 2004 if( _dst.fixedType() ) 2005 dtype = _dst.type(); 2006 else 2007 { 2008 if( !haveScalar && type1 != type2 ) 2009 CV_Error(CV_StsBadArg, 2010 "When the input arrays in add/subtract/multiply/divide functions have different types, " 2011 "the output array type must be explicitly specified"); 2012 dtype = type1; 2013 } 2014 } 2015 dtype = CV_MAT_DEPTH(dtype); 2016 2017 if( depth1 == depth2 && dtype == depth1 ) 2018 wtype = dtype; 2019 else if( !muldiv ) 2020 { 2021 wtype = depth1 <= CV_8S && depth2 <= CV_8S ? CV_16S : 2022 depth1 <= CV_32S && depth2 <= CV_32S ? CV_32S : std::max(depth1, depth2); 2023 wtype = std::max(wtype, dtype); 2024 2025 // when the result of addition should be converted to an integer type, 2026 // and just one of the input arrays is floating-point, it makes sense to convert that input to integer type before the operation, 2027 // instead of converting the other input to floating-point and then converting the operation result back to integers. 2028 if( dtype < CV_32F && (depth1 < CV_32F || depth2 < CV_32F) ) 2029 wtype = CV_32S; 2030 } 2031 else 2032 { 2033 wtype = std::max(depth1, std::max(depth2, CV_32F)); 2034 wtype = std::max(wtype, dtype); 2035 } 2036 2037 dtype = CV_MAKETYPE(dtype, cn); 2038 wtype = CV_MAKETYPE(wtype, cn); 2039 2040 if( haveMask ) 2041 { 2042 int mtype = _mask.type(); 2043 CV_Assert( (mtype == CV_8UC1 || mtype == CV_8SC1) && _mask.sameSize(*psrc1) ); 2044 reallocate = !_dst.sameSize(*psrc1) || _dst.type() != dtype; 2045 } 2046 2047 _dst.createSameSize(*psrc1, dtype); 2048 if( reallocate ) 2049 _dst.setTo(0.); 2050 2051 CV_OCL_RUN(use_opencl, 2052 ocl_arithm_op(*psrc1, *psrc2, _dst, _mask, wtype, 2053 usrdata, oclop, haveScalar)) 2054 2055 BinaryFunc cvtsrc1 = type1 == wtype ? 0 : getConvertFunc(type1, wtype); 2056 BinaryFunc cvtsrc2 = type2 == type1 ? cvtsrc1 : type2 == wtype ? 
0 : getConvertFunc(type2, wtype); 2057 BinaryFunc cvtdst = dtype == wtype ? 0 : getConvertFunc(wtype, dtype); 2058 2059 size_t esz1 = CV_ELEM_SIZE(type1), esz2 = CV_ELEM_SIZE(type2); 2060 size_t dsz = CV_ELEM_SIZE(dtype), wsz = CV_ELEM_SIZE(wtype); 2061 size_t blocksize0 = (size_t)(BLOCK_SIZE + wsz-1)/wsz; 2062 BinaryFunc copymask = getCopyMaskFunc(dsz); 2063 Mat src1 = psrc1->getMat(), src2 = psrc2->getMat(), dst = _dst.getMat(), mask = _mask.getMat(); 2064 2065 AutoBuffer<uchar> _buf; 2066 uchar *buf, *maskbuf = 0, *buf1 = 0, *buf2 = 0, *wbuf = 0; 2067 size_t bufesz = (cvtsrc1 ? wsz : 0) + 2068 (cvtsrc2 || haveScalar ? wsz : 0) + 2069 (cvtdst ? wsz : 0) + 2070 (haveMask ? dsz : 0); 2071 BinaryFunc func = tab[CV_MAT_DEPTH(wtype)]; 2072 2073 if( !haveScalar ) 2074 { 2075 const Mat* arrays[] = { &src1, &src2, &dst, &mask, 0 }; 2076 uchar* ptrs[4]; 2077 2078 NAryMatIterator it(arrays, ptrs); 2079 size_t total = it.size, blocksize = total; 2080 2081 if( haveMask || cvtsrc1 || cvtsrc2 || cvtdst ) 2082 blocksize = std::min(blocksize, blocksize0); 2083 2084 _buf.allocate(bufesz*blocksize + 64); 2085 buf = _buf; 2086 if( cvtsrc1 ) 2087 buf1 = buf, buf = alignPtr(buf + blocksize*wsz, 16); 2088 if( cvtsrc2 ) 2089 buf2 = buf, buf = alignPtr(buf + blocksize*wsz, 16); 2090 wbuf = maskbuf = buf; 2091 if( cvtdst ) 2092 buf = alignPtr(buf + blocksize*wsz, 16); 2093 if( haveMask ) 2094 maskbuf = buf; 2095 2096 for( size_t i = 0; i < it.nplanes; i++, ++it ) 2097 { 2098 for( size_t j = 0; j < total; j += blocksize ) 2099 { 2100 int bsz = (int)MIN(total - j, blocksize); 2101 Size bszn(bsz*cn, 1); 2102 const uchar *sptr1 = ptrs[0], *sptr2 = ptrs[1]; 2103 uchar* dptr = ptrs[2]; 2104 if( cvtsrc1 ) 2105 { 2106 cvtsrc1( sptr1, 1, 0, 1, buf1, 1, bszn, 0 ); 2107 sptr1 = buf1; 2108 } 2109 if( ptrs[0] == ptrs[1] ) 2110 sptr2 = sptr1; 2111 else if( cvtsrc2 ) 2112 { 2113 cvtsrc2( sptr2, 1, 0, 1, buf2, 1, bszn, 0 ); 2114 sptr2 = buf2; 2115 } 2116 2117 if( !haveMask && !cvtdst ) 2118 func( sptr1, 1, sptr2, 1, dptr, 1, bszn, usrdata ); 2119 else 2120 { 2121 func( sptr1, 1, sptr2, 1, wbuf, 0, bszn, usrdata ); 2122 if( !haveMask ) 2123 cvtdst( wbuf, 1, 0, 1, dptr, 1, bszn, 0 ); 2124 else if( !cvtdst ) 2125 { 2126 copymask( wbuf, 1, ptrs[3], 1, dptr, 1, Size(bsz, 1), &dsz ); 2127 ptrs[3] += bsz; 2128 } 2129 else 2130 { 2131 cvtdst( wbuf, 1, 0, 1, maskbuf, 1, bszn, 0 ); 2132 copymask( maskbuf, 1, ptrs[3], 1, dptr, 1, Size(bsz, 1), &dsz ); 2133 ptrs[3] += bsz; 2134 } 2135 } 2136 ptrs[0] += bsz*esz1; ptrs[1] += bsz*esz2; ptrs[2] += bsz*dsz; 2137 } 2138 } 2139 } 2140 else 2141 { 2142 const Mat* arrays[] = { &src1, &dst, &mask, 0 }; 2143 uchar* ptrs[3]; 2144 2145 NAryMatIterator it(arrays, ptrs); 2146 size_t total = it.size, blocksize = std::min(total, blocksize0); 2147 2148 _buf.allocate(bufesz*blocksize + 64); 2149 buf = _buf; 2150 if( cvtsrc1 ) 2151 buf1 = buf, buf = alignPtr(buf + blocksize*wsz, 16); 2152 buf2 = buf; buf = alignPtr(buf + blocksize*wsz, 16); 2153 wbuf = maskbuf = buf; 2154 if( cvtdst ) 2155 buf = alignPtr(buf + blocksize*wsz, 16); 2156 if( haveMask ) 2157 maskbuf = buf; 2158 2159 convertAndUnrollScalar( src2, wtype, buf2, blocksize); 2160 2161 for( size_t i = 0; i < it.nplanes; i++, ++it ) 2162 { 2163 for( size_t j = 0; j < total; j += blocksize ) 2164 { 2165 int bsz = (int)MIN(total - j, blocksize); 2166 Size bszn(bsz*cn, 1); 2167 const uchar *sptr1 = ptrs[0]; 2168 const uchar* sptr2 = buf2; 2169 uchar* dptr = ptrs[1]; 2170 2171 if( cvtsrc1 ) 2172 { 2173 cvtsrc1( sptr1, 1, 0, 1, buf1, 1, bszn, 0 ); 2174 sptr1 
= buf1; 2175 } 2176 2177 if( swapped12 ) 2178 std::swap(sptr1, sptr2); 2179 2180 if( !haveMask && !cvtdst ) 2181 func( sptr1, 1, sptr2, 1, dptr, 1, bszn, usrdata ); 2182 else 2183 { 2184 func( sptr1, 1, sptr2, 1, wbuf, 1, bszn, usrdata ); 2185 if( !haveMask ) 2186 cvtdst( wbuf, 1, 0, 1, dptr, 1, bszn, 0 ); 2187 else if( !cvtdst ) 2188 { 2189 copymask( wbuf, 1, ptrs[2], 1, dptr, 1, Size(bsz, 1), &dsz ); 2190 ptrs[2] += bsz; 2191 } 2192 else 2193 { 2194 cvtdst( wbuf, 1, 0, 1, maskbuf, 1, bszn, 0 ); 2195 copymask( maskbuf, 1, ptrs[2], 1, dptr, 1, Size(bsz, 1), &dsz ); 2196 ptrs[2] += bsz; 2197 } 2198 } 2199 ptrs[0] += bsz*esz1; ptrs[1] += bsz*dsz; 2200 } 2201 } 2202 } 2203 } 2204 2205 static BinaryFunc* getAddTab() 2206 { 2207 static BinaryFunc addTab[] = 2208 { 2209 (BinaryFunc)GET_OPTIMIZED(add8u), (BinaryFunc)GET_OPTIMIZED(add8s), 2210 (BinaryFunc)GET_OPTIMIZED(add16u), (BinaryFunc)GET_OPTIMIZED(add16s), 2211 (BinaryFunc)GET_OPTIMIZED(add32s), 2212 (BinaryFunc)GET_OPTIMIZED(add32f), (BinaryFunc)add64f, 2213 0 2214 }; 2215 2216 return addTab; 2217 } 2218 2219 static BinaryFunc* getSubTab() 2220 { 2221 static BinaryFunc subTab[] = 2222 { 2223 (BinaryFunc)GET_OPTIMIZED(sub8u), (BinaryFunc)GET_OPTIMIZED(sub8s), 2224 (BinaryFunc)GET_OPTIMIZED(sub16u), (BinaryFunc)GET_OPTIMIZED(sub16s), 2225 (BinaryFunc)GET_OPTIMIZED(sub32s), 2226 (BinaryFunc)GET_OPTIMIZED(sub32f), (BinaryFunc)sub64f, 2227 0 2228 }; 2229 2230 return subTab; 2231 } 2232 2233 static BinaryFunc* getAbsDiffTab() 2234 { 2235 static BinaryFunc absDiffTab[] = 2236 { 2237 (BinaryFunc)GET_OPTIMIZED(absdiff8u), (BinaryFunc)GET_OPTIMIZED(absdiff8s), 2238 (BinaryFunc)GET_OPTIMIZED(absdiff16u), (BinaryFunc)GET_OPTIMIZED(absdiff16s), 2239 (BinaryFunc)GET_OPTIMIZED(absdiff32s), 2240 (BinaryFunc)GET_OPTIMIZED(absdiff32f), (BinaryFunc)absdiff64f, 2241 0 2242 }; 2243 2244 return absDiffTab; 2245 } 2246 2247 } 2248 2249 void cv::add( InputArray src1, InputArray src2, OutputArray dst, 2250 InputArray mask, int dtype ) 2251 { 2252 arithm_op(src1, src2, dst, mask, dtype, getAddTab(), false, 0, OCL_OP_ADD ); 2253 } 2254 2255 void cv::subtract( InputArray _src1, InputArray _src2, OutputArray _dst, 2256 InputArray mask, int dtype ) 2257 { 2258 #ifdef HAVE_TEGRA_OPTIMIZATION 2259 if (tegra::useTegra()) 2260 { 2261 int kind1 = _src1.kind(), kind2 = _src2.kind(); 2262 Mat src1 = _src1.getMat(), src2 = _src2.getMat(); 2263 bool src1Scalar = checkScalar(src1, _src2.type(), kind1, kind2); 2264 bool src2Scalar = checkScalar(src2, _src1.type(), kind2, kind1); 2265 2266 if (!src1Scalar && !src2Scalar && 2267 src1.depth() == CV_8U && src2.type() == src1.type() && 2268 src1.dims == 2 && src2.size() == src1.size() && 2269 mask.empty()) 2270 { 2271 if (dtype < 0) 2272 { 2273 if (_dst.fixedType()) 2274 { 2275 dtype = _dst.depth(); 2276 } 2277 else 2278 { 2279 dtype = src1.depth(); 2280 } 2281 } 2282 2283 dtype = CV_MAT_DEPTH(dtype); 2284 2285 if (!_dst.fixedType() || dtype == _dst.depth()) 2286 { 2287 _dst.create(src1.size(), CV_MAKE_TYPE(dtype, src1.channels())); 2288 2289 if (dtype == CV_16S) 2290 { 2291 Mat dst = _dst.getMat(); 2292 if(tegra::subtract_8u8u16s(src1, src2, dst)) 2293 return; 2294 } 2295 else if (dtype == CV_32F) 2296 { 2297 Mat dst = _dst.getMat(); 2298 if(tegra::subtract_8u8u32f(src1, src2, dst)) 2299 return; 2300 } 2301 else if (dtype == CV_8S) 2302 { 2303 Mat dst = _dst.getMat(); 2304 if(tegra::subtract_8u8u8s(src1, src2, dst)) 2305 return; 2306 } 2307 } 2308 } 2309 } 2310 #endif 2311 arithm_op(_src1, _src2, _dst, mask, dtype, getSubTab(), false, 
0, OCL_OP_SUB ); 2312 } 2313 2314 void cv::absdiff( InputArray src1, InputArray src2, OutputArray dst ) 2315 { 2316 arithm_op(src1, src2, dst, noArray(), -1, getAbsDiffTab(), false, 0, OCL_OP_ABSDIFF); 2317 } 2318 2319 /****************************************************************************************\ 2320 * multiply/divide * 2321 \****************************************************************************************/ 2322 2323 namespace cv 2324 { 2325 2326 template <typename T, typename WT> 2327 struct Mul_SIMD 2328 { 2329 int operator() (const T *, const T *, T *, int, WT) const 2330 { 2331 return 0; 2332 } 2333 }; 2334 2335 #if CV_NEON 2336 2337 template <> 2338 struct Mul_SIMD<uchar, float> 2339 { 2340 int operator() (const uchar * src1, const uchar * src2, uchar * dst, int width, float scale) const 2341 { 2342 int x = 0; 2343 2344 if( scale == 1.0f ) 2345 for ( ; x <= width - 8; x += 8) 2346 { 2347 uint16x8_t v_src1 = vmovl_u8(vld1_u8(src1 + x)); 2348 uint16x8_t v_src2 = vmovl_u8(vld1_u8(src2 + x)); 2349 2350 float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), 2351 vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2)))); 2352 float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), 2353 vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2)))); 2354 2355 uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), 2356 vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); 2357 vst1_u8(dst + x, vqmovn_u16(v_dst)); 2358 } 2359 else 2360 { 2361 float32x4_t v_scale = vdupq_n_f32(scale); 2362 for ( ; x <= width - 8; x += 8) 2363 { 2364 uint16x8_t v_src1 = vmovl_u8(vld1_u8(src1 + x)); 2365 uint16x8_t v_src2 = vmovl_u8(vld1_u8(src2 + x)); 2366 2367 float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), 2368 vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2)))); 2369 v_dst1 = vmulq_f32(v_dst1, v_scale); 2370 float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), 2371 vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2)))); 2372 v_dst2 = vmulq_f32(v_dst2, v_scale); 2373 2374 uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), 2375 vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); 2376 vst1_u8(dst + x, vqmovn_u16(v_dst)); 2377 } 2378 } 2379 2380 return x; 2381 } 2382 }; 2383 2384 template <> 2385 struct Mul_SIMD<schar, float> 2386 { 2387 int operator() (const schar * src1, const schar * src2, schar * dst, int width, float scale) const 2388 { 2389 int x = 0; 2390 2391 if( scale == 1.0f ) 2392 for ( ; x <= width - 8; x += 8) 2393 { 2394 int16x8_t v_src1 = vmovl_s8(vld1_s8(src1 + x)); 2395 int16x8_t v_src2 = vmovl_s8(vld1_s8(src2 + x)); 2396 2397 float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))), 2398 vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2)))); 2399 float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))), 2400 vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2)))); 2401 2402 int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), 2403 vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); 2404 vst1_s8(dst + x, vqmovn_s16(v_dst)); 2405 } 2406 else 2407 { 2408 float32x4_t v_scale = vdupq_n_f32(scale); 2409 for ( ; x <= width - 8; x += 8) 2410 { 2411 int16x8_t v_src1 = vmovl_s8(vld1_s8(src1 + x)); 2412 int16x8_t v_src2 = vmovl_s8(vld1_s8(src2 + x)); 2413 2414 float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))), 2415 vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2)))); 2416 v_dst1 = vmulq_f32(v_dst1, v_scale); 2417 float32x4_t v_dst2 = 
vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))), 2418 vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2)))); 2419 v_dst2 = vmulq_f32(v_dst2, v_scale); 2420 2421 int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), 2422 vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); 2423 vst1_s8(dst + x, vqmovn_s16(v_dst)); 2424 } 2425 } 2426 2427 return x; 2428 } 2429 }; 2430 2431 template <> 2432 struct Mul_SIMD<ushort, float> 2433 { 2434 int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, float scale) const 2435 { 2436 int x = 0; 2437 2438 if( scale == 1.0f ) 2439 for ( ; x <= width - 8; x += 8) 2440 { 2441 uint16x8_t v_src1 = vld1q_u16(src1 + x), v_src2 = vld1q_u16(src2 + x); 2442 2443 float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), 2444 vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2)))); 2445 float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), 2446 vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2)))); 2447 2448 uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), 2449 vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); 2450 vst1q_u16(dst + x, v_dst); 2451 } 2452 else 2453 { 2454 float32x4_t v_scale = vdupq_n_f32(scale); 2455 for ( ; x <= width - 8; x += 8) 2456 { 2457 uint16x8_t v_src1 = vld1q_u16(src1 + x), v_src2 = vld1q_u16(src2 + x); 2458 2459 float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), 2460 vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2)))); 2461 v_dst1 = vmulq_f32(v_dst1, v_scale); 2462 float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), 2463 vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2)))); 2464 v_dst2 = vmulq_f32(v_dst2, v_scale); 2465 2466 uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), 2467 vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); 2468 vst1q_u16(dst + x, v_dst); 2469 } 2470 } 2471 2472 return x; 2473 } 2474 }; 2475 2476 template <> 2477 struct Mul_SIMD<short, float> 2478 { 2479 int operator() (const short * src1, const short * src2, short * dst, int width, float scale) const 2480 { 2481 int x = 0; 2482 2483 if( scale == 1.0f ) 2484 for ( ; x <= width - 8; x += 8) 2485 { 2486 int16x8_t v_src1 = vld1q_s16(src1 + x), v_src2 = vld1q_s16(src2 + x); 2487 2488 float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))), 2489 vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2)))); 2490 float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))), 2491 vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2)))); 2492 2493 int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), 2494 vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); 2495 vst1q_s16(dst + x, v_dst); 2496 } 2497 else 2498 { 2499 float32x4_t v_scale = vdupq_n_f32(scale); 2500 for ( ; x <= width - 8; x += 8) 2501 { 2502 int16x8_t v_src1 = vld1q_s16(src1 + x), v_src2 = vld1q_s16(src2 + x); 2503 2504 float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))), 2505 vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2)))); 2506 v_dst1 = vmulq_f32(v_dst1, v_scale); 2507 float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))), 2508 vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2)))); 2509 v_dst2 = vmulq_f32(v_dst2, v_scale); 2510 2511 int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), 2512 vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); 2513 vst1q_s16(dst + x, v_dst); 2514 } 2515 } 2516 2517 return x; 2518 } 2519 }; 2520 2521 template <> 2522 struct Mul_SIMD<float, float> 2523 { 2524 int operator() (const float * 
src1, const float * src2, float * dst, int width, float scale) const 2525 { 2526 int x = 0; 2527 2528 if( scale == 1.0f ) 2529 for ( ; x <= width - 8; x += 8) 2530 { 2531 float32x4_t v_dst1 = vmulq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x)); 2532 float32x4_t v_dst2 = vmulq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4)); 2533 vst1q_f32(dst + x, v_dst1); 2534 vst1q_f32(dst + x + 4, v_dst2); 2535 } 2536 else 2537 { 2538 float32x4_t v_scale = vdupq_n_f32(scale); 2539 for ( ; x <= width - 8; x += 8) 2540 { 2541 float32x4_t v_dst1 = vmulq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x)); 2542 v_dst1 = vmulq_f32(v_dst1, v_scale); 2543 2544 float32x4_t v_dst2 = vmulq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4)); 2545 v_dst2 = vmulq_f32(v_dst2, v_scale); 2546 2547 vst1q_f32(dst + x, v_dst1); 2548 vst1q_f32(dst + x + 4, v_dst2); 2549 } 2550 } 2551 2552 return x; 2553 } 2554 }; 2555 2556 #elif CV_SSE2 2557 2558 #if CV_SSE4_1 2559 2560 template <> 2561 struct Mul_SIMD<ushort, float> 2562 { 2563 Mul_SIMD() 2564 { 2565 haveSSE = checkHardwareSupport(CV_CPU_SSE4_1); 2566 } 2567 2568 int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, float scale) const 2569 { 2570 int x = 0; 2571 2572 if (!haveSSE) 2573 return x; 2574 2575 __m128i v_zero = _mm_setzero_si128(); 2576 2577 if( scale != 1.0f ) 2578 { 2579 __m128 v_scale = _mm_set1_ps(scale); 2580 for ( ; x <= width - 8; x += 8) 2581 { 2582 __m128i v_src1 = _mm_loadu_si128((__m128i const *)(src1 + x)); 2583 __m128i v_src2 = _mm_loadu_si128((__m128i const *)(src2 + x)); 2584 2585 __m128 v_dst1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src1, v_zero)), 2586 _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src2, v_zero))); 2587 v_dst1 = _mm_mul_ps(v_dst1, v_scale); 2588 2589 __m128 v_dst2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src1, v_zero)), 2590 _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src2, v_zero))); 2591 v_dst2 = _mm_mul_ps(v_dst2, v_scale); 2592 2593 __m128i v_dsti = _mm_packus_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2)); 2594 _mm_storeu_si128((__m128i *)(dst + x), v_dsti); 2595 } 2596 } 2597 2598 return x; 2599 } 2600 2601 bool haveSSE; 2602 }; 2603 2604 #endif 2605 2606 template <> 2607 struct Mul_SIMD<schar, float> 2608 { 2609 Mul_SIMD() 2610 { 2611 haveSSE = checkHardwareSupport(CV_CPU_SSE2); 2612 } 2613 2614 int operator() (const schar * src1, const schar * src2, schar * dst, int width, float scale) const 2615 { 2616 int x = 0; 2617 2618 if (!haveSSE) 2619 return x; 2620 2621 __m128i v_zero = _mm_setzero_si128(); 2622 2623 if( scale == 1.0f ) 2624 for ( ; x <= width - 8; x += 8) 2625 { 2626 __m128i v_src1 = _mm_loadl_epi64((__m128i const *)(src1 + x)); 2627 __m128i v_src2 = _mm_loadl_epi64((__m128i const *)(src2 + x)); 2628 2629 v_src1 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src1), 8); 2630 v_src2 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src2), 8); 2631 2632 __m128 v_dst1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16)), 2633 _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16))); 2634 2635 __m128 v_dst2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16)), 2636 _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16))); 2637 2638 __m128i v_dsti = _mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2)); 2639 _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dsti, v_zero)); 2640 } 2641 else 2642 { 2643 __m128 v_scale = _mm_set1_ps(scale); 2644 for ( ; x <= width - 8; x 
+= 8) 2645 { 2646 __m128i v_src1 = _mm_loadl_epi64((__m128i const *)(src1 + x)); 2647 __m128i v_src2 = _mm_loadl_epi64((__m128i const *)(src2 + x)); 2648 2649 v_src1 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src1), 8); 2650 v_src2 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src2), 8); 2651 2652 __m128 v_dst1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16)), 2653 _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16))); 2654 v_dst1 = _mm_mul_ps(v_dst1, v_scale); 2655 2656 __m128 v_dst2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16)), 2657 _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16))); 2658 v_dst2 = _mm_mul_ps(v_dst2, v_scale); 2659 2660 __m128i v_dsti = _mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2)); 2661 _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dsti, v_zero)); 2662 } 2663 } 2664 2665 return x; 2666 } 2667 2668 bool haveSSE; 2669 }; 2670 2671 template <> 2672 struct Mul_SIMD<short, float> 2673 { 2674 Mul_SIMD() 2675 { 2676 haveSSE = checkHardwareSupport(CV_CPU_SSE2); 2677 } 2678 2679 int operator() (const short * src1, const short * src2, short * dst, int width, float scale) const 2680 { 2681 int x = 0; 2682 2683 if (!haveSSE) 2684 return x; 2685 2686 __m128i v_zero = _mm_setzero_si128(); 2687 2688 if( scale != 1.0f ) 2689 { 2690 __m128 v_scale = _mm_set1_ps(scale); 2691 for ( ; x <= width - 8; x += 8) 2692 { 2693 __m128i v_src1 = _mm_loadu_si128((__m128i const *)(src1 + x)); 2694 __m128i v_src2 = _mm_loadu_si128((__m128i const *)(src2 + x)); 2695 2696 __m128 v_dst1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16)), 2697 _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16))); 2698 v_dst1 = _mm_mul_ps(v_dst1, v_scale); 2699 2700 __m128 v_dst2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16)), 2701 _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16))); 2702 v_dst2 = _mm_mul_ps(v_dst2, v_scale); 2703 2704 __m128i v_dsti = _mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2)); 2705 _mm_storeu_si128((__m128i *)(dst + x), v_dsti); 2706 } 2707 } 2708 2709 return x; 2710 } 2711 2712 bool haveSSE; 2713 }; 2714 2715 #endif 2716 2717 template<typename T, typename WT> static void 2718 mul_( const T* src1, size_t step1, const T* src2, size_t step2, 2719 T* dst, size_t step, Size size, WT scale ) 2720 { 2721 step1 /= sizeof(src1[0]); 2722 step2 /= sizeof(src2[0]); 2723 step /= sizeof(dst[0]); 2724 2725 Mul_SIMD<T, WT> vop; 2726 2727 if( scale == (WT)1. 
) 2728 { 2729 for( ; size.height--; src1 += step1, src2 += step2, dst += step ) 2730 { 2731 int i = vop(src1, src2, dst, size.width, scale); 2732 #if CV_ENABLE_UNROLLED 2733 for(; i <= size.width - 4; i += 4 ) 2734 { 2735 T t0; 2736 T t1; 2737 t0 = saturate_cast<T>(src1[i ] * src2[i ]); 2738 t1 = saturate_cast<T>(src1[i+1] * src2[i+1]); 2739 dst[i ] = t0; 2740 dst[i+1] = t1; 2741 2742 t0 = saturate_cast<T>(src1[i+2] * src2[i+2]); 2743 t1 = saturate_cast<T>(src1[i+3] * src2[i+3]); 2744 dst[i+2] = t0; 2745 dst[i+3] = t1; 2746 } 2747 #endif 2748 for( ; i < size.width; i++ ) 2749 dst[i] = saturate_cast<T>(src1[i] * src2[i]); 2750 } 2751 } 2752 else 2753 { 2754 for( ; size.height--; src1 += step1, src2 += step2, dst += step ) 2755 { 2756 int i = vop(src1, src2, dst, size.width, scale); 2757 #if CV_ENABLE_UNROLLED 2758 for(; i <= size.width - 4; i += 4 ) 2759 { 2760 T t0 = saturate_cast<T>(scale*(WT)src1[i]*src2[i]); 2761 T t1 = saturate_cast<T>(scale*(WT)src1[i+1]*src2[i+1]); 2762 dst[i] = t0; dst[i+1] = t1; 2763 2764 t0 = saturate_cast<T>(scale*(WT)src1[i+2]*src2[i+2]); 2765 t1 = saturate_cast<T>(scale*(WT)src1[i+3]*src2[i+3]); 2766 dst[i+2] = t0; dst[i+3] = t1; 2767 } 2768 #endif 2769 for( ; i < size.width; i++ ) 2770 dst[i] = saturate_cast<T>(scale*(WT)src1[i]*src2[i]); 2771 } 2772 } 2773 } 2774 2775 template <typename T> 2776 struct Div_SIMD 2777 { 2778 int operator() (const T *, const T *, T *, int, double) const 2779 { 2780 return 0; 2781 } 2782 }; 2783 2784 template <typename T> 2785 struct Recip_SIMD 2786 { 2787 int operator() (const T *, T *, int, double) const 2788 { 2789 return 0; 2790 } 2791 }; 2792 2793 2794 #if CV_SIMD128 2795 2796 template <> 2797 struct Div_SIMD<uchar> 2798 { 2799 bool haveSIMD; 2800 Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } 2801 2802 int operator() (const uchar * src1, const uchar * src2, uchar * dst, int width, double scale) const 2803 { 2804 int x = 0; 2805 2806 if (!haveSIMD) 2807 return x; 2808 2809 v_float32x4 v_scale = v_setall_f32((float)scale); 2810 v_uint16x8 v_zero = v_setzero_u16(); 2811 2812 for ( ; x <= width - 8; x += 8) 2813 { 2814 v_uint16x8 v_src1 = v_load_expand(src1 + x); 2815 v_uint16x8 v_src2 = v_load_expand(src2 + x); 2816 2817 v_uint32x4 t0, t1, t2, t3; 2818 v_expand(v_src1, t0, t1); 2819 v_expand(v_src2, t2, t3); 2820 2821 v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0)); 2822 v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1)); 2823 2824 v_float32x4 f2 = v_cvt_f32(v_reinterpret_as_s32(t2)); 2825 v_float32x4 f3 = v_cvt_f32(v_reinterpret_as_s32(t3)); 2826 2827 f0 = f0 * v_scale / f2; 2828 f1 = f1 * v_scale / f3; 2829 2830 v_int32x4 i0 = v_round(f0), i1 = v_round(f1); 2831 v_uint16x8 res = v_pack_u(i0, i1); 2832 2833 res = v_select(v_src2 == v_zero, v_zero, res); 2834 v_pack_store(dst + x, res); 2835 } 2836 2837 return x; 2838 } 2839 }; 2840 2841 2842 template <> 2843 struct Div_SIMD<schar> 2844 { 2845 bool haveSIMD; 2846 Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } 2847 2848 int operator() (const schar * src1, const schar * src2, schar * dst, int width, double scale) const 2849 { 2850 int x = 0; 2851 2852 if (!haveSIMD) 2853 return x; 2854 2855 v_float32x4 v_scale = v_setall_f32((float)scale); 2856 v_int16x8 v_zero = v_setzero_s16(); 2857 2858 for ( ; x <= width - 8; x += 8) 2859 { 2860 v_int16x8 v_src1 = v_load_expand(src1 + x); 2861 v_int16x8 v_src2 = v_load_expand(src2 + x); 2862 2863 v_int32x4 t0, t1, t2, t3; 2864 
v_expand(v_src1, t0, t1); 2865 v_expand(v_src2, t2, t3); 2866 2867 v_float32x4 f0 = v_cvt_f32(t0); 2868 v_float32x4 f1 = v_cvt_f32(t1); 2869 2870 v_float32x4 f2 = v_cvt_f32(t2); 2871 v_float32x4 f3 = v_cvt_f32(t3); 2872 2873 f0 = f0 * v_scale / f2; 2874 f1 = f1 * v_scale / f3; 2875 2876 v_int32x4 i0 = v_round(f0), i1 = v_round(f1); 2877 v_int16x8 res = v_pack(i0, i1); 2878 2879 res = v_select(v_src2 == v_zero, v_zero, res); 2880 v_pack_store(dst + x, res); 2881 } 2882 2883 return x; 2884 } 2885 }; 2886 2887 2888 template <> 2889 struct Div_SIMD<ushort> 2890 { 2891 bool haveSIMD; 2892 Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } 2893 2894 int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, double scale) const 2895 { 2896 int x = 0; 2897 2898 if (!haveSIMD) 2899 return x; 2900 2901 v_float32x4 v_scale = v_setall_f32((float)scale); 2902 v_uint16x8 v_zero = v_setzero_u16(); 2903 2904 for ( ; x <= width - 8; x += 8) 2905 { 2906 v_uint16x8 v_src1 = v_load(src1 + x); 2907 v_uint16x8 v_src2 = v_load(src2 + x); 2908 2909 v_uint32x4 t0, t1, t2, t3; 2910 v_expand(v_src1, t0, t1); 2911 v_expand(v_src2, t2, t3); 2912 2913 v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0)); 2914 v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1)); 2915 2916 v_float32x4 f2 = v_cvt_f32(v_reinterpret_as_s32(t2)); 2917 v_float32x4 f3 = v_cvt_f32(v_reinterpret_as_s32(t3)); 2918 2919 f0 = f0 * v_scale / f2; 2920 f1 = f1 * v_scale / f3; 2921 2922 v_int32x4 i0 = v_round(f0), i1 = v_round(f1); 2923 v_uint16x8 res = v_pack_u(i0, i1); 2924 2925 res = v_select(v_src2 == v_zero, v_zero, res); 2926 v_store(dst + x, res); 2927 } 2928 2929 return x; 2930 } 2931 }; 2932 2933 template <> 2934 struct Div_SIMD<short> 2935 { 2936 bool haveSIMD; 2937 Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } 2938 2939 int operator() (const short * src1, const short * src2, short * dst, int width, double scale) const 2940 { 2941 int x = 0; 2942 2943 if (!haveSIMD) 2944 return x; 2945 2946 v_float32x4 v_scale = v_setall_f32((float)scale); 2947 v_int16x8 v_zero = v_setzero_s16(); 2948 2949 for ( ; x <= width - 8; x += 8) 2950 { 2951 v_int16x8 v_src1 = v_load(src1 + x); 2952 v_int16x8 v_src2 = v_load(src2 + x); 2953 2954 v_int32x4 t0, t1, t2, t3; 2955 v_expand(v_src1, t0, t1); 2956 v_expand(v_src2, t2, t3); 2957 2958 v_float32x4 f0 = v_cvt_f32(t0); 2959 v_float32x4 f1 = v_cvt_f32(t1); 2960 2961 v_float32x4 f2 = v_cvt_f32(t2); 2962 v_float32x4 f3 = v_cvt_f32(t3); 2963 2964 f0 = f0 * v_scale / f2; 2965 f1 = f1 * v_scale / f3; 2966 2967 v_int32x4 i0 = v_round(f0), i1 = v_round(f1); 2968 v_int16x8 res = v_pack(i0, i1); 2969 2970 res = v_select(v_src2 == v_zero, v_zero, res); 2971 v_store(dst + x, res); 2972 } 2973 2974 return x; 2975 } 2976 }; 2977 2978 template <> 2979 struct Div_SIMD<int> 2980 { 2981 bool haveSIMD; 2982 Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } 2983 2984 int operator() (const int * src1, const int * src2, int * dst, int width, double scale) const 2985 { 2986 int x = 0; 2987 2988 if (!haveSIMD) 2989 return x; 2990 2991 v_float32x4 v_scale = v_setall_f32((float)scale); 2992 v_int32x4 v_zero = v_setzero_s32(); 2993 2994 for ( ; x <= width - 8; x += 8) 2995 { 2996 v_int32x4 t0 = v_load(src1 + x); 2997 v_int32x4 t1 = v_load(src1 + x + 4); 2998 v_int32x4 t2 = v_load(src2 + x); 2999 v_int32x4 t3 = v_load(src2 + x + 4); 3000 3001 v_float32x4 f0 = 
v_cvt_f32(t0); 3002 v_float32x4 f1 = v_cvt_f32(t1); 3003 v_float32x4 f2 = v_cvt_f32(t2); 3004 v_float32x4 f3 = v_cvt_f32(t3); 3005 3006 f0 = f0 * v_scale / f2; 3007 f1 = f1 * v_scale / f3; 3008 3009 v_int32x4 res0 = v_round(f0), res1 = v_round(f1); 3010 3011 res0 = v_select(t2 == v_zero, v_zero, res0); 3012 res1 = v_select(t3 == v_zero, v_zero, res1); 3013 v_store(dst + x, res0); 3014 v_store(dst + x + 4, res1); 3015 } 3016 3017 return x; 3018 } 3019 }; 3020 3021 3022 template <> 3023 struct Div_SIMD<float> 3024 { 3025 bool haveSIMD; 3026 Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } 3027 3028 int operator() (const float * src1, const float * src2, float * dst, int width, double scale) const 3029 { 3030 int x = 0; 3031 3032 if (!haveSIMD) 3033 return x; 3034 3035 v_float32x4 v_scale = v_setall_f32((float)scale); 3036 v_float32x4 v_zero = v_setzero_f32(); 3037 3038 for ( ; x <= width - 8; x += 8) 3039 { 3040 v_float32x4 f0 = v_load(src1 + x); 3041 v_float32x4 f1 = v_load(src1 + x + 4); 3042 v_float32x4 f2 = v_load(src2 + x); 3043 v_float32x4 f3 = v_load(src2 + x + 4); 3044 3045 v_float32x4 res0 = f0 * v_scale / f2; 3046 v_float32x4 res1 = f1 * v_scale / f3; 3047 3048 res0 = v_select(f2 == v_zero, v_zero, res0); 3049 res1 = v_select(f3 == v_zero, v_zero, res1); 3050 3051 v_store(dst + x, res0); 3052 v_store(dst + x + 4, res1); 3053 } 3054 3055 return x; 3056 } 3057 }; 3058 3059 3060 ///////////////////////// RECIPROCAL ////////////////////// 3061 3062 template <> 3063 struct Recip_SIMD<uchar> 3064 { 3065 bool haveSIMD; 3066 Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } 3067 3068 int operator() (const uchar * src2, uchar * dst, int width, double scale) const 3069 { 3070 int x = 0; 3071 3072 if (!haveSIMD) 3073 return x; 3074 3075 v_float32x4 v_scale = v_setall_f32((float)scale); 3076 v_uint16x8 v_zero = v_setzero_u16(); 3077 3078 for ( ; x <= width - 8; x += 8) 3079 { 3080 v_uint16x8 v_src2 = v_load_expand(src2 + x); 3081 3082 v_uint32x4 t0, t1; 3083 v_expand(v_src2, t0, t1); 3084 3085 v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0)); 3086 v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1)); 3087 3088 f0 = v_scale / f0; 3089 f1 = v_scale / f1; 3090 3091 v_int32x4 i0 = v_round(f0), i1 = v_round(f1); 3092 v_uint16x8 res = v_pack_u(i0, i1); 3093 3094 res = v_select(v_src2 == v_zero, v_zero, res); 3095 v_pack_store(dst + x, res); 3096 } 3097 3098 return x; 3099 } 3100 }; 3101 3102 3103 template <> 3104 struct Recip_SIMD<schar> 3105 { 3106 bool haveSIMD; 3107 Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } 3108 3109 int operator() (const schar * src2, schar * dst, int width, double scale) const 3110 { 3111 int x = 0; 3112 3113 if (!haveSIMD) 3114 return x; 3115 3116 v_float32x4 v_scale = v_setall_f32((float)scale); 3117 v_int16x8 v_zero = v_setzero_s16(); 3118 3119 for ( ; x <= width - 8; x += 8) 3120 { 3121 v_int16x8 v_src2 = v_load_expand(src2 + x); 3122 3123 v_int32x4 t0, t1; 3124 v_expand(v_src2, t0, t1); 3125 3126 v_float32x4 f0 = v_cvt_f32(t0); 3127 v_float32x4 f1 = v_cvt_f32(t1); 3128 3129 f0 = v_scale / f0; 3130 f1 = v_scale / f1; 3131 3132 v_int32x4 i0 = v_round(f0), i1 = v_round(f1); 3133 v_int16x8 res = v_pack(i0, i1); 3134 3135 res = v_select(v_src2 == v_zero, v_zero, res); 3136 v_pack_store(dst + x, res); 3137 } 3138 3139 return x; 3140 } 3141 }; 3142 3143 3144 template <> 3145 struct Recip_SIMD<ushort> 3146 { 3147 bool 
haveSIMD; 3148 Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } 3149 3150 int operator() (const ushort * src2, ushort * dst, int width, double scale) const 3151 { 3152 int x = 0; 3153 3154 if (!haveSIMD) 3155 return x; 3156 3157 v_float32x4 v_scale = v_setall_f32((float)scale); 3158 v_uint16x8 v_zero = v_setzero_u16(); 3159 3160 for ( ; x <= width - 8; x += 8) 3161 { 3162 v_uint16x8 v_src2 = v_load(src2 + x); 3163 3164 v_uint32x4 t0, t1; 3165 v_expand(v_src2, t0, t1); 3166 3167 v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0)); 3168 v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1)); 3169 3170 f0 = v_scale / f0; 3171 f1 = v_scale / f1; 3172 3173 v_int32x4 i0 = v_round(f0), i1 = v_round(f1); 3174 v_uint16x8 res = v_pack_u(i0, i1); 3175 3176 res = v_select(v_src2 == v_zero, v_zero, res); 3177 v_store(dst + x, res); 3178 } 3179 3180 return x; 3181 } 3182 }; 3183 3184 template <> 3185 struct Recip_SIMD<short> 3186 { 3187 bool haveSIMD; 3188 Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } 3189 3190 int operator() (const short * src2, short * dst, int width, double scale) const 3191 { 3192 int x = 0; 3193 3194 if (!haveSIMD) 3195 return x; 3196 3197 v_float32x4 v_scale = v_setall_f32((float)scale); 3198 v_int16x8 v_zero = v_setzero_s16(); 3199 3200 for ( ; x <= width - 8; x += 8) 3201 { 3202 v_int16x8 v_src2 = v_load(src2 + x); 3203 3204 v_int32x4 t0, t1; 3205 v_expand(v_src2, t0, t1); 3206 3207 v_float32x4 f0 = v_cvt_f32(t0); 3208 v_float32x4 f1 = v_cvt_f32(t1); 3209 3210 f0 = v_scale / f0; 3211 f1 = v_scale / f1; 3212 3213 v_int32x4 i0 = v_round(f0), i1 = v_round(f1); 3214 v_int16x8 res = v_pack(i0, i1); 3215 3216 res = v_select(v_src2 == v_zero, v_zero, res); 3217 v_store(dst + x, res); 3218 } 3219 3220 return x; 3221 } 3222 }; 3223 3224 template <> 3225 struct Recip_SIMD<int> 3226 { 3227 bool haveSIMD; 3228 Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } 3229 3230 int operator() (const int * src2, int * dst, int width, double scale) const 3231 { 3232 int x = 0; 3233 3234 if (!haveSIMD) 3235 return x; 3236 3237 v_float32x4 v_scale = v_setall_f32((float)scale); 3238 v_int32x4 v_zero = v_setzero_s32(); 3239 3240 for ( ; x <= width - 8; x += 8) 3241 { 3242 v_int32x4 t0 = v_load(src2 + x); 3243 v_int32x4 t1 = v_load(src2 + x + 4); 3244 3245 v_float32x4 f0 = v_cvt_f32(t0); 3246 v_float32x4 f1 = v_cvt_f32(t1); 3247 3248 f0 = v_scale / f0; 3249 f1 = v_scale / f1; 3250 3251 v_int32x4 res0 = v_round(f0), res1 = v_round(f1); 3252 3253 res0 = v_select(t0 == v_zero, v_zero, res0); 3254 res1 = v_select(t1 == v_zero, v_zero, res1); 3255 v_store(dst + x, res0); 3256 v_store(dst + x + 4, res1); 3257 } 3258 3259 return x; 3260 } 3261 }; 3262 3263 3264 template <> 3265 struct Recip_SIMD<float> 3266 { 3267 bool haveSIMD; 3268 Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } 3269 3270 int operator() (const float * src2, float * dst, int width, double scale) const 3271 { 3272 int x = 0; 3273 3274 if (!haveSIMD) 3275 return x; 3276 3277 v_float32x4 v_scale = v_setall_f32((float)scale); 3278 v_float32x4 v_zero = v_setzero_f32(); 3279 3280 for ( ; x <= width - 8; x += 8) 3281 { 3282 v_float32x4 f0 = v_load(src2 + x); 3283 v_float32x4 f1 = v_load(src2 + x + 4); 3284 3285 v_float32x4 res0 = v_scale / f0; 3286 v_float32x4 res1 = v_scale / f1; 3287 3288 res0 = v_select(f0 == v_zero, v_zero, res0); 3289 res1 = 
v_select(f1 == v_zero, v_zero, res1); 3290 3291 v_store(dst + x, res0); 3292 v_store(dst + x + 4, res1); 3293 } 3294 3295 return x; 3296 } 3297 }; 3298 3299 #if CV_SIMD128_64F 3300 3301 template <> 3302 struct Div_SIMD<double> 3303 { 3304 bool haveSIMD; 3305 Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } 3306 3307 int operator() (const double * src1, const double * src2, double * dst, int width, double scale) const 3308 { 3309 int x = 0; 3310 3311 if (!haveSIMD) 3312 return x; 3313 3314 v_float64x2 v_scale = v_setall_f64(scale); 3315 v_float64x2 v_zero = v_setzero_f64(); 3316 3317 for ( ; x <= width - 4; x += 4) 3318 { 3319 v_float64x2 f0 = v_load(src1 + x); 3320 v_float64x2 f1 = v_load(src1 + x + 2); 3321 v_float64x2 f2 = v_load(src2 + x); 3322 v_float64x2 f3 = v_load(src2 + x + 2); 3323 3324 v_float64x2 res0 = f0 * v_scale / f2; 3325 v_float64x2 res1 = f1 * v_scale / f3; 3326 3327 res0 = v_select(f2 == v_zero, v_zero, res0); 3328 res1 = v_select(f3 == v_zero, v_zero, res1); 3329 3330 v_store(dst + x, res0); 3331 v_store(dst + x + 2, res1); 3332 } 3333 3334 return x; 3335 } 3336 }; 3337 3338 template <> 3339 struct Recip_SIMD<double> 3340 { 3341 bool haveSIMD; 3342 Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } 3343 3344 int operator() (const double * src2, double * dst, int width, double scale) const 3345 { 3346 int x = 0; 3347 3348 if (!haveSIMD) 3349 return x; 3350 3351 v_float64x2 v_scale = v_setall_f64(scale); 3352 v_float64x2 v_zero = v_setzero_f64(); 3353 3354 for ( ; x <= width - 4; x += 4) 3355 { 3356 v_float64x2 f0 = v_load(src2 + x); 3357 v_float64x2 f1 = v_load(src2 + x + 2); 3358 3359 v_float64x2 res0 = v_scale / f0; 3360 v_float64x2 res1 = v_scale / f1; 3361 3362 res0 = v_select(f0 == v_zero, v_zero, res0); 3363 res1 = v_select(f1 == v_zero, v_zero, res1); 3364 3365 v_store(dst + x, res0); 3366 v_store(dst + x + 2, res1); 3367 } 3368 3369 return x; 3370 } 3371 }; 3372 3373 #endif 3374 3375 #endif 3376 3377 template<typename T> static void 3378 div_i( const T* src1, size_t step1, const T* src2, size_t step2, 3379 T* dst, size_t step, Size size, double scale ) 3380 { 3381 step1 /= sizeof(src1[0]); 3382 step2 /= sizeof(src2[0]); 3383 step /= sizeof(dst[0]); 3384 3385 Div_SIMD<T> vop; 3386 float scale_f = (float)scale; 3387 3388 for( ; size.height--; src1 += step1, src2 += step2, dst += step ) 3389 { 3390 int i = vop(src1, src2, dst, size.width, scale); 3391 for( ; i < size.width; i++ ) 3392 { 3393 T num = src1[i], denom = src2[i]; 3394 dst[i] = denom != 0 ? saturate_cast<T>(num*scale_f/denom) : (T)0; 3395 } 3396 } 3397 } 3398 3399 template<typename T> static void 3400 div_f( const T* src1, size_t step1, const T* src2, size_t step2, 3401 T* dst, size_t step, Size size, double scale ) 3402 { 3403 T scale_f = (T)scale; 3404 step1 /= sizeof(src1[0]); 3405 step2 /= sizeof(src2[0]); 3406 step /= sizeof(dst[0]); 3407 3408 Div_SIMD<T> vop; 3409 3410 for( ; size.height--; src1 += step1, src2 += step2, dst += step ) 3411 { 3412 int i = vop(src1, src2, dst, size.width, scale); 3413 for( ; i < size.width; i++ ) 3414 { 3415 T num = src1[i], denom = src2[i]; 3416 dst[i] = denom != 0 ?
saturate_cast<T>(num*scale_f/denom) : (T)0; 3417 } 3418 } 3419 } 3420 3421 template<typename T> static void 3422 recip_i( const T*, size_t, const T* src2, size_t step2, 3423 T* dst, size_t step, Size size, double scale ) 3424 { 3425 step2 /= sizeof(src2[0]); 3426 step /= sizeof(dst[0]); 3427 3428 Recip_SIMD<T> vop; 3429 float scale_f = (float)scale; 3430 3431 for( ; size.height--; src2 += step2, dst += step ) 3432 { 3433 int i = vop(src2, dst, size.width, scale); 3434 for( ; i < size.width; i++ ) 3435 { 3436 T denom = src2[i]; 3437 dst[i] = denom != 0 ? saturate_cast<T>(scale_f/denom) : (T)0; 3438 } 3439 } 3440 } 3441 3442 template<typename T> static void 3443 recip_f( const T*, size_t, const T* src2, size_t step2, 3444 T* dst, size_t step, Size size, double scale ) 3445 { 3446 T scale_f = (T)scale; 3447 step2 /= sizeof(src2[0]); 3448 step /= sizeof(dst[0]); 3449 3450 Recip_SIMD<T> vop; 3451 3452 for( ; size.height--; src2 += step2, dst += step ) 3453 { 3454 int i = vop(src2, dst, size.width, scale); 3455 for( ; i < size.width; i++ ) 3456 { 3457 T denom = src2[i]; 3458 dst[i] = denom != 0 ? saturate_cast<T>(scale_f/denom) : (T)0; 3459 } 3460 } 3461 } 3462 3463 3464 static void mul8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, 3465 uchar* dst, size_t step, Size sz, void* scale) 3466 { 3467 float fscale = (float)*(const double*)scale; 3468 #if defined HAVE_IPP 3469 CV_IPP_CHECK() 3470 { 3471 if (std::fabs(fscale - 1) <= FLT_EPSILON) 3472 { 3473 if (ippiMul_8u_C1RSfs(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz), 0) >= 0) 3474 { 3475 CV_IMPL_ADD(CV_IMPL_IPP); 3476 return; 3477 } 3478 setIppErrorStatus(); 3479 } 3480 } 3481 #endif 3482 mul_(src1, step1, src2, step2, dst, step, sz, fscale); 3483 } 3484 3485 static void mul8s( const schar* src1, size_t step1, const schar* src2, size_t step2, 3486 schar* dst, size_t step, Size sz, void* scale) 3487 { 3488 mul_(src1, step1, src2, step2, dst, step, sz, (float)*(const double*)scale); 3489 } 3490 3491 static void mul16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, 3492 ushort* dst, size_t step, Size sz, void* scale) 3493 { 3494 float fscale = (float)*(const double*)scale; 3495 #if defined HAVE_IPP 3496 CV_IPP_CHECK() 3497 { 3498 if (std::fabs(fscale - 1) <= FLT_EPSILON) 3499 { 3500 if (ippiMul_16u_C1RSfs(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz), 0) >= 0) 3501 { 3502 CV_IMPL_ADD(CV_IMPL_IPP); 3503 return; 3504 } 3505 setIppErrorStatus(); 3506 } 3507 } 3508 #endif 3509 mul_(src1, step1, src2, step2, dst, step, sz, fscale); 3510 } 3511 3512 static void mul16s( const short* src1, size_t step1, const short* src2, size_t step2, 3513 short* dst, size_t step, Size sz, void* scale) 3514 { 3515 float fscale = (float)*(const double*)scale; 3516 #if defined HAVE_IPP 3517 CV_IPP_CHECK() 3518 { 3519 if (std::fabs(fscale - 1) <= FLT_EPSILON) 3520 { 3521 if (ippiMul_16s_C1RSfs(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz), 0) >= 0) 3522 { 3523 CV_IMPL_ADD(CV_IMPL_IPP); 3524 return; 3525 } 3526 setIppErrorStatus(); 3527 } 3528 } 3529 #endif 3530 mul_(src1, step1, src2, step2, dst, step, sz, fscale); 3531 } 3532 3533 static void mul32s( const int* src1, size_t step1, const int* src2, size_t step2, 3534 int* dst, size_t step, Size sz, void* scale) 3535 { 3536 mul_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); 3537 } 3538 3539 static void mul32f( const float* src1, size_t step1, const float* src2, size_t step2, 3540 float* dst, size_t step, 
Size sz, void* scale) 3541 { 3542 float fscale = (float)*(const double*)scale; 3543 #if defined HAVE_IPP 3544 CV_IPP_CHECK() 3545 { 3546 if (std::fabs(fscale - 1) <= FLT_EPSILON) 3547 { 3548 if (ippiMul_32f_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz)) >= 0) 3549 { 3550 CV_IMPL_ADD(CV_IMPL_IPP); 3551 return; 3552 } 3553 setIppErrorStatus(); 3554 } 3555 } 3556 #endif 3557 mul_(src1, step1, src2, step2, dst, step, sz, fscale); 3558 } 3559 3560 static void mul64f( const double* src1, size_t step1, const double* src2, size_t step2, 3561 double* dst, size_t step, Size sz, void* scale) 3562 { 3563 mul_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); 3564 } 3565 3566 static void div8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, 3567 uchar* dst, size_t step, Size sz, void* scale) 3568 { 3569 if( src1 ) 3570 div_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); 3571 else 3572 recip_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); 3573 } 3574 3575 static void div8s( const schar* src1, size_t step1, const schar* src2, size_t step2, 3576 schar* dst, size_t step, Size sz, void* scale) 3577 { 3578 div_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); 3579 } 3580 3581 static void div16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, 3582 ushort* dst, size_t step, Size sz, void* scale) 3583 { 3584 div_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); 3585 } 3586 3587 static void div16s( const short* src1, size_t step1, const short* src2, size_t step2, 3588 short* dst, size_t step, Size sz, void* scale) 3589 { 3590 div_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); 3591 } 3592 3593 static void div32s( const int* src1, size_t step1, const int* src2, size_t step2, 3594 int* dst, size_t step, Size sz, void* scale) 3595 { 3596 div_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); 3597 } 3598 3599 static void div32f( const float* src1, size_t step1, const float* src2, size_t step2, 3600 float* dst, size_t step, Size sz, void* scale) 3601 { 3602 div_f(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); 3603 } 3604 3605 static void div64f( const double* src1, size_t step1, const double* src2, size_t step2, 3606 double* dst, size_t step, Size sz, void* scale) 3607 { 3608 div_f(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); 3609 } 3610 3611 static void recip8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, 3612 uchar* dst, size_t step, Size sz, void* scale) 3613 { 3614 recip_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); 3615 } 3616 3617 static void recip8s( const schar* src1, size_t step1, const schar* src2, size_t step2, 3618 schar* dst, size_t step, Size sz, void* scale) 3619 { 3620 recip_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); 3621 } 3622 3623 static void recip16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, 3624 ushort* dst, size_t step, Size sz, void* scale) 3625 { 3626 recip_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); 3627 } 3628 3629 static void recip16s( const short* src1, size_t step1, const short* src2, size_t step2, 3630 short* dst, size_t step, Size sz, void* scale) 3631 { 3632 recip_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); 3633 } 3634 3635 static void recip32s( const int* src1, size_t step1, const int* src2, size_t step2, 3636 int* dst, size_t 
step, Size sz, void* scale) 3637 { 3638 recip_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); 3639 } 3640 3641 static void recip32f( const float* src1, size_t step1, const float* src2, size_t step2, 3642 float* dst, size_t step, Size sz, void* scale) 3643 { 3644 recip_f(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); 3645 } 3646 3647 static void recip64f( const double* src1, size_t step1, const double* src2, size_t step2, 3648 double* dst, size_t step, Size sz, void* scale) 3649 { 3650 recip_f(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); 3651 } 3652 3653 3654 static BinaryFunc* getMulTab() 3655 { 3656 static BinaryFunc mulTab[] = 3657 { 3658 (BinaryFunc)mul8u, (BinaryFunc)mul8s, (BinaryFunc)mul16u, 3659 (BinaryFunc)mul16s, (BinaryFunc)mul32s, (BinaryFunc)mul32f, 3660 (BinaryFunc)mul64f, 0 3661 }; 3662 3663 return mulTab; 3664 } 3665 3666 static BinaryFunc* getDivTab() 3667 { 3668 static BinaryFunc divTab[] = 3669 { 3670 (BinaryFunc)div8u, (BinaryFunc)div8s, (BinaryFunc)div16u, 3671 (BinaryFunc)div16s, (BinaryFunc)div32s, (BinaryFunc)div32f, 3672 (BinaryFunc)div64f, 0 3673 }; 3674 3675 return divTab; 3676 } 3677 3678 static BinaryFunc* getRecipTab() 3679 { 3680 static BinaryFunc recipTab[] = 3681 { 3682 (BinaryFunc)recip8u, (BinaryFunc)recip8s, (BinaryFunc)recip16u, 3683 (BinaryFunc)recip16s, (BinaryFunc)recip32s, (BinaryFunc)recip32f, 3684 (BinaryFunc)recip64f, 0 3685 }; 3686 3687 return recipTab; 3688 } 3689 3690 } 3691 3692 void cv::multiply(InputArray src1, InputArray src2, 3693 OutputArray dst, double scale, int dtype) 3694 { 3695 arithm_op(src1, src2, dst, noArray(), dtype, getMulTab(), 3696 true, &scale, std::abs(scale - 1.0) < DBL_EPSILON ? OCL_OP_MUL : OCL_OP_MUL_SCALE); 3697 } 3698 3699 void cv::divide(InputArray src1, InputArray src2, 3700 OutputArray dst, double scale, int dtype) 3701 { 3702 arithm_op(src1, src2, dst, noArray(), dtype, getDivTab(), true, &scale, OCL_OP_DIV_SCALE); 3703 } 3704 3705 void cv::divide(double scale, InputArray src2, 3706 OutputArray dst, int dtype) 3707 { 3708 arithm_op(src2, src2, dst, noArray(), dtype, getRecipTab(), true, &scale, OCL_OP_RECIP_SCALE); 3709 } 3710 3711 /****************************************************************************************\ 3712 * addWeighted * 3713 \****************************************************************************************/ 3714 3715 namespace cv 3716 { 3717 3718 template <typename T, typename WT> 3719 struct AddWeighted_SIMD 3720 { 3721 int operator() (const T *, const T *, T *, int, WT, WT, WT) const 3722 { 3723 return 0; 3724 } 3725 }; 3726 3727 #if CV_SSE2 3728 3729 template <> 3730 struct AddWeighted_SIMD<schar, float> 3731 { 3732 AddWeighted_SIMD() 3733 { 3734 haveSSE2 = checkHardwareSupport(CV_CPU_SSE2); 3735 } 3736 3737 int operator() (const schar * src1, const schar * src2, schar * dst, int width, float alpha, float beta, float gamma) const 3738 { 3739 int x = 0; 3740 3741 if (!haveSSE2) 3742 return x; 3743 3744 __m128i v_zero = _mm_setzero_si128(); 3745 __m128 v_alpha = _mm_set1_ps(alpha), v_beta = _mm_set1_ps(beta), 3746 v_gamma = _mm_set1_ps(gamma); 3747 3748 for( ; x <= width - 8; x += 8 ) 3749 { 3750 __m128i v_src1 = _mm_loadl_epi64((const __m128i *)(src1 + x)); 3751 __m128i v_src2 = _mm_loadl_epi64((const __m128i *)(src2 + x)); 3752 3753 __m128i v_src1_p = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src1), 8); 3754 __m128i v_src2_p = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src2), 8); 3755 3756 __m128 v_dstf0 = 
_mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1_p), 16)), v_alpha); 3757 v_dstf0 = _mm_add_ps(_mm_add_ps(v_dstf0, v_gamma), 3758 _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2_p), 16)), v_beta)); 3759 3760 __m128 v_dstf1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1_p), 16)), v_alpha); 3761 v_dstf1 = _mm_add_ps(_mm_add_ps(v_dstf1, v_gamma), 3762 _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2_p), 16)), v_beta)); 3763 3764 __m128i v_dst16 = _mm_packs_epi32(_mm_cvtps_epi32(v_dstf0), 3765 _mm_cvtps_epi32(v_dstf1)); 3766 3767 _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst16, v_zero)); 3768 } 3769 3770 return x; 3771 } 3772 3773 bool haveSSE2; 3774 }; 3775 3776 template <> 3777 struct AddWeighted_SIMD<short, float> 3778 { 3779 AddWeighted_SIMD() 3780 { 3781 haveSSE2 = checkHardwareSupport(CV_CPU_SSE2); 3782 } 3783 3784 int operator() (const short * src1, const short * src2, short * dst, int width, float alpha, float beta, float gamma) const 3785 { 3786 int x = 0; 3787 3788 if (!haveSSE2) 3789 return x; 3790 3791 __m128i v_zero = _mm_setzero_si128(); 3792 __m128 v_alpha = _mm_set1_ps(alpha), v_beta = _mm_set1_ps(beta), 3793 v_gamma = _mm_set1_ps(gamma); 3794 3795 for( ; x <= width - 8; x += 8 ) 3796 { 3797 __m128i v_src1 = _mm_loadu_si128((const __m128i *)(src1 + x)); 3798 __m128i v_src2 = _mm_loadu_si128((const __m128i *)(src2 + x)); 3799 3800 __m128 v_dstf0 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16)), v_alpha); 3801 v_dstf0 = _mm_add_ps(_mm_add_ps(v_dstf0, v_gamma), 3802 _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16)), v_beta)); 3803 3804 __m128 v_dstf1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16)), v_alpha); 3805 v_dstf1 = _mm_add_ps(_mm_add_ps(v_dstf1, v_gamma), 3806 _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16)), v_beta)); 3807 3808 _mm_storeu_si128((__m128i *)(dst + x), _mm_packs_epi32(_mm_cvtps_epi32(v_dstf0), 3809 _mm_cvtps_epi32(v_dstf1))); 3810 } 3811 3812 return x; 3813 } 3814 3815 bool haveSSE2; 3816 }; 3817 3818 #if CV_SSE4_1 3819 3820 template <> 3821 struct AddWeighted_SIMD<ushort, float> 3822 { 3823 AddWeighted_SIMD() 3824 { 3825 haveSSE4_1 = checkHardwareSupport(CV_CPU_SSE4_1); 3826 } 3827 3828 int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, float alpha, float beta, float gamma) const 3829 { 3830 int x = 0; 3831 3832 if (!haveSSE4_1) 3833 return x; 3834 3835 __m128i v_zero = _mm_setzero_si128(); 3836 __m128 v_alpha = _mm_set1_ps(alpha), v_beta = _mm_set1_ps(beta), 3837 v_gamma = _mm_set1_ps(gamma); 3838 3839 for( ; x <= width - 8; x += 8 ) 3840 { 3841 __m128i v_src1 = _mm_loadu_si128((const __m128i *)(src1 + x)); 3842 __m128i v_src2 = _mm_loadu_si128((const __m128i *)(src2 + x)); 3843 3844 __m128 v_dstf0 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src1, v_zero)), v_alpha); 3845 v_dstf0 = _mm_add_ps(_mm_add_ps(v_dstf0, v_gamma), 3846 _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src2, v_zero)), v_beta)); 3847 3848 __m128 v_dstf1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src1, v_zero)), v_alpha); 3849 v_dstf1 = _mm_add_ps(_mm_add_ps(v_dstf1, v_gamma), 3850 _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src2, v_zero)), v_beta)); 3851 3852 _mm_storeu_si128((__m128i *)(dst + x), _mm_packus_epi32(_mm_cvtps_epi32(v_dstf0), 3853 _mm_cvtps_epi32(v_dstf1))); 
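                // Note: _mm_packus_epi32 (unsigned saturating 32->16 pack) is an SSE4.1
                // instruction, which is why this ushort specialization is guarded by
                // CV_SSE4_1 while the schar/short variants above only require SSE2.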
3854 } 3855 3856 return x; 3857 } 3858 3859 bool haveSSE4_1; 3860 }; 3861 3862 #endif 3863 3864 #elif CV_NEON 3865 3866 template <> 3867 struct AddWeighted_SIMD<schar, float> 3868 { 3869 int operator() (const schar * src1, const schar * src2, schar * dst, int width, float alpha, float beta, float gamma) const 3870 { 3871 int x = 0; 3872 3873 float32x4_t g = vdupq_n_f32 (gamma); 3874 3875 for( ; x <= width - 8; x += 8 ) 3876 { 3877 int8x8_t in1 = vld1_s8(src1 + x); 3878 int16x8_t in1_16 = vmovl_s8(in1); 3879 float32x4_t in1_f_l = vcvtq_f32_s32(vmovl_s16(vget_low_s16(in1_16))); 3880 float32x4_t in1_f_h = vcvtq_f32_s32(vmovl_s16(vget_high_s16(in1_16))); 3881 3882 int8x8_t in2 = vld1_s8(src2+x); 3883 int16x8_t in2_16 = vmovl_s8(in2); 3884 float32x4_t in2_f_l = vcvtq_f32_s32(vmovl_s16(vget_low_s16(in2_16))); 3885 float32x4_t in2_f_h = vcvtq_f32_s32(vmovl_s16(vget_high_s16(in2_16))); 3886 3887 float32x4_t out_f_l = vaddq_f32(vmulq_n_f32(in1_f_l, alpha), vmulq_n_f32(in2_f_l, beta)); 3888 float32x4_t out_f_h = vaddq_f32(vmulq_n_f32(in1_f_h, alpha), vmulq_n_f32(in2_f_h, beta)); 3889 out_f_l = vaddq_f32(out_f_l, g); 3890 out_f_h = vaddq_f32(out_f_h, g); 3891 3892 int16x4_t out_16_l = vqmovn_s32(cv_vrndq_s32_f32(out_f_l)); 3893 int16x4_t out_16_h = vqmovn_s32(cv_vrndq_s32_f32(out_f_h)); 3894 3895 int16x8_t out_16 = vcombine_s16(out_16_l, out_16_h); 3896 int8x8_t out = vqmovn_s16(out_16); 3897 3898 vst1_s8(dst + x, out); 3899 } 3900 3901 return x; 3902 } 3903 }; 3904 3905 template <> 3906 struct AddWeighted_SIMD<ushort, float> 3907 { 3908 int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, float alpha, float beta, float gamma) const 3909 { 3910 int x = 0; 3911 3912 float32x4_t g = vdupq_n_f32(gamma); 3913 3914 for( ; x <= width - 8; x += 8 ) 3915 { 3916 uint16x8_t v_src1 = vld1q_u16(src1 + x), v_src2 = vld1q_u16(src2 + x); 3917 3918 float32x4_t v_s1 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), alpha); 3919 float32x4_t v_s2 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2))), beta); 3920 uint16x4_t v_dst1 = vqmovn_u32(cv_vrndq_u32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g))); 3921 3922 v_s1 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), alpha); 3923 v_s2 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2))), beta); 3924 uint16x4_t v_dst2 = vqmovn_u32(cv_vrndq_u32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g))); 3925 3926 vst1q_u16(dst + x, vcombine_u16(v_dst1, v_dst2)); 3927 } 3928 3929 return x; 3930 } 3931 }; 3932 3933 template <> 3934 struct AddWeighted_SIMD<short, float> 3935 { 3936 int operator() (const short * src1, const short * src2, short * dst, int width, float alpha, float beta, float gamma) const 3937 { 3938 int x = 0; 3939 3940 float32x4_t g = vdupq_n_f32(gamma); 3941 3942 for( ; x <= width - 8; x += 8 ) 3943 { 3944 int16x8_t v_src1 = vld1q_s16(src1 + x), v_src2 = vld1q_s16(src2 + x); 3945 3946 float32x4_t v_s1 = vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))), alpha); 3947 float32x4_t v_s2 = vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2))), beta); 3948 int16x4_t v_dst1 = vqmovn_s32(cv_vrndq_s32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g))); 3949 3950 v_s1 = vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))), alpha); 3951 v_s2 = vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2))), beta); 3952 int16x4_t v_dst2 = vqmovn_s32(cv_vrndq_s32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g))); 3953 3954 vst1q_s16(dst + x, vcombine_s16(v_dst1, v_dst2)); 3955 } 3956 3957 return x; 3958 } 3959 }; 3960 
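// Each AddWeighted_SIMD specialization above processes as many elements as fit in
// whole vectors and returns the index of the first element it did NOT handle; the
// generic addWeighted_() template below finishes the remaining tail with the scalar
// saturate_cast<> loop.
//
// Illustrative use of the public wrapper built on top of these kernels (a minimal
// sketch, not part of this file's code; `a` and `b` are assumed to be same-size,
// same-type input Mats):
//     cv::Mat a, b, blended;
//     cv::addWeighted(a, 0.7, b, 0.3, /*gamma=*/0.0, blended);  // blended = 0.7*a + 0.3*b + 0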
3961 #endif 3962 3963 template<typename T, typename WT> static void 3964 addWeighted_( const T* src1, size_t step1, const T* src2, size_t step2, 3965 T* dst, size_t step, Size size, void* _scalars ) 3966 { 3967 const double* scalars = (const double*)_scalars; 3968 WT alpha = (WT)scalars[0], beta = (WT)scalars[1], gamma = (WT)scalars[2]; 3969 step1 /= sizeof(src1[0]); 3970 step2 /= sizeof(src2[0]); 3971 step /= sizeof(dst[0]); 3972 3973 AddWeighted_SIMD<T, WT> vop; 3974 3975 for( ; size.height--; src1 += step1, src2 += step2, dst += step ) 3976 { 3977 int x = vop(src1, src2, dst, size.width, alpha, beta, gamma); 3978 #if CV_ENABLE_UNROLLED 3979 for( ; x <= size.width - 4; x += 4 ) 3980 { 3981 T t0 = saturate_cast<T>(src1[x]*alpha + src2[x]*beta + gamma); 3982 T t1 = saturate_cast<T>(src1[x+1]*alpha + src2[x+1]*beta + gamma); 3983 dst[x] = t0; dst[x+1] = t1; 3984 3985 t0 = saturate_cast<T>(src1[x+2]*alpha + src2[x+2]*beta + gamma); 3986 t1 = saturate_cast<T>(src1[x+3]*alpha + src2[x+3]*beta + gamma); 3987 dst[x+2] = t0; dst[x+3] = t1; 3988 } 3989 #endif 3990 for( ; x < size.width; x++ ) 3991 dst[x] = saturate_cast<T>(src1[x]*alpha + src2[x]*beta + gamma); 3992 } 3993 } 3994 3995 3996 static void 3997 addWeighted8u( const uchar* src1, size_t step1, 3998 const uchar* src2, size_t step2, 3999 uchar* dst, size_t step, Size size, 4000 void* _scalars ) 4001 { 4002 const double* scalars = (const double*)_scalars; 4003 float alpha = (float)scalars[0], beta = (float)scalars[1], gamma = (float)scalars[2]; 4004 4005 for( ; size.height--; src1 += step1, src2 += step2, dst += step ) 4006 { 4007 int x = 0; 4008 4009 #if CV_SSE2 4010 if( USE_SSE2 ) 4011 { 4012 __m128 a4 = _mm_set1_ps(alpha), b4 = _mm_set1_ps(beta), g4 = _mm_set1_ps(gamma); 4013 __m128i z = _mm_setzero_si128(); 4014 4015 for( ; x <= size.width - 8; x += 8 ) 4016 { 4017 __m128i u = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(src1 + x)), z); 4018 __m128i v = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(src2 + x)), z); 4019 4020 __m128 u0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(u, z)); 4021 __m128 u1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(u, z)); 4022 __m128 v0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v, z)); 4023 __m128 v1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v, z)); 4024 4025 u0 = _mm_add_ps(_mm_mul_ps(u0, a4), _mm_mul_ps(v0, b4)); 4026 u1 = _mm_add_ps(_mm_mul_ps(u1, a4), _mm_mul_ps(v1, b4)); 4027 u0 = _mm_add_ps(u0, g4); u1 = _mm_add_ps(u1, g4); 4028 4029 u = _mm_packs_epi32(_mm_cvtps_epi32(u0), _mm_cvtps_epi32(u1)); 4030 u = _mm_packus_epi16(u, u); 4031 4032 _mm_storel_epi64((__m128i*)(dst + x), u); 4033 } 4034 } 4035 #elif CV_NEON 4036 float32x4_t g = vdupq_n_f32 (gamma); 4037 4038 for( ; x <= size.width - 8; x += 8 ) 4039 { 4040 uint8x8_t in1 = vld1_u8(src1+x); 4041 uint16x8_t in1_16 = vmovl_u8(in1); 4042 float32x4_t in1_f_l = vcvtq_f32_u32(vmovl_u16(vget_low_u16(in1_16))); 4043 float32x4_t in1_f_h = vcvtq_f32_u32(vmovl_u16(vget_high_u16(in1_16))); 4044 4045 uint8x8_t in2 = vld1_u8(src2+x); 4046 uint16x8_t in2_16 = vmovl_u8(in2); 4047 float32x4_t in2_f_l = vcvtq_f32_u32(vmovl_u16(vget_low_u16(in2_16))); 4048 float32x4_t in2_f_h = vcvtq_f32_u32(vmovl_u16(vget_high_u16(in2_16))); 4049 4050 float32x4_t out_f_l = vaddq_f32(vmulq_n_f32(in1_f_l, alpha), vmulq_n_f32(in2_f_l, beta)); 4051 float32x4_t out_f_h = vaddq_f32(vmulq_n_f32(in1_f_h, alpha), vmulq_n_f32(in2_f_h, beta)); 4052 out_f_l = vaddq_f32(out_f_l, g); 4053 out_f_h = vaddq_f32(out_f_h, g); 4054 4055 uint16x4_t out_16_l = vqmovun_s32(cv_vrndq_s32_f32(out_f_l)); 4056 uint16x4_t 
out_16_h = vqmovun_s32(cv_vrndq_s32_f32(out_f_h)); 4057 4058 uint16x8_t out_16 = vcombine_u16(out_16_l, out_16_h); 4059 uint8x8_t out = vqmovn_u16(out_16); 4060 4061 vst1_u8(dst+x, out); 4062 } 4063 #endif 4064 #if CV_ENABLE_UNROLLED 4065 for( ; x <= size.width - 4; x += 4 ) 4066 { 4067 float t0, t1; 4068 t0 = CV_8TO32F(src1[x])*alpha + CV_8TO32F(src2[x])*beta + gamma; 4069 t1 = CV_8TO32F(src1[x+1])*alpha + CV_8TO32F(src2[x+1])*beta + gamma; 4070 4071 dst[x] = saturate_cast<uchar>(t0); 4072 dst[x+1] = saturate_cast<uchar>(t1); 4073 4074 t0 = CV_8TO32F(src1[x+2])*alpha + CV_8TO32F(src2[x+2])*beta + gamma; 4075 t1 = CV_8TO32F(src1[x+3])*alpha + CV_8TO32F(src2[x+3])*beta + gamma; 4076 4077 dst[x+2] = saturate_cast<uchar>(t0); 4078 dst[x+3] = saturate_cast<uchar>(t1); 4079 } 4080 #endif 4081 4082 for( ; x < size.width; x++ ) 4083 { 4084 float t0 = CV_8TO32F(src1[x])*alpha + CV_8TO32F(src2[x])*beta + gamma; 4085 dst[x] = saturate_cast<uchar>(t0); 4086 } 4087 } 4088 } 4089 4090 static void addWeighted8s( const schar* src1, size_t step1, const schar* src2, size_t step2, 4091 schar* dst, size_t step, Size sz, void* scalars ) 4092 { 4093 addWeighted_<schar, float>(src1, step1, src2, step2, dst, step, sz, scalars); 4094 } 4095 4096 static void addWeighted16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, 4097 ushort* dst, size_t step, Size sz, void* scalars ) 4098 { 4099 addWeighted_<ushort, float>(src1, step1, src2, step2, dst, step, sz, scalars); 4100 } 4101 4102 static void addWeighted16s( const short* src1, size_t step1, const short* src2, size_t step2, 4103 short* dst, size_t step, Size sz, void* scalars ) 4104 { 4105 addWeighted_<short, float>(src1, step1, src2, step2, dst, step, sz, scalars); 4106 } 4107 4108 static void addWeighted32s( const int* src1, size_t step1, const int* src2, size_t step2, 4109 int* dst, size_t step, Size sz, void* scalars ) 4110 { 4111 addWeighted_<int, double>(src1, step1, src2, step2, dst, step, sz, scalars); 4112 } 4113 4114 static void addWeighted32f( const float* src1, size_t step1, const float* src2, size_t step2, 4115 float* dst, size_t step, Size sz, void* scalars ) 4116 { 4117 addWeighted_<float, double>(src1, step1, src2, step2, dst, step, sz, scalars); 4118 } 4119 4120 static void addWeighted64f( const double* src1, size_t step1, const double* src2, size_t step2, 4121 double* dst, size_t step, Size sz, void* scalars ) 4122 { 4123 addWeighted_<double, double>(src1, step1, src2, step2, dst, step, sz, scalars); 4124 } 4125 4126 static BinaryFunc* getAddWeightedTab() 4127 { 4128 static BinaryFunc addWeightedTab[] = 4129 { 4130 (BinaryFunc)GET_OPTIMIZED(addWeighted8u), (BinaryFunc)GET_OPTIMIZED(addWeighted8s), (BinaryFunc)GET_OPTIMIZED(addWeighted16u), 4131 (BinaryFunc)GET_OPTIMIZED(addWeighted16s), (BinaryFunc)GET_OPTIMIZED(addWeighted32s), (BinaryFunc)addWeighted32f, 4132 (BinaryFunc)addWeighted64f, 0 4133 }; 4134 4135 return addWeightedTab; 4136 } 4137 4138 } 4139 4140 void cv::addWeighted( InputArray src1, double alpha, InputArray src2, 4141 double beta, double gamma, OutputArray dst, int dtype ) 4142 { 4143 double scalars[] = {alpha, beta, gamma}; 4144 arithm_op(src1, src2, dst, noArray(), dtype, getAddWeightedTab(), true, scalars, OCL_OP_ADDW); 4145 } 4146 4147 4148 /****************************************************************************************\ 4149 * compare * 4150 \****************************************************************************************/ 4151 4152 namespace cv 4153 { 4154 4155 template <typename T> 4156 
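// Generic fallback for the per-type comparison kernels used by cv::compare(): its
// operator() reports that zero elements were vectorized, so cmp_() below does all of
// the work in its scalar/unrolled loops.  The NEON/SSE2 specializations that follow
// only implement CMP_GT, CMP_LE, CMP_EQ and CMP_NE, because cmp_() rewrites CMP_GE
// and CMP_LT by swapping the two operands first.
//
// Illustrative use of the public wrapper (a minimal sketch, not part of this file;
// `a` and `b` are assumed to be same-size, same-type Mats):
//     cv::Mat mask;
//     cv::compare(a, b, mask, cv::CMP_GT);   // mask = 255 where a > b, 0 elsewhere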
struct Cmp_SIMD 4157 { 4158 explicit Cmp_SIMD(int) 4159 { 4160 } 4161 4162 int operator () (const T *, const T *, uchar *, int) const 4163 { 4164 return 0; 4165 } 4166 }; 4167 4168 #if CV_NEON 4169 4170 template <> 4171 struct Cmp_SIMD<schar> 4172 { 4173 explicit Cmp_SIMD(int code_) : 4174 code(code_) 4175 { 4176 CV_Assert(code == CMP_GT || code == CMP_LE || 4177 code == CMP_EQ || code == CMP_NE); 4178 4179 v_mask = vdupq_n_u8(255); 4180 } 4181 4182 int operator () (const schar * src1, const schar * src2, uchar * dst, int width) const 4183 { 4184 int x = 0; 4185 4186 if (code == CMP_GT) 4187 for ( ; x <= width - 16; x += 16) 4188 vst1q_u8(dst + x, vcgtq_s8(vld1q_s8(src1 + x), vld1q_s8(src2 + x))); 4189 else if (code == CMP_LE) 4190 for ( ; x <= width - 16; x += 16) 4191 vst1q_u8(dst + x, vcleq_s8(vld1q_s8(src1 + x), vld1q_s8(src2 + x))); 4192 else if (code == CMP_EQ) 4193 for ( ; x <= width - 16; x += 16) 4194 vst1q_u8(dst + x, vceqq_s8(vld1q_s8(src1 + x), vld1q_s8(src2 + x))); 4195 else if (code == CMP_NE) 4196 for ( ; x <= width - 16; x += 16) 4197 vst1q_u8(dst + x, veorq_u8(vceqq_s8(vld1q_s8(src1 + x), vld1q_s8(src2 + x)), v_mask)); 4198 4199 return x; 4200 } 4201 4202 int code; 4203 uint8x16_t v_mask; 4204 }; 4205 4206 template <> 4207 struct Cmp_SIMD<ushort> 4208 { 4209 explicit Cmp_SIMD(int code_) : 4210 code(code_) 4211 { 4212 CV_Assert(code == CMP_GT || code == CMP_LE || 4213 code == CMP_EQ || code == CMP_NE); 4214 4215 v_mask = vdup_n_u8(255); 4216 } 4217 4218 int operator () (const ushort * src1, const ushort * src2, uchar * dst, int width) const 4219 { 4220 int x = 0; 4221 4222 if (code == CMP_GT) 4223 for ( ; x <= width - 8; x += 8) 4224 { 4225 uint16x8_t v_dst = vcgtq_u16(vld1q_u16(src1 + x), vld1q_u16(src2 + x)); 4226 vst1_u8(dst + x, vmovn_u16(v_dst)); 4227 } 4228 else if (code == CMP_LE) 4229 for ( ; x <= width - 8; x += 8) 4230 { 4231 uint16x8_t v_dst = vcleq_u16(vld1q_u16(src1 + x), vld1q_u16(src2 + x)); 4232 vst1_u8(dst + x, vmovn_u16(v_dst)); 4233 } 4234 else if (code == CMP_EQ) 4235 for ( ; x <= width - 8; x += 8) 4236 { 4237 uint16x8_t v_dst = vceqq_u16(vld1q_u16(src1 + x), vld1q_u16(src2 + x)); 4238 vst1_u8(dst + x, vmovn_u16(v_dst)); 4239 } 4240 else if (code == CMP_NE) 4241 for ( ; x <= width - 8; x += 8) 4242 { 4243 uint16x8_t v_dst = vceqq_u16(vld1q_u16(src1 + x), vld1q_u16(src2 + x)); 4244 vst1_u8(dst + x, veor_u8(vmovn_u16(v_dst), v_mask)); 4245 } 4246 4247 return x; 4248 } 4249 4250 int code; 4251 uint8x8_t v_mask; 4252 }; 4253 4254 template <> 4255 struct Cmp_SIMD<int> 4256 { 4257 explicit Cmp_SIMD(int code_) : 4258 code(code_) 4259 { 4260 CV_Assert(code == CMP_GT || code == CMP_LE || 4261 code == CMP_EQ || code == CMP_NE); 4262 4263 v_mask = vdup_n_u8(255); 4264 } 4265 4266 int operator () (const int * src1, const int * src2, uchar * dst, int width) const 4267 { 4268 int x = 0; 4269 4270 if (code == CMP_GT) 4271 for ( ; x <= width - 8; x += 8) 4272 { 4273 uint32x4_t v_dst1 = vcgtq_s32(vld1q_s32(src1 + x), vld1q_s32(src2 + x)); 4274 uint32x4_t v_dst2 = vcgtq_s32(vld1q_s32(src1 + x + 4), vld1q_s32(src2 + x + 4)); 4275 vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2)))); 4276 } 4277 else if (code == CMP_LE) 4278 for ( ; x <= width - 8; x += 8) 4279 { 4280 uint32x4_t v_dst1 = vcleq_s32(vld1q_s32(src1 + x), vld1q_s32(src2 + x)); 4281 uint32x4_t v_dst2 = vcleq_s32(vld1q_s32(src1 + x + 4), vld1q_s32(src2 + x + 4)); 4282 vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2)))); 4283 } 4284 else if (code == CMP_EQ) 
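            // CMP_EQ / CMP_NE: compare two groups of four ints with vceqq_s32, then
            // narrow the 32-bit lane masks to 16 and finally 8 bits to form the 0/255
            // result bytes (the CMP_NE branch additionally XORs with v_mask below).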
4285 for ( ; x <= width - 8; x += 8) 4286 { 4287 uint32x4_t v_dst1 = vceqq_s32(vld1q_s32(src1 + x), vld1q_s32(src2 + x)); 4288 uint32x4_t v_dst2 = vceqq_s32(vld1q_s32(src1 + x + 4), vld1q_s32(src2 + x + 4)); 4289 vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2)))); 4290 } 4291 else if (code == CMP_NE) 4292 for ( ; x <= width - 8; x += 8) 4293 { 4294 uint32x4_t v_dst1 = vceqq_s32(vld1q_s32(src1 + x), vld1q_s32(src2 + x)); 4295 uint32x4_t v_dst2 = vceqq_s32(vld1q_s32(src1 + x + 4), vld1q_s32(src2 + x + 4)); 4296 uint8x8_t v_dst = vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2))); 4297 vst1_u8(dst + x, veor_u8(v_dst, v_mask)); 4298 } 4299 4300 return x; 4301 } 4302 4303 int code; 4304 uint8x8_t v_mask; 4305 }; 4306 4307 template <> 4308 struct Cmp_SIMD<float> 4309 { 4310 explicit Cmp_SIMD(int code_) : 4311 code(code_) 4312 { 4313 CV_Assert(code == CMP_GT || code == CMP_LE || 4314 code == CMP_EQ || code == CMP_NE); 4315 4316 v_mask = vdup_n_u8(255); 4317 } 4318 4319 int operator () (const float * src1, const float * src2, uchar * dst, int width) const 4320 { 4321 int x = 0; 4322 4323 if (code == CMP_GT) 4324 for ( ; x <= width - 8; x += 8) 4325 { 4326 uint32x4_t v_dst1 = vcgtq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x)); 4327 uint32x4_t v_dst2 = vcgtq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4)); 4328 vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2)))); 4329 } 4330 else if (code == CMP_LE) 4331 for ( ; x <= width - 8; x += 8) 4332 { 4333 uint32x4_t v_dst1 = vcleq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x)); 4334 uint32x4_t v_dst2 = vcleq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4)); 4335 vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2)))); 4336 } 4337 else if (code == CMP_EQ) 4338 for ( ; x <= width - 8; x += 8) 4339 { 4340 uint32x4_t v_dst1 = vceqq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x)); 4341 uint32x4_t v_dst2 = vceqq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4)); 4342 vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2)))); 4343 } 4344 else if (code == CMP_NE) 4345 for ( ; x <= width - 8; x += 8) 4346 { 4347 uint32x4_t v_dst1 = vceqq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x)); 4348 uint32x4_t v_dst2 = vceqq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4)); 4349 uint8x8_t v_dst = vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2))); 4350 vst1_u8(dst + x, veor_u8(v_dst, v_mask)); 4351 } 4352 4353 return x; 4354 } 4355 4356 int code; 4357 uint8x8_t v_mask; 4358 }; 4359 4360 #elif CV_SSE2 4361 4362 template <> 4363 struct Cmp_SIMD<schar> 4364 { 4365 explicit Cmp_SIMD(int code_) : 4366 code(code_) 4367 { 4368 CV_Assert(code == CMP_GT || code == CMP_LE || 4369 code == CMP_EQ || code == CMP_NE); 4370 4371 haveSSE = checkHardwareSupport(CV_CPU_SSE2); 4372 4373 v_mask = _mm_set1_epi8(-1); 4374 } 4375 4376 int operator () (const schar * src1, const schar * src2, uchar * dst, int width) const 4377 { 4378 int x = 0; 4379 4380 if (!haveSSE) 4381 return x; 4382 4383 if (code == CMP_GT) 4384 for ( ; x <= width - 16; x += 16) 4385 _mm_storeu_si128((__m128i *)(dst + x), _mm_cmpgt_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)), 4386 _mm_loadu_si128((const __m128i *)(src2 + x)))); 4387 else if (code == CMP_LE) 4388 for ( ; x <= width - 16; x += 16) 4389 { 4390 __m128i v_gt = _mm_cmpgt_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)), 4391 _mm_loadu_si128((const __m128i *)(src2 + x))); 4392 _mm_storeu_si128((__m128i *)(dst + x), 
_mm_xor_si128(v_mask, v_gt)); 4393 } 4394 else if (code == CMP_EQ) 4395 for ( ; x <= width - 16; x += 16) 4396 _mm_storeu_si128((__m128i *)(dst + x), _mm_cmpeq_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)), 4397 _mm_loadu_si128((const __m128i *)(src2 + x)))); 4398 else if (code == CMP_NE) 4399 for ( ; x <= width - 16; x += 16) 4400 { 4401 __m128i v_eq = _mm_cmpeq_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)), 4402 _mm_loadu_si128((const __m128i *)(src2 + x))); 4403 _mm_storeu_si128((__m128i *)(dst + x), _mm_xor_si128(v_mask, v_eq)); 4404 } 4405 4406 return x; 4407 } 4408 4409 int code; 4410 __m128i v_mask; 4411 bool haveSSE; 4412 }; 4413 4414 template <> 4415 struct Cmp_SIMD<int> 4416 { 4417 explicit Cmp_SIMD(int code_) : 4418 code(code_) 4419 { 4420 CV_Assert(code == CMP_GT || code == CMP_LE || 4421 code == CMP_EQ || code == CMP_NE); 4422 4423 haveSSE = checkHardwareSupport(CV_CPU_SSE2); 4424 4425 v_mask = _mm_set1_epi32(0xffffffff); 4426 } 4427 4428 int operator () (const int * src1, const int * src2, uchar * dst, int width) const 4429 { 4430 int x = 0; 4431 4432 if (!haveSSE) 4433 return x; 4434 4435 if (code == CMP_GT) 4436 for ( ; x <= width - 8; x += 8) 4437 { 4438 __m128i v_dst0 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)), 4439 _mm_loadu_si128((const __m128i *)(src2 + x))); 4440 __m128i v_dst1 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)), 4441 _mm_loadu_si128((const __m128i *)(src2 + x + 4))); 4442 4443 _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask)); 4444 } 4445 else if (code == CMP_LE) 4446 for ( ; x <= width - 8; x += 8) 4447 { 4448 __m128i v_dst0 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)), 4449 _mm_loadu_si128((const __m128i *)(src2 + x))); 4450 __m128i v_dst1 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)), 4451 _mm_loadu_si128((const __m128i *)(src2 + x + 4))); 4452 4453 _mm_storel_epi64((__m128i *)(dst + x), _mm_xor_si128(_mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask), v_mask)); 4454 } 4455 else if (code == CMP_EQ) 4456 for ( ; x <= width - 8; x += 8) 4457 { 4458 __m128i v_dst0 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)), 4459 _mm_loadu_si128((const __m128i *)(src2 + x))); 4460 __m128i v_dst1 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)), 4461 _mm_loadu_si128((const __m128i *)(src2 + x + 4))); 4462 4463 _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask)); 4464 } 4465 else if (code == CMP_NE) 4466 for ( ; x <= width - 8; x += 8) 4467 { 4468 __m128i v_dst0 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)), 4469 _mm_loadu_si128((const __m128i *)(src2 + x))); 4470 __m128i v_dst1 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)), 4471 _mm_loadu_si128((const __m128i *)(src2 + x + 4))); 4472 4473 _mm_storel_epi64((__m128i *)(dst + x), _mm_xor_si128(v_mask, _mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask))); 4474 } 4475 4476 return x; 4477 } 4478 4479 int code; 4480 __m128i v_mask; 4481 bool haveSSE; 4482 }; 4483 4484 #endif 4485 4486 template<typename T> static void 4487 cmp_(const T* src1, size_t step1, const T* src2, size_t step2, 4488 uchar* dst, size_t step, Size size, int code) 4489 { 4490 step1 /= sizeof(src1[0]); 4491 step2 /= sizeof(src2[0]); 4492 if( code == CMP_GE || code == CMP_LT ) 4493 { 4494 std::swap(src1, src2); 4495 std::swap(step1, step2); 4496 code = code == CMP_GE ? 
CMP_LE : CMP_GT; 4497 } 4498 4499 Cmp_SIMD<T> vop(code); 4500 4501 if( code == CMP_GT || code == CMP_LE ) 4502 { 4503 int m = code == CMP_GT ? 0 : 255; 4504 for( ; size.height--; src1 += step1, src2 += step2, dst += step ) 4505 { 4506 int x = vop(src1, src2, dst, size.width); 4507 #if CV_ENABLE_UNROLLED 4508 for( ; x <= size.width - 4; x += 4 ) 4509 { 4510 int t0, t1; 4511 t0 = -(src1[x] > src2[x]) ^ m; 4512 t1 = -(src1[x+1] > src2[x+1]) ^ m; 4513 dst[x] = (uchar)t0; dst[x+1] = (uchar)t1; 4514 t0 = -(src1[x+2] > src2[x+2]) ^ m; 4515 t1 = -(src1[x+3] > src2[x+3]) ^ m; 4516 dst[x+2] = (uchar)t0; dst[x+3] = (uchar)t1; 4517 } 4518 #endif 4519 for( ; x < size.width; x++ ) 4520 dst[x] = (uchar)(-(src1[x] > src2[x]) ^ m); 4521 } 4522 } 4523 else if( code == CMP_EQ || code == CMP_NE ) 4524 { 4525 int m = code == CMP_EQ ? 0 : 255; 4526 for( ; size.height--; src1 += step1, src2 += step2, dst += step ) 4527 { 4528 int x = 0; 4529 #if CV_ENABLE_UNROLLED 4530 for( ; x <= size.width - 4; x += 4 ) 4531 { 4532 int t0, t1; 4533 t0 = -(src1[x] == src2[x]) ^ m; 4534 t1 = -(src1[x+1] == src2[x+1]) ^ m; 4535 dst[x] = (uchar)t0; dst[x+1] = (uchar)t1; 4536 t0 = -(src1[x+2] == src2[x+2]) ^ m; 4537 t1 = -(src1[x+3] == src2[x+3]) ^ m; 4538 dst[x+2] = (uchar)t0; dst[x+3] = (uchar)t1; 4539 } 4540 #endif 4541 for( ; x < size.width; x++ ) 4542 dst[x] = (uchar)(-(src1[x] == src2[x]) ^ m); 4543 } 4544 } 4545 } 4546 4547 #if ARITHM_USE_IPP 4548 inline static IppCmpOp convert_cmp(int _cmpop) 4549 { 4550 return _cmpop == CMP_EQ ? ippCmpEq : 4551 _cmpop == CMP_GT ? ippCmpGreater : 4552 _cmpop == CMP_GE ? ippCmpGreaterEq : 4553 _cmpop == CMP_LT ? ippCmpLess : 4554 _cmpop == CMP_LE ? ippCmpLessEq : 4555 (IppCmpOp)-1; 4556 } 4557 #endif 4558 4559 static void cmp8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, 4560 uchar* dst, size_t step, Size size, void* _cmpop) 4561 { 4562 #if ARITHM_USE_IPP 4563 CV_IPP_CHECK() 4564 { 4565 IppCmpOp op = convert_cmp(*(int *)_cmpop); 4566 if( op >= 0 ) 4567 { 4568 fixSteps(size, sizeof(dst[0]), step1, step2, step); 4569 if (0 <= ippiCompare_8u_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(size), op)) 4570 { 4571 CV_IMPL_ADD(CV_IMPL_IPP); 4572 return; 4573 } 4574 setIppErrorStatus(); 4575 } 4576 } 4577 #endif 4578 //vz optimized cmp_(src1, step1, src2, step2, dst, step, size, *(int*)_cmpop); 4579 int code = *(int*)_cmpop; 4580 step1 /= sizeof(src1[0]); 4581 step2 /= sizeof(src2[0]); 4582 if( code == CMP_GE || code == CMP_LT ) 4583 { 4584 std::swap(src1, src2); 4585 std::swap(step1, step2); 4586 code = code == CMP_GE ? CMP_LE : CMP_GT; 4587 } 4588 4589 if( code == CMP_GT || code == CMP_LE ) 4590 { 4591 int m = code == CMP_GT ? 0 : 255; 4592 for( ; size.height--; src1 += step1, src2 += step2, dst += step ) 4593 { 4594 int x =0; 4595 #if CV_SSE2 4596 if( USE_SSE2 ) 4597 { 4598 __m128i m128 = code == CMP_GT ? _mm_setzero_si128() : _mm_set1_epi8 (-1); 4599 __m128i c128 = _mm_set1_epi8 (-128); 4600 for( ; x <= size.width - 16; x += 16 ) 4601 { 4602 __m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x)); 4603 __m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x)); 4604 // no simd for 8u comparison, that's why we need the trick 4605 r00 = _mm_sub_epi8(r00,c128); 4606 r10 = _mm_sub_epi8(r10,c128); 4607 4608 r00 =_mm_xor_si128(_mm_cmpgt_epi8(r00, r10), m128); 4609 _mm_storeu_si128((__m128i*)(dst + x),r00); 4610 4611 } 4612 } 4613 #elif CV_NEON 4614 uint8x16_t mask = code == CMP_GT ? 
vdupq_n_u8(0) : vdupq_n_u8(255); 4615 4616 for( ; x <= size.width - 16; x += 16 ) 4617 { 4618 vst1q_u8(dst+x, veorq_u8(vcgtq_u8(vld1q_u8(src1+x), vld1q_u8(src2+x)), mask)); 4619 } 4620 4621 #endif 4622 4623 for( ; x < size.width; x++ ){ 4624 dst[x] = (uchar)(-(src1[x] > src2[x]) ^ m); 4625 } 4626 } 4627 } 4628 else if( code == CMP_EQ || code == CMP_NE ) 4629 { 4630 int m = code == CMP_EQ ? 0 : 255; 4631 for( ; size.height--; src1 += step1, src2 += step2, dst += step ) 4632 { 4633 int x = 0; 4634 #if CV_SSE2 4635 if( USE_SSE2 ) 4636 { 4637 __m128i m128 = code == CMP_EQ ? _mm_setzero_si128() : _mm_set1_epi8 (-1); 4638 for( ; x <= size.width - 16; x += 16 ) 4639 { 4640 __m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x)); 4641 __m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x)); 4642 r00 = _mm_xor_si128 ( _mm_cmpeq_epi8 (r00, r10), m128); 4643 _mm_storeu_si128((__m128i*)(dst + x), r00); 4644 } 4645 } 4646 #elif CV_NEON 4647 uint8x16_t mask = code == CMP_EQ ? vdupq_n_u8(0) : vdupq_n_u8(255); 4648 4649 for( ; x <= size.width - 16; x += 16 ) 4650 { 4651 vst1q_u8(dst+x, veorq_u8(vceqq_u8(vld1q_u8(src1+x), vld1q_u8(src2+x)), mask)); 4652 } 4653 #endif 4654 for( ; x < size.width; x++ ) 4655 dst[x] = (uchar)(-(src1[x] == src2[x]) ^ m); 4656 } 4657 } 4658 } 4659 4660 static void cmp8s(const schar* src1, size_t step1, const schar* src2, size_t step2, 4661 uchar* dst, size_t step, Size size, void* _cmpop) 4662 { 4663 cmp_(src1, step1, src2, step2, dst, step, size, *(int*)_cmpop); 4664 } 4665 4666 static void cmp16u(const ushort* src1, size_t step1, const ushort* src2, size_t step2, 4667 uchar* dst, size_t step, Size size, void* _cmpop) 4668 { 4669 #if ARITHM_USE_IPP 4670 CV_IPP_CHECK() 4671 { 4672 IppCmpOp op = convert_cmp(*(int *)_cmpop); 4673 if( op >= 0 ) 4674 { 4675 fixSteps(size, sizeof(dst[0]), step1, step2, step); 4676 if (0 <= ippiCompare_16u_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(size), op)) 4677 { 4678 CV_IMPL_ADD(CV_IMPL_IPP); 4679 return; 4680 } 4681 setIppErrorStatus(); 4682 } 4683 } 4684 #endif 4685 cmp_(src1, step1, src2, step2, dst, step, size, *(int*)_cmpop); 4686 } 4687 4688 static void cmp16s(const short* src1, size_t step1, const short* src2, size_t step2, 4689 uchar* dst, size_t step, Size size, void* _cmpop) 4690 { 4691 #if ARITHM_USE_IPP 4692 CV_IPP_CHECK() 4693 { 4694 IppCmpOp op = convert_cmp(*(int *)_cmpop); 4695 if( op > 0 ) 4696 { 4697 fixSteps(size, sizeof(dst[0]), step1, step2, step); 4698 if (0 <= ippiCompare_16s_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(size), op)) 4699 { 4700 CV_IMPL_ADD(CV_IMPL_IPP); 4701 return; 4702 } 4703 setIppErrorStatus(); 4704 } 4705 } 4706 #endif 4707 //vz optimized cmp_(src1, step1, src2, step2, dst, step, size, *(int*)_cmpop); 4708 4709 int code = *(int*)_cmpop; 4710 step1 /= sizeof(src1[0]); 4711 step2 /= sizeof(src2[0]); 4712 if( code == CMP_GE || code == CMP_LT ) 4713 { 4714 std::swap(src1, src2); 4715 std::swap(step1, step2); 4716 code = code == CMP_GE ? CMP_LE : CMP_GT; 4717 } 4718 4719 if( code == CMP_GT || code == CMP_LE ) 4720 { 4721 int m = code == CMP_GT ? 0 : 255; 4722 for( ; size.height--; src1 += step1, src2 += step2, dst += step ) 4723 { 4724 int x =0; 4725 #if CV_SSE2 4726 if( USE_SSE2) 4727 { 4728 __m128i m128 = code == CMP_GT ? 
_mm_setzero_si128() : _mm_set1_epi16 (-1); 4729 for( ; x <= size.width - 16; x += 16 ) 4730 { 4731 __m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x)); 4732 __m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x)); 4733 r00 = _mm_xor_si128 ( _mm_cmpgt_epi16 (r00, r10), m128); 4734 __m128i r01 = _mm_loadu_si128((const __m128i*)(src1 + x + 8)); 4735 __m128i r11 = _mm_loadu_si128((const __m128i*)(src2 + x + 8)); 4736 r01 = _mm_xor_si128 ( _mm_cmpgt_epi16 (r01, r11), m128); 4737 r11 = _mm_packs_epi16(r00, r01); 4738 _mm_storeu_si128((__m128i*)(dst + x), r11); 4739 } 4740 if( x <= size.width-8) 4741 { 4742 __m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x)); 4743 __m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x)); 4744 r00 = _mm_xor_si128 ( _mm_cmpgt_epi16 (r00, r10), m128); 4745 r10 = _mm_packs_epi16(r00, r00); 4746 _mm_storel_epi64((__m128i*)(dst + x), r10); 4747 4748 x += 8; 4749 } 4750 } 4751 #elif CV_NEON 4752 uint8x16_t mask = code == CMP_GT ? vdupq_n_u8(0) : vdupq_n_u8(255); 4753 4754 for( ; x <= size.width - 16; x += 16 ) 4755 { 4756 int16x8_t in1 = vld1q_s16(src1 + x); 4757 int16x8_t in2 = vld1q_s16(src2 + x); 4758 uint8x8_t t1 = vmovn_u16(vcgtq_s16(in1, in2)); 4759 4760 in1 = vld1q_s16(src1 + x + 8); 4761 in2 = vld1q_s16(src2 + x + 8); 4762 uint8x8_t t2 = vmovn_u16(vcgtq_s16(in1, in2)); 4763 4764 vst1q_u8(dst+x, veorq_u8(vcombine_u8(t1, t2), mask)); 4765 } 4766 #endif 4767 4768 for( ; x < size.width; x++ ){ 4769 dst[x] = (uchar)(-(src1[x] > src2[x]) ^ m); 4770 } 4771 } 4772 } 4773 else if( code == CMP_EQ || code == CMP_NE ) 4774 { 4775 int m = code == CMP_EQ ? 0 : 255; 4776 for( ; size.height--; src1 += step1, src2 += step2, dst += step ) 4777 { 4778 int x = 0; 4779 #if CV_SSE2 4780 if( USE_SSE2 ) 4781 { 4782 __m128i m128 = code == CMP_EQ ? _mm_setzero_si128() : _mm_set1_epi16 (-1); 4783 for( ; x <= size.width - 16; x += 16 ) 4784 { 4785 __m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x)); 4786 __m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x)); 4787 r00 = _mm_xor_si128 ( _mm_cmpeq_epi16 (r00, r10), m128); 4788 __m128i r01 = _mm_loadu_si128((const __m128i*)(src1 + x + 8)); 4789 __m128i r11 = _mm_loadu_si128((const __m128i*)(src2 + x + 8)); 4790 r01 = _mm_xor_si128 ( _mm_cmpeq_epi16 (r01, r11), m128); 4791 r11 = _mm_packs_epi16(r00, r01); 4792 _mm_storeu_si128((__m128i*)(dst + x), r11); 4793 } 4794 if( x <= size.width - 8) 4795 { 4796 __m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x)); 4797 __m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x)); 4798 r00 = _mm_xor_si128 ( _mm_cmpeq_epi16 (r00, r10), m128); 4799 r10 = _mm_packs_epi16(r00, r00); 4800 _mm_storel_epi64((__m128i*)(dst + x), r10); 4801 4802 x += 8; 4803 } 4804 } 4805 #elif CV_NEON 4806 uint8x16_t mask = code == CMP_EQ ? 
vdupq_n_u8(0) : vdupq_n_u8(255); 4807 4808 for( ; x <= size.width - 16; x += 16 ) 4809 { 4810 int16x8_t in1 = vld1q_s16(src1 + x); 4811 int16x8_t in2 = vld1q_s16(src2 + x); 4812 uint8x8_t t1 = vmovn_u16(vceqq_s16(in1, in2)); 4813 4814 in1 = vld1q_s16(src1 + x + 8); 4815 in2 = vld1q_s16(src2 + x + 8); 4816 uint8x8_t t2 = vmovn_u16(vceqq_s16(in1, in2)); 4817 4818 vst1q_u8(dst+x, veorq_u8(vcombine_u8(t1, t2), mask)); 4819 } 4820 #endif 4821 for( ; x < size.width; x++ ) 4822 dst[x] = (uchar)(-(src1[x] == src2[x]) ^ m); 4823 } 4824 } 4825 } 4826 4827 static void cmp32s(const int* src1, size_t step1, const int* src2, size_t step2, 4828 uchar* dst, size_t step, Size size, void* _cmpop) 4829 { 4830 cmp_(src1, step1, src2, step2, dst, step, size, *(int*)_cmpop); 4831 } 4832 4833 static void cmp32f(const float* src1, size_t step1, const float* src2, size_t step2, 4834 uchar* dst, size_t step, Size size, void* _cmpop) 4835 { 4836 #if ARITHM_USE_IPP 4837 CV_IPP_CHECK() 4838 { 4839 IppCmpOp op = convert_cmp(*(int *)_cmpop); 4840 if( op >= 0 ) 4841 { 4842 fixSteps(size, sizeof(dst[0]), step1, step2, step); 4843 if (0 <= ippiCompare_32f_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(size), op)) 4844 { 4845 CV_IMPL_ADD(CV_IMPL_IPP); 4846 return; 4847 } 4848 setIppErrorStatus(); 4849 } 4850 } 4851 #endif 4852 cmp_(src1, step1, src2, step2, dst, step, size, *(int*)_cmpop); 4853 } 4854 4855 static void cmp64f(const double* src1, size_t step1, const double* src2, size_t step2, 4856 uchar* dst, size_t step, Size size, void* _cmpop) 4857 { 4858 cmp_(src1, step1, src2, step2, dst, step, size, *(int*)_cmpop); 4859 } 4860 4861 static BinaryFunc getCmpFunc(int depth) 4862 { 4863 static BinaryFunc cmpTab[] = 4864 { 4865 (BinaryFunc)GET_OPTIMIZED(cmp8u), (BinaryFunc)GET_OPTIMIZED(cmp8s), 4866 (BinaryFunc)GET_OPTIMIZED(cmp16u), (BinaryFunc)GET_OPTIMIZED(cmp16s), 4867 (BinaryFunc)GET_OPTIMIZED(cmp32s), 4868 (BinaryFunc)GET_OPTIMIZED(cmp32f), (BinaryFunc)cmp64f, 4869 0 4870 }; 4871 4872 return cmpTab[depth]; 4873 } 4874 4875 static double getMinVal(int depth) 4876 { 4877 static const double tab[] = {0, -128, 0, -32768, INT_MIN, -FLT_MAX, -DBL_MAX, 0}; 4878 return tab[depth]; 4879 } 4880 4881 static double getMaxVal(int depth) 4882 { 4883 static const double tab[] = {255, 127, 65535, 32767, INT_MAX, FLT_MAX, DBL_MAX, 0}; 4884 return tab[depth]; 4885 } 4886 4887 #ifdef HAVE_OPENCL 4888 4889 static bool ocl_compare(InputArray _src1, InputArray _src2, OutputArray _dst, int op, bool haveScalar) 4890 { 4891 const ocl::Device& dev = ocl::Device::getDefault(); 4892 bool doubleSupport = dev.doubleFPConfig() > 0; 4893 int type1 = _src1.type(), depth1 = CV_MAT_DEPTH(type1), cn = CV_MAT_CN(type1), 4894 type2 = _src2.type(), depth2 = CV_MAT_DEPTH(type2); 4895 4896 if (!doubleSupport && depth1 == CV_64F) 4897 return false; 4898 4899 if (!haveScalar && (!_src1.sameSize(_src2) || type1 != type2)) 4900 return false; 4901 4902 int kercn = haveScalar ? cn : ocl::predictOptimalVectorWidth(_src1, _src2, _dst), rowsPerWI = dev.isIntel() ? 4 : 1; 4903 // Workaround for bug with "?:" operator in AMD OpenCL compiler 4904 if (depth1 >= CV_16U) 4905 kercn = 1; 4906 4907 int scalarcn = kercn == 3 ? 
4 : kercn; 4908 const char * const operationMap[] = { "==", ">", ">=", "<", "<=", "!=" }; 4909 char cvt[40]; 4910 4911 String opts = format("-D %s -D srcT1=%s -D dstT=%s -D workT=srcT1 -D cn=%d" 4912 " -D convertToDT=%s -D OP_CMP -D CMP_OPERATOR=%s -D srcT1_C1=%s" 4913 " -D srcT2_C1=%s -D dstT_C1=%s -D workST=%s -D rowsPerWI=%d%s", 4914 haveScalar ? "UNARY_OP" : "BINARY_OP", 4915 ocl::typeToStr(CV_MAKE_TYPE(depth1, kercn)), 4916 ocl::typeToStr(CV_8UC(kercn)), kercn, 4917 ocl::convertTypeStr(depth1, CV_8U, kercn, cvt), 4918 operationMap[op], ocl::typeToStr(depth1), 4919 ocl::typeToStr(depth1), ocl::typeToStr(CV_8U), 4920 ocl::typeToStr(CV_MAKE_TYPE(depth1, scalarcn)), rowsPerWI, 4921 doubleSupport ? " -D DOUBLE_SUPPORT" : ""); 4922 4923 ocl::Kernel k("KF", ocl::core::arithm_oclsrc, opts); 4924 if (k.empty()) 4925 return false; 4926 4927 UMat src1 = _src1.getUMat(); 4928 Size size = src1.size(); 4929 _dst.create(size, CV_8UC(cn)); 4930 UMat dst = _dst.getUMat(); 4931 4932 if (haveScalar) 4933 { 4934 size_t esz = CV_ELEM_SIZE1(type1) * scalarcn; 4935 double buf[4] = { 0, 0, 0, 0 }; 4936 Mat src2 = _src2.getMat(); 4937 4938 if( depth1 > CV_32S ) 4939 convertAndUnrollScalar( src2, depth1, (uchar *)buf, kercn ); 4940 else 4941 { 4942 double fval = 0; 4943 getConvertFunc(depth2, CV_64F)(src2.ptr(), 1, 0, 1, (uchar *)&fval, 1, Size(1, 1), 0); 4944 if( fval < getMinVal(depth1) ) 4945 return dst.setTo(Scalar::all(op == CMP_GT || op == CMP_GE || op == CMP_NE ? 255 : 0)), true; 4946 4947 if( fval > getMaxVal(depth1) ) 4948 return dst.setTo(Scalar::all(op == CMP_LT || op == CMP_LE || op == CMP_NE ? 255 : 0)), true; 4949 4950 int ival = cvRound(fval); 4951 if( fval != ival ) 4952 { 4953 if( op == CMP_LT || op == CMP_GE ) 4954 ival = cvCeil(fval); 4955 else if( op == CMP_LE || op == CMP_GT ) 4956 ival = cvFloor(fval); 4957 else 4958 return dst.setTo(Scalar::all(op == CMP_NE ? 255 : 0)), true; 4959 } 4960 convertAndUnrollScalar(Mat(1, 1, CV_32S, &ival), depth1, (uchar *)buf, kercn); 4961 } 4962 4963 ocl::KernelArg scalararg = ocl::KernelArg(0, 0, 0, 0, buf, esz); 4964 4965 k.args(ocl::KernelArg::ReadOnlyNoSize(src1, cn, kercn), 4966 ocl::KernelArg::WriteOnly(dst, cn, kercn), scalararg); 4967 } 4968 else 4969 { 4970 UMat src2 = _src2.getUMat(); 4971 4972 k.args(ocl::KernelArg::ReadOnlyNoSize(src1), 4973 ocl::KernelArg::ReadOnlyNoSize(src2), 4974 ocl::KernelArg::WriteOnly(dst, cn, kercn)); 4975 } 4976 4977 size_t globalsize[2] = { dst.cols * cn / kercn, (dst.rows + rowsPerWI - 1) / rowsPerWI }; 4978 return k.run(2, globalsize, NULL, false); 4979 } 4980 4981 #endif 4982 4983 } 4984 4985 void cv::compare(InputArray _src1, InputArray _src2, OutputArray _dst, int op) 4986 { 4987 CV_Assert( op == CMP_LT || op == CMP_LE || op == CMP_EQ || 4988 op == CMP_NE || op == CMP_GE || op == CMP_GT ); 4989 4990 bool haveScalar = false; 4991 4992 if ((_src1.isMatx() + _src2.isMatx()) == 1 4993 || !_src1.sameSize(_src2) 4994 || _src1.type() != _src2.type()) 4995 { 4996 if (checkScalar(_src1, _src2.type(), _src1.kind(), _src2.kind())) 4997 { 4998 op = op == CMP_LT ? CMP_GT : op == CMP_LE ? CMP_GE : 4999 op == CMP_GE ? CMP_LE : op == CMP_GT ? 
CMP_LT : op; 5000 // src1 is a scalar; swap it with src2 5001 compare(_src2, _src1, _dst, op); 5002 return; 5003 } 5004 else if( !checkScalar(_src2, _src1.type(), _src2.kind(), _src1.kind()) ) 5005 CV_Error( CV_StsUnmatchedSizes, 5006 "The operation is neither 'array op array' (where arrays have the same size and the same type), " 5007 "nor 'array op scalar', nor 'scalar op array'" ); 5008 haveScalar = true; 5009 } 5010 5011 CV_OCL_RUN(_src1.dims() <= 2 && _src2.dims() <= 2 && OCL_PERFORMANCE_CHECK(_dst.isUMat()), 5012 ocl_compare(_src1, _src2, _dst, op, haveScalar)) 5013 5014 int kind1 = _src1.kind(), kind2 = _src2.kind(); 5015 Mat src1 = _src1.getMat(), src2 = _src2.getMat(); 5016 5017 if( kind1 == kind2 && src1.dims <= 2 && src2.dims <= 2 && src1.size() == src2.size() && src1.type() == src2.type() ) 5018 { 5019 int cn = src1.channels(); 5020 _dst.create(src1.size(), CV_8UC(cn)); 5021 Mat dst = _dst.getMat(); 5022 Size sz = getContinuousSize(src1, src2, dst, src1.channels()); 5023 getCmpFunc(src1.depth())(src1.ptr(), src1.step, src2.ptr(), src2.step, dst.ptr(), dst.step, sz, &op); 5024 return; 5025 } 5026 5027 int cn = src1.channels(), depth1 = src1.depth(), depth2 = src2.depth(); 5028 5029 _dst.create(src1.dims, src1.size, CV_8UC(cn)); 5030 src1 = src1.reshape(1); src2 = src2.reshape(1); 5031 Mat dst = _dst.getMat().reshape(1); 5032 5033 size_t esz = src1.elemSize(); 5034 size_t blocksize0 = (size_t)(BLOCK_SIZE + esz-1)/esz; 5035 BinaryFunc func = getCmpFunc(depth1); 5036 5037 if( !haveScalar ) 5038 { 5039 const Mat* arrays[] = { &src1, &src2, &dst, 0 }; 5040 uchar* ptrs[3]; 5041 5042 NAryMatIterator it(arrays, ptrs); 5043 size_t total = it.size; 5044 5045 for( size_t i = 0; i < it.nplanes; i++, ++it ) 5046 func( ptrs[0], 0, ptrs[1], 0, ptrs[2], 0, Size((int)total, 1), &op ); 5047 } 5048 else 5049 { 5050 const Mat* arrays[] = { &src1, &dst, 0 }; 5051 uchar* ptrs[2]; 5052 5053 NAryMatIterator it(arrays, ptrs); 5054 size_t total = it.size, blocksize = std::min(total, blocksize0); 5055 5056 AutoBuffer<uchar> _buf(blocksize*esz); 5057 uchar *buf = _buf; 5058 5059 if( depth1 > CV_32S ) 5060 convertAndUnrollScalar( src2, depth1, buf, blocksize ); 5061 else 5062 { 5063 double fval=0; 5064 getConvertFunc(depth2, CV_64F)(src2.ptr(), 1, 0, 1, (uchar*)&fval, 1, Size(1,1), 0); 5065 if( fval < getMinVal(depth1) ) 5066 { 5067 dst = Scalar::all(op == CMP_GT || op == CMP_GE || op == CMP_NE ? 255 : 0); 5068 return; 5069 } 5070 5071 if( fval > getMaxVal(depth1) ) 5072 { 5073 dst = Scalar::all(op == CMP_LT || op == CMP_LE || op == CMP_NE ? 255 : 0); 5074 return; 5075 } 5076 5077 int ival = cvRound(fval); 5078 if( fval != ival ) 5079 { 5080 if( op == CMP_LT || op == CMP_GE ) 5081 ival = cvCeil(fval); 5082 else if( op == CMP_LE || op == CMP_GT ) 5083 ival = cvFloor(fval); 5084 else 5085 { 5086 dst = Scalar::all(op == CMP_NE ? 
255 : 0); 5087 return; 5088 } 5089 } 5090 convertAndUnrollScalar(Mat(1, 1, CV_32S, &ival), depth1, buf, blocksize); 5091 } 5092 5093 for( size_t i = 0; i < it.nplanes; i++, ++it ) 5094 { 5095 for( size_t j = 0; j < total; j += blocksize ) 5096 { 5097 int bsz = (int)MIN(total - j, blocksize); 5098 func( ptrs[0], 0, buf, 0, ptrs[1], 0, Size(bsz, 1), &op); 5099 ptrs[0] += bsz*esz; 5100 ptrs[1] += bsz; 5101 } 5102 } 5103 } 5104 } 5105 5106 /****************************************************************************************\ 5107 * inRange * 5108 \****************************************************************************************/ 5109 5110 namespace cv 5111 { 5112 5113 template <typename T> 5114 struct InRange_SIMD 5115 { 5116 int operator () (const T *, const T *, const T *, uchar *, int) const 5117 { 5118 return 0; 5119 } 5120 }; 5121 5122 #if CV_SSE2 5123 5124 template <> 5125 struct InRange_SIMD<uchar> 5126 { 5127 int operator () (const uchar * src1, const uchar * src2, const uchar * src3, 5128 uchar * dst, int len) const 5129 { 5130 int x = 0; 5131 5132 if (USE_SSE2) 5133 { 5134 __m128i v_full = _mm_set1_epi8(-1), v_128 = _mm_set1_epi8(-128); 5135 5136 for ( ; x <= len - 16; x += 16 ) 5137 { 5138 __m128i v_src = _mm_add_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)), v_128); 5139 __m128i v_mask1 = _mm_cmpgt_epi8(_mm_add_epi8(_mm_loadu_si128((const __m128i *)(src2 + x)), v_128), v_src); 5140 __m128i v_mask2 = _mm_cmpgt_epi8(v_src, _mm_add_epi8(_mm_loadu_si128((const __m128i *)(src3 + x)), v_128)); 5141 _mm_storeu_si128((__m128i *)(dst + x), _mm_andnot_si128(_mm_or_si128(v_mask1, v_mask2), v_full)); 5142 } 5143 } 5144 5145 return x; 5146 } 5147 }; 5148 5149 template <> 5150 struct InRange_SIMD<schar> 5151 { 5152 int operator () (const schar * src1, const schar * src2, const schar * src3, 5153 uchar * dst, int len) const 5154 { 5155 int x = 0; 5156 5157 if (USE_SSE2) 5158 { 5159 __m128i v_full = _mm_set1_epi8(-1); 5160 5161 for ( ; x <= len - 16; x += 16 ) 5162 { 5163 __m128i v_src = _mm_loadu_si128((const __m128i *)(src1 + x)); 5164 __m128i v_mask1 = _mm_cmpgt_epi8(_mm_loadu_si128((const __m128i *)(src2 + x)), v_src); 5165 __m128i v_mask2 = _mm_cmpgt_epi8(v_src, _mm_loadu_si128((const __m128i *)(src3 + x))); 5166 _mm_storeu_si128((__m128i *)(dst + x), _mm_andnot_si128(_mm_or_si128(v_mask1, v_mask2), v_full)); 5167 } 5168 } 5169 5170 return x; 5171 } 5172 }; 5173 5174 template <> 5175 struct InRange_SIMD<ushort> 5176 { 5177 int operator () (const ushort * src1, const ushort * src2, const ushort * src3, 5178 uchar * dst, int len) const 5179 { 5180 int x = 0; 5181 5182 if (USE_SSE2) 5183 { 5184 __m128i v_zero = _mm_setzero_si128(), v_full = _mm_set1_epi16(-1), v_32768 = _mm_set1_epi16(-32768); 5185 5186 for ( ; x <= len - 8; x += 8 ) 5187 { 5188 __m128i v_src = _mm_add_epi16(_mm_loadu_si128((const __m128i *)(src1 + x)), v_32768); 5189 __m128i v_mask1 = _mm_cmpgt_epi16(_mm_add_epi16(_mm_loadu_si128((const __m128i *)(src2 + x)), v_32768), v_src); 5190 __m128i v_mask2 = _mm_cmpgt_epi16(v_src, _mm_add_epi16(_mm_loadu_si128((const __m128i *)(src3 + x)), v_32768)); 5191 __m128i v_res = _mm_andnot_si128(_mm_or_si128(v_mask1, v_mask2), v_full); 5192 _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(_mm_srli_epi16(v_res, 8), v_zero)); 5193 } 5194 } 5195 5196 return x; 5197 } 5198 }; 5199 5200 template <> 5201 struct InRange_SIMD<short> 5202 { 5203 int operator () (const short * src1, const short * src2, const short * src3, 5204 uchar * dst, int len) const 5205 { 5206 int x = 0; 5207 
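        // Signed 16-bit values can be compared directly with _mm_cmpgt_epi16 (unlike
        // the uchar/ushort variants above, which first bias by 0x80/0x8000 because
        // SSE2 has no unsigned compares).  The in-range mask is built as
        // ~(low > v || v > high) via _mm_andnot_si128 and then narrowed to 8-bit
        // 0/255 bytes.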
5208 if (USE_SSE2) 5209 { 5210 __m128i v_zero = _mm_setzero_si128(), v_full = _mm_set1_epi16(-1); 5211 5212 for ( ; x <= len - 8; x += 8 ) 5213 { 5214 __m128i v_src = _mm_loadu_si128((const __m128i *)(src1 + x)); 5215 __m128i v_mask1 = _mm_cmpgt_epi16(_mm_loadu_si128((const __m128i *)(src2 + x)), v_src); 5216 __m128i v_mask2 = _mm_cmpgt_epi16(v_src, _mm_loadu_si128((const __m128i *)(src3 + x))); 5217 __m128i v_res = _mm_andnot_si128(_mm_or_si128(v_mask1, v_mask2), v_full); 5218 _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(_mm_srli_epi16(v_res, 8), v_zero)); 5219 } 5220 } 5221 5222 return x; 5223 } 5224 }; 5225 5226 template <> 5227 struct InRange_SIMD<int> 5228 { 5229 int operator () (const int * src1, const int * src2, const int * src3, 5230 uchar * dst, int len) const 5231 { 5232 int x = 0; 5233 5234 if (USE_SSE2) 5235 { 5236 __m128i v_zero = _mm_setzero_si128(), v_full = _mm_set1_epi32(-1); 5237 5238 for ( ; x <= len - 8; x += 8 ) 5239 { 5240 __m128i v_src = _mm_loadu_si128((const __m128i *)(src1 + x)); 5241 __m128i v_res1 = _mm_or_si128(_mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src2 + x)), v_src), 5242 _mm_cmpgt_epi32(v_src, _mm_loadu_si128((const __m128i *)(src3 + x)))); 5243 5244 v_src = _mm_loadu_si128((const __m128i *)(src1 + x + 4)); 5245 __m128i v_res2 = _mm_or_si128(_mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src2 + x + 4)), v_src), 5246 _mm_cmpgt_epi32(v_src, _mm_loadu_si128((const __m128i *)(src3 + x + 4)))); 5247 5248 __m128i v_res = _mm_packs_epi32(_mm_srli_epi32(_mm_andnot_si128(v_res1, v_full), 16), 5249 _mm_srli_epi32(_mm_andnot_si128(v_res2, v_full), 16)); 5250 _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_res, v_zero)); 5251 } 5252 } 5253 5254 return x; 5255 } 5256 }; 5257 5258 template <> 5259 struct InRange_SIMD<float> 5260 { 5261 int operator () (const float * src1, const float * src2, const float * src3, 5262 uchar * dst, int len) const 5263 { 5264 int x = 0; 5265 5266 if (USE_SSE2) 5267 { 5268 __m128i v_zero = _mm_setzero_si128(); 5269 5270 for ( ; x <= len - 8; x += 8 ) 5271 { 5272 __m128 v_src = _mm_loadu_ps(src1 + x); 5273 __m128 v_res1 = _mm_and_ps(_mm_cmple_ps(_mm_loadu_ps(src2 + x), v_src), 5274 _mm_cmple_ps(v_src, _mm_loadu_ps(src3 + x))); 5275 5276 v_src = _mm_loadu_ps(src1 + x + 4); 5277 __m128 v_res2 = _mm_and_ps(_mm_cmple_ps(_mm_loadu_ps(src2 + x + 4), v_src), 5278 _mm_cmple_ps(v_src, _mm_loadu_ps(src3 + x + 4))); 5279 5280 __m128i v_res1i = _mm_cvtps_epi32(v_res1), v_res2i = _mm_cvtps_epi32(v_res2); 5281 __m128i v_res = _mm_packs_epi32(_mm_srli_epi32(v_res1i, 16), _mm_srli_epi32(v_res2i, 16)); 5282 _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_res, v_zero)); 5283 } 5284 } 5285 5286 return x; 5287 } 5288 }; 5289 5290 #elif CV_NEON 5291 5292 template <> 5293 struct InRange_SIMD<uchar> 5294 { 5295 int operator () (const uchar * src1, const uchar * src2, const uchar * src3, 5296 uchar * dst, int len) const 5297 { 5298 int x = 0; 5299 5300 for ( ; x <= len - 16; x += 16 ) 5301 { 5302 uint8x16_t values = vld1q_u8(src1 + x); 5303 uint8x16_t low = vld1q_u8(src2 + x); 5304 uint8x16_t high = vld1q_u8(src3 + x); 5305 5306 vst1q_u8(dst + x, vandq_u8(vcgeq_u8(values, low), vcgeq_u8(high, values))); 5307 } 5308 return x; 5309 } 5310 }; 5311 5312 template <> 5313 struct InRange_SIMD<schar> 5314 { 5315 int operator () (const schar * src1, const schar * src2, const schar * src3, 5316 uchar * dst, int len) const 5317 { 5318 int x = 0; 5319 5320 for ( ; x <= len - 16; x += 16 ) 5321 { 5322 int8x16_t values = vld1q_s8(src1 + 
x); 5323 int8x16_t low = vld1q_s8(src2 + x); 5324 int8x16_t high = vld1q_s8(src3 + x); 5325 5326 vst1q_u8(dst + x, vandq_u8(vcgeq_s8(values, low), vcgeq_s8(high, values))); 5327 } 5328 return x; 5329 } 5330 }; 5331 5332 template <> 5333 struct InRange_SIMD<ushort> 5334 { 5335 int operator () (const ushort * src1, const ushort * src2, const ushort * src3, 5336 uchar * dst, int len) const 5337 { 5338 int x = 0; 5339 5340 for ( ; x <= len - 16; x += 16 ) 5341 { 5342 uint16x8_t values = vld1q_u16((const uint16_t*)(src1 + x)); 5343 uint16x8_t low = vld1q_u16((const uint16_t*)(src2 + x)); 5344 uint16x8_t high = vld1q_u16((const uint16_t*)(src3 + x)); 5345 uint8x8_t r1 = vmovn_u16(vandq_u16(vcgeq_u16(values, low), vcgeq_u16(high, values))); 5346 5347 values = vld1q_u16((const uint16_t*)(src1 + x + 8)); 5348 low = vld1q_u16((const uint16_t*)(src2 + x + 8)); 5349 high = vld1q_u16((const uint16_t*)(src3 + x + 8)); 5350 uint8x8_t r2 = vmovn_u16(vandq_u16(vcgeq_u16(values, low), vcgeq_u16(high, values))); 5351 5352 vst1q_u8(dst + x, vcombine_u8(r1, r2)); 5353 } 5354 return x; 5355 } 5356 }; 5357 5358 template <> 5359 struct InRange_SIMD<short> 5360 { 5361 int operator () (const short * src1, const short * src2, const short * src3, 5362 uchar * dst, int len) const 5363 { 5364 int x = 0; 5365 5366 for ( ; x <= len - 16; x += 16 ) 5367 { 5368 int16x8_t values = vld1q_s16((const int16_t*)(src1 + x)); 5369 int16x8_t low = vld1q_s16((const int16_t*)(src2 + x)); 5370 int16x8_t high = vld1q_s16((const int16_t*)(src3 + x)); 5371 uint8x8_t r1 = vmovn_u16(vandq_u16(vcgeq_s16(values, low), vcgeq_s16(high, values))); 5372 5373 values = vld1q_s16((const int16_t*)(src1 + x + 8)); 5374 low = vld1q_s16((const int16_t*)(src2 + x + 8)); 5375 high = vld1q_s16((const int16_t*)(src3 + x + 8)); 5376 uint8x8_t r2 = vmovn_u16(vandq_u16(vcgeq_s16(values, low), vcgeq_s16(high, values))); 5377 5378 vst1q_u8(dst + x, vcombine_u8(r1, r2)); 5379 } 5380 return x; 5381 } 5382 }; 5383 5384 template <> 5385 struct InRange_SIMD<int> 5386 { 5387 int operator () (const int * src1, const int * src2, const int * src3, 5388 uchar * dst, int len) const 5389 { 5390 int x = 0; 5391 5392 for ( ; x <= len - 8; x += 8 ) 5393 { 5394 int32x4_t values = vld1q_s32((const int32_t*)(src1 + x)); 5395 int32x4_t low = vld1q_s32((const int32_t*)(src2 + x)); 5396 int32x4_t high = vld1q_s32((const int32_t*)(src3 + x)); 5397 5398 uint16x4_t r1 = vmovn_u32(vandq_u32(vcgeq_s32(values, low), vcgeq_s32(high, values))); 5399 5400 values = vld1q_s32((const int32_t*)(src1 + x + 4)); 5401 low = vld1q_s32((const int32_t*)(src2 + x + 4)); 5402 high = vld1q_s32((const int32_t*)(src3 + x + 4)); 5403 5404 uint16x4_t r2 = vmovn_u32(vandq_u32(vcgeq_s32(values, low), vcgeq_s32(high, values))); 5405 5406 uint16x8_t res_16 = vcombine_u16(r1, r2); 5407 5408 vst1_u8(dst + x, vmovn_u16(res_16)); 5409 } 5410 return x; 5411 } 5412 }; 5413 5414 template <> 5415 struct InRange_SIMD<float> 5416 { 5417 int operator () (const float * src1, const float * src2, const float * src3, 5418 uchar * dst, int len) const 5419 { 5420 int x = 0; 5421 5422 for ( ; x <= len - 8; x += 8 ) 5423 { 5424 float32x4_t values = vld1q_f32((const float32_t*)(src1 + x)); 5425 float32x4_t low = vld1q_f32((const float32_t*)(src2 + x)); 5426 float32x4_t high = vld1q_f32((const float32_t*)(src3 + x)); 5427 5428 uint16x4_t r1 = vmovn_u32(vandq_u32(vcgeq_f32(values, low), vcgeq_f32(high, values))); 5429 5430 values = vld1q_f32((const float32_t*)(src1 + x + 4)); 5431 low = vld1q_f32((const float32_t*)(src2 + x + 4)); 
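            // Second group of four floats: same (low <= v && v <= high) test via
            // vcgeq_f32; both 32-bit masks are then narrowed to 16-bit halves and
            // combined into the final 8-bit 0/255 mask stored with vst1_u8 below.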
5432 high = vld1q_f32((const float32_t*)(src3 + x + 4)); 5433 5434 uint16x4_t r2 = vmovn_u32(vandq_u32(vcgeq_f32(values, low), vcgeq_f32(high, values))); 5435 5436 uint16x8_t res_16 = vcombine_u16(r1, r2); 5437 5438 vst1_u8(dst + x, vmovn_u16(res_16)); 5439 } 5440 return x; 5441 } 5442 }; 5443 5444 #endif 5445 5446 template <typename T> 5447 static void inRange_(const T* src1, size_t step1, const T* src2, size_t step2, 5448 const T* src3, size_t step3, uchar* dst, size_t step, 5449 Size size) 5450 { 5451 step1 /= sizeof(src1[0]); 5452 step2 /= sizeof(src2[0]); 5453 step3 /= sizeof(src3[0]); 5454 5455 InRange_SIMD<T> vop; 5456 5457 for( ; size.height--; src1 += step1, src2 += step2, src3 += step3, dst += step ) 5458 { 5459 int x = vop(src1, src2, src3, dst, size.width); 5460 #if CV_ENABLE_UNROLLED 5461 for( ; x <= size.width - 4; x += 4 ) 5462 { 5463 int t0, t1; 5464 t0 = src2[x] <= src1[x] && src1[x] <= src3[x]; 5465 t1 = src2[x+1] <= src1[x+1] && src1[x+1] <= src3[x+1]; 5466 dst[x] = (uchar)-t0; dst[x+1] = (uchar)-t1; 5467 t0 = src2[x+2] <= src1[x+2] && src1[x+2] <= src3[x+2]; 5468 t1 = src2[x+3] <= src1[x+3] && src1[x+3] <= src3[x+3]; 5469 dst[x+2] = (uchar)-t0; dst[x+3] = (uchar)-t1; 5470 } 5471 #endif 5472 for( ; x < size.width; x++ ) 5473 dst[x] = (uchar)-(src2[x] <= src1[x] && src1[x] <= src3[x]); 5474 } 5475 } 5476 5477 5478 static void inRange8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, 5479 const uchar* src3, size_t step3, uchar* dst, size_t step, Size size) 5480 { 5481 inRange_(src1, step1, src2, step2, src3, step3, dst, step, size); 5482 } 5483 5484 static void inRange8s(const schar* src1, size_t step1, const schar* src2, size_t step2, 5485 const schar* src3, size_t step3, uchar* dst, size_t step, Size size) 5486 { 5487 inRange_(src1, step1, src2, step2, src3, step3, dst, step, size); 5488 } 5489 5490 static void inRange16u(const ushort* src1, size_t step1, const ushort* src2, size_t step2, 5491 const ushort* src3, size_t step3, uchar* dst, size_t step, Size size) 5492 { 5493 inRange_(src1, step1, src2, step2, src3, step3, dst, step, size); 5494 } 5495 5496 static void inRange16s(const short* src1, size_t step1, const short* src2, size_t step2, 5497 const short* src3, size_t step3, uchar* dst, size_t step, Size size) 5498 { 5499 inRange_(src1, step1, src2, step2, src3, step3, dst, step, size); 5500 } 5501 5502 static void inRange32s(const int* src1, size_t step1, const int* src2, size_t step2, 5503 const int* src3, size_t step3, uchar* dst, size_t step, Size size) 5504 { 5505 inRange_(src1, step1, src2, step2, src3, step3, dst, step, size); 5506 } 5507 5508 static void inRange32f(const float* src1, size_t step1, const float* src2, size_t step2, 5509 const float* src3, size_t step3, uchar* dst, size_t step, Size size) 5510 { 5511 inRange_(src1, step1, src2, step2, src3, step3, dst, step, size); 5512 } 5513 5514 static void inRange64f(const double* src1, size_t step1, const double* src2, size_t step2, 5515 const double* src3, size_t step3, uchar* dst, size_t step, Size size) 5516 { 5517 inRange_(src1, step1, src2, step2, src3, step3, dst, step, size); 5518 } 5519 5520 static void inRangeReduce(const uchar* src, uchar* dst, size_t len, int cn) 5521 { 5522 int k = cn % 4 ? 
typedef void (*InRangeFunc)( const uchar* src1, size_t step1, const uchar* src2, size_t step2,
                             const uchar* src3, size_t step3, uchar* dst, size_t step, Size sz );

static InRangeFunc getInRangeFunc(int depth)
{
    static InRangeFunc inRangeTab[] =
    {
        (InRangeFunc)GET_OPTIMIZED(inRange8u), (InRangeFunc)GET_OPTIMIZED(inRange8s), (InRangeFunc)GET_OPTIMIZED(inRange16u),
        (InRangeFunc)GET_OPTIMIZED(inRange16s), (InRangeFunc)GET_OPTIMIZED(inRange32s), (InRangeFunc)GET_OPTIMIZED(inRange32f),
        (InRangeFunc)inRange64f, 0
    };

    return inRangeTab[depth];
}

#ifdef HAVE_OPENCL

static bool ocl_inRange( InputArray _src, InputArray _lowerb,
                         InputArray _upperb, OutputArray _dst )
{
    const ocl::Device & d = ocl::Device::getDefault();
    int skind = _src.kind(), lkind = _lowerb.kind(), ukind = _upperb.kind();
    Size ssize = _src.size(), lsize = _lowerb.size(), usize = _upperb.size();
    int stype = _src.type(), ltype = _lowerb.type(), utype = _upperb.type();
    int sdepth = CV_MAT_DEPTH(stype), ldepth = CV_MAT_DEPTH(ltype), udepth = CV_MAT_DEPTH(utype);
    int cn = CV_MAT_CN(stype), rowsPerWI = d.isIntel() ? 4 : 1;
    bool lbScalar = false, ubScalar = false;

    if( (lkind == _InputArray::MATX && skind != _InputArray::MATX) ||
        ssize != lsize || stype != ltype )
    {
        if( !checkScalar(_lowerb, stype, lkind, skind) )
            CV_Error( CV_StsUnmatchedSizes,
                      "The lower boundary is neither an array of the same size and same type as src, nor a scalar");
        lbScalar = true;
    }

    if( (ukind == _InputArray::MATX && skind != _InputArray::MATX) ||
        ssize != usize || stype != utype )
    {
        if( !checkScalar(_upperb, stype, ukind, skind) )
            CV_Error( CV_StsUnmatchedSizes,
                      "The upper boundary is neither an array of the same size and same type as src, nor a scalar");
        ubScalar = true;
    }

    if (lbScalar != ubScalar)
        return false;

    bool doubleSupport = d.doubleFPConfig() > 0,
         haveScalar = lbScalar && ubScalar;

    if ( (!doubleSupport && sdepth == CV_64F) ||
         (!haveScalar && (sdepth != ldepth || sdepth != udepth)) )
        return false;

    int kercn = haveScalar ? cn : std::max(std::min(ocl::predictOptimalVectorWidth(_src, _lowerb, _upperb, _dst), 4), cn);
    if (kercn % cn != 0)
        kercn = cn;
    int colsPerWI = kercn / cn;
    String opts = format("%s-D cn=%d -D srcT=%s -D srcT1=%s -D dstT=%s -D kercn=%d -D depth=%d%s -D colsPerWI=%d",
                         haveScalar ? "-D HAVE_SCALAR " : "", cn, ocl::typeToStr(CV_MAKE_TYPE(sdepth, kercn)),
                         ocl::typeToStr(sdepth), ocl::typeToStr(CV_8UC(colsPerWI)), kercn, sdepth,
                         doubleSupport ? " -D DOUBLE_SUPPORT" : "", colsPerWI);
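
    // The option string above parameterizes the generic "inrange" OpenCL kernel built
    // just below: kercn is the vector width processed per work item (kept a multiple
    // of cn), colsPerWI = kercn / cn is the number of pixels handled per work item and
    // row, and DOUBLE_SUPPORT is only defined when the device advertises FP64 support.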
" -D DOUBLE_SUPPORT" : "", colsPerWI); 5608 5609 ocl::Kernel ker("inrange", ocl::core::inrange_oclsrc, opts); 5610 if (ker.empty()) 5611 return false; 5612 5613 _dst.create(ssize, CV_8UC1); 5614 UMat src = _src.getUMat(), dst = _dst.getUMat(), lscalaru, uscalaru; 5615 Mat lscalar, uscalar; 5616 5617 if (lbScalar && ubScalar) 5618 { 5619 lscalar = _lowerb.getMat(); 5620 uscalar = _upperb.getMat(); 5621 5622 size_t esz = src.elemSize(); 5623 size_t blocksize = 36; 5624 5625 AutoBuffer<uchar> _buf(blocksize*(((int)lbScalar + (int)ubScalar)*esz + cn) + 2*cn*sizeof(int) + 128); 5626 uchar *buf = alignPtr(_buf + blocksize*cn, 16); 5627 5628 if( ldepth != sdepth && sdepth < CV_32S ) 5629 { 5630 int* ilbuf = (int*)alignPtr(buf + blocksize*esz, 16); 5631 int* iubuf = ilbuf + cn; 5632 5633 BinaryFunc sccvtfunc = getConvertFunc(ldepth, CV_32S); 5634 sccvtfunc(lscalar.ptr(), 1, 0, 1, (uchar*)ilbuf, 1, Size(cn, 1), 0); 5635 sccvtfunc(uscalar.ptr(), 1, 0, 1, (uchar*)iubuf, 1, Size(cn, 1), 0); 5636 int minval = cvRound(getMinVal(sdepth)), maxval = cvRound(getMaxVal(sdepth)); 5637 5638 for( int k = 0; k < cn; k++ ) 5639 { 5640 if( ilbuf[k] > iubuf[k] || ilbuf[k] > maxval || iubuf[k] < minval ) 5641 ilbuf[k] = minval+1, iubuf[k] = minval; 5642 } 5643 lscalar = Mat(cn, 1, CV_32S, ilbuf); 5644 uscalar = Mat(cn, 1, CV_32S, iubuf); 5645 } 5646 5647 lscalar.convertTo(lscalar, stype); 5648 uscalar.convertTo(uscalar, stype); 5649 } 5650 else 5651 { 5652 lscalaru = _lowerb.getUMat(); 5653 uscalaru = _upperb.getUMat(); 5654 } 5655 5656 ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src), 5657 dstarg = ocl::KernelArg::WriteOnly(dst, 1, colsPerWI); 5658 5659 if (haveScalar) 5660 { 5661 lscalar.copyTo(lscalaru); 5662 uscalar.copyTo(uscalaru); 5663 5664 ker.args(srcarg, dstarg, ocl::KernelArg::PtrReadOnly(lscalaru), 5665 ocl::KernelArg::PtrReadOnly(uscalaru), rowsPerWI); 5666 } 5667 else 5668 ker.args(srcarg, dstarg, ocl::KernelArg::ReadOnlyNoSize(lscalaru), 5669 ocl::KernelArg::ReadOnlyNoSize(uscalaru), rowsPerWI); 5670 5671 size_t globalsize[2] = { ssize.width / colsPerWI, (ssize.height + rowsPerWI - 1) / rowsPerWI }; 5672 return ker.run(2, globalsize, NULL, false); 5673 } 5674 5675 #endif 5676 5677 } 5678 5679 void cv::inRange(InputArray _src, InputArray _lowerb, 5680 InputArray _upperb, OutputArray _dst) 5681 { 5682 CV_OCL_RUN(_src.dims() <= 2 && _lowerb.dims() <= 2 && 5683 _upperb.dims() <= 2 && OCL_PERFORMANCE_CHECK(_dst.isUMat()), 5684 ocl_inRange(_src, _lowerb, _upperb, _dst)) 5685 5686 int skind = _src.kind(), lkind = _lowerb.kind(), ukind = _upperb.kind(); 5687 Mat src = _src.getMat(), lb = _lowerb.getMat(), ub = _upperb.getMat(); 5688 5689 bool lbScalar = false, ubScalar = false; 5690 5691 if( (lkind == _InputArray::MATX && skind != _InputArray::MATX) || 5692 src.size != lb.size || src.type() != lb.type() ) 5693 { 5694 if( !checkScalar(lb, src.type(), lkind, skind) ) 5695 CV_Error( CV_StsUnmatchedSizes, 5696 "The lower bounary is neither an array of the same size and same type as src, nor a scalar"); 5697 lbScalar = true; 5698 } 5699 5700 if( (ukind == _InputArray::MATX && skind != _InputArray::MATX) || 5701 src.size != ub.size || src.type() != ub.type() ) 5702 { 5703 if( !checkScalar(ub, src.type(), ukind, skind) ) 5704 CV_Error( CV_StsUnmatchedSizes, 5705 "The upper bounary is neither an array of the same size and same type as src, nor a scalar"); 5706 ubScalar = true; 5707 } 5708 5709 CV_Assert(lbScalar == ubScalar); 5710 5711 int cn = src.channels(), depth = src.depth(); 5712 5713 size_t esz = 
void cv::inRange(InputArray _src, InputArray _lowerb,
                 InputArray _upperb, OutputArray _dst)
{
    CV_OCL_RUN(_src.dims() <= 2 && _lowerb.dims() <= 2 &&
               _upperb.dims() <= 2 && OCL_PERFORMANCE_CHECK(_dst.isUMat()),
               ocl_inRange(_src, _lowerb, _upperb, _dst))

    int skind = _src.kind(), lkind = _lowerb.kind(), ukind = _upperb.kind();
    Mat src = _src.getMat(), lb = _lowerb.getMat(), ub = _upperb.getMat();

    bool lbScalar = false, ubScalar = false;

    if( (lkind == _InputArray::MATX && skind != _InputArray::MATX) ||
        src.size != lb.size || src.type() != lb.type() )
    {
        if( !checkScalar(lb, src.type(), lkind, skind) )
            CV_Error( CV_StsUnmatchedSizes,
                      "The lower boundary is neither an array of the same size and same type as src, nor a scalar");
        lbScalar = true;
    }

    if( (ukind == _InputArray::MATX && skind != _InputArray::MATX) ||
        src.size != ub.size || src.type() != ub.type() )
    {
        if( !checkScalar(ub, src.type(), ukind, skind) )
            CV_Error( CV_StsUnmatchedSizes,
                      "The upper boundary is neither an array of the same size and same type as src, nor a scalar");
        ubScalar = true;
    }

    CV_Assert(lbScalar == ubScalar);

    int cn = src.channels(), depth = src.depth();

    size_t esz = src.elemSize();
    size_t blocksize0 = (size_t)(BLOCK_SIZE + esz-1)/esz;

    _dst.create(src.dims, src.size, CV_8UC1);
    Mat dst = _dst.getMat();
    InRangeFunc func = getInRangeFunc(depth);

    const Mat* arrays_sc[] = { &src, &dst, 0 };
    const Mat* arrays_nosc[] = { &src, &dst, &lb, &ub, 0 };
    uchar* ptrs[4];

    NAryMatIterator it(lbScalar && ubScalar ? arrays_sc : arrays_nosc, ptrs);
    size_t total = it.size, blocksize = std::min(total, blocksize0);

    AutoBuffer<uchar> _buf(blocksize*(((int)lbScalar + (int)ubScalar)*esz + cn) + 2*cn*sizeof(int) + 128);
    uchar *buf = _buf, *mbuf = buf, *lbuf = 0, *ubuf = 0;
    buf = alignPtr(buf + blocksize*cn, 16);

    if( lbScalar && ubScalar )
    {
        lbuf = buf;
        ubuf = buf = alignPtr(buf + blocksize*esz, 16);

        CV_Assert( lb.type() == ub.type() );
        int scdepth = lb.depth();

        if( scdepth != depth && depth < CV_32S )
        {
            int* ilbuf = (int*)alignPtr(buf + blocksize*esz, 16);
            int* iubuf = ilbuf + cn;

            BinaryFunc sccvtfunc = getConvertFunc(scdepth, CV_32S);
            sccvtfunc(lb.ptr(), 1, 0, 1, (uchar*)ilbuf, 1, Size(cn, 1), 0);
            sccvtfunc(ub.ptr(), 1, 0, 1, (uchar*)iubuf, 1, Size(cn, 1), 0);
            int minval = cvRound(getMinVal(depth)), maxval = cvRound(getMaxVal(depth));

            for( int k = 0; k < cn; k++ )
            {
                if( ilbuf[k] > iubuf[k] || ilbuf[k] > maxval || iubuf[k] < minval )
                    ilbuf[k] = minval+1, iubuf[k] = minval;
            }
            lb = Mat(cn, 1, CV_32S, ilbuf);
            ub = Mat(cn, 1, CV_32S, iubuf);
        }

        convertAndUnrollScalar( lb, src.type(), lbuf, blocksize );
        convertAndUnrollScalar( ub, src.type(), ubuf, blocksize );
    }

    for( size_t i = 0; i < it.nplanes; i++, ++it )
    {
        for( size_t j = 0; j < total; j += blocksize )
        {
            int bsz = (int)MIN(total - j, blocksize);
            size_t delta = bsz*esz;
            uchar *lptr = lbuf, *uptr = ubuf;
            if( !lbScalar )
            {
                lptr = ptrs[2];
                ptrs[2] += delta;
            }
            if( !ubScalar )
            {
                int idx = !lbScalar ? 3 : 2;
                uptr = ptrs[idx];
                ptrs[idx] += delta;
            }
            func( ptrs[0], 0, lptr, 0, uptr, 0, cn == 1 ? ptrs[1] : mbuf, 0, Size(bsz*cn, 1));
            if( cn > 1 )
                inRangeReduce(mbuf, ptrs[1], bsz, cn);
            ptrs[0] += delta;
            ptrs[1] += bsz;
        }
    }
}

/****************************************************************************************\
*                                Earlier API: cvAdd etc.                                 *
\****************************************************************************************/
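
/* The wrappers below adapt the legacy C API to the C++ functions above: each one
   converts its CvArr / CvScalar arguments with cv::cvarrToMat and forwards the call.
   Unlike the C++ API, the caller preallocates the destination, hence the explicit
   size/type (or channel-count) assertions. Minimal sketch of the old-style usage
   (illustrative only, with hypothetical fill values):

       CvMat* a = cvCreateMat(1, 4, CV_8UC1);
       CvMat* b = cvCreateMat(1, 4, CV_8UC1);
       CvMat* c = cvCreateMat(1, 4, CV_8UC1);
       cvSet(a, cvScalar(0xF0));
       cvSet(b, cvScalar(0x3C));
       cvAnd(a, b, c, NULL);                       // every element of c becomes 0x30
       cvReleaseMat(&a); cvReleaseMat(&b); cvReleaseMat(&c);
*/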
CV_IMPL void
cvNot( const CvArr* srcarr, CvArr* dstarr )
{
    cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr);
    CV_Assert( src.size == dst.size && src.type() == dst.type() );
    cv::bitwise_not( src, dst );
}


CV_IMPL void
cvAnd( const CvArr* srcarr1, const CvArr* srcarr2, CvArr* dstarr, const CvArr* maskarr )
{
    cv::Mat src1 = cv::cvarrToMat(srcarr1), src2 = cv::cvarrToMat(srcarr2),
        dst = cv::cvarrToMat(dstarr), mask;
    CV_Assert( src1.size == dst.size && src1.type() == dst.type() );
    if( maskarr )
        mask = cv::cvarrToMat(maskarr);
    cv::bitwise_and( src1, src2, dst, mask );
}


CV_IMPL void
cvOr( const CvArr* srcarr1, const CvArr* srcarr2, CvArr* dstarr, const CvArr* maskarr )
{
    cv::Mat src1 = cv::cvarrToMat(srcarr1), src2 = cv::cvarrToMat(srcarr2),
        dst = cv::cvarrToMat(dstarr), mask;
    CV_Assert( src1.size == dst.size && src1.type() == dst.type() );
    if( maskarr )
        mask = cv::cvarrToMat(maskarr);
    cv::bitwise_or( src1, src2, dst, mask );
}


CV_IMPL void
cvXor( const CvArr* srcarr1, const CvArr* srcarr2, CvArr* dstarr, const CvArr* maskarr )
{
    cv::Mat src1 = cv::cvarrToMat(srcarr1), src2 = cv::cvarrToMat(srcarr2),
        dst = cv::cvarrToMat(dstarr), mask;
    CV_Assert( src1.size == dst.size && src1.type() == dst.type() );
    if( maskarr )
        mask = cv::cvarrToMat(maskarr);
    cv::bitwise_xor( src1, src2, dst, mask );
}


CV_IMPL void
cvAndS( const CvArr* srcarr, CvScalar s, CvArr* dstarr, const CvArr* maskarr )
{
    cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr), mask;
    CV_Assert( src.size == dst.size && src.type() == dst.type() );
    if( maskarr )
        mask = cv::cvarrToMat(maskarr);
    cv::bitwise_and( src, (const cv::Scalar&)s, dst, mask );
}


CV_IMPL void
cvOrS( const CvArr* srcarr, CvScalar s, CvArr* dstarr, const CvArr* maskarr )
{
    cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr), mask;
    CV_Assert( src.size == dst.size && src.type() == dst.type() );
    if( maskarr )
        mask = cv::cvarrToMat(maskarr);
    cv::bitwise_or( src, (const cv::Scalar&)s, dst, mask );
}


CV_IMPL void
cvXorS( const CvArr* srcarr, CvScalar s, CvArr* dstarr, const CvArr* maskarr )
{
    cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr), mask;
    CV_Assert( src.size == dst.size && src.type() == dst.type() );
    if( maskarr )
        mask = cv::cvarrToMat(maskarr);
    cv::bitwise_xor( src, (const cv::Scalar&)s, dst, mask );
}


CV_IMPL void cvAdd( const CvArr* srcarr1, const CvArr* srcarr2, CvArr* dstarr, const CvArr* maskarr )
{
    cv::Mat src1 = cv::cvarrToMat(srcarr1), src2 = cv::cvarrToMat(srcarr2),
        dst = cv::cvarrToMat(dstarr), mask;
    CV_Assert( src1.size == dst.size && src1.channels() == dst.channels() );
    if( maskarr )
        mask = cv::cvarrToMat(maskarr);
    cv::add( src1, src2, dst, mask, dst.type() );
}


CV_IMPL void cvSub( const CvArr* srcarr1, const CvArr* srcarr2, CvArr* dstarr, const CvArr* maskarr )
{
    cv::Mat src1 = cv::cvarrToMat(srcarr1), src2 = cv::cvarrToMat(srcarr2),
        dst = cv::cvarrToMat(dstarr), mask;
    CV_Assert( src1.size == dst.size && src1.channels() == dst.channels() );
    if( maskarr )
        mask = cv::cvarrToMat(maskarr);
    cv::subtract( src1, src2, dst, mask, dst.type() );
}
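
// The scalar variants that follow keep the same pattern; note that cvSubRS computes
// value - src (reversed operands), and each arithmetic wrapper passes dst.type() so
// the result is saturated/converted to the depth of the preallocated destination.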
CV_IMPL void cvAddS( const CvArr* srcarr1, CvScalar value, CvArr* dstarr, const CvArr* maskarr )
{
    cv::Mat src1 = cv::cvarrToMat(srcarr1),
        dst = cv::cvarrToMat(dstarr), mask;
    CV_Assert( src1.size == dst.size && src1.channels() == dst.channels() );
    if( maskarr )
        mask = cv::cvarrToMat(maskarr);
    cv::add( src1, (const cv::Scalar&)value, dst, mask, dst.type() );
}


CV_IMPL void cvSubRS( const CvArr* srcarr1, CvScalar value, CvArr* dstarr, const CvArr* maskarr )
{
    cv::Mat src1 = cv::cvarrToMat(srcarr1),
        dst = cv::cvarrToMat(dstarr), mask;
    CV_Assert( src1.size == dst.size && src1.channels() == dst.channels() );
    if( maskarr )
        mask = cv::cvarrToMat(maskarr);
    cv::subtract( (const cv::Scalar&)value, src1, dst, mask, dst.type() );
}


CV_IMPL void cvMul( const CvArr* srcarr1, const CvArr* srcarr2,
                    CvArr* dstarr, double scale )
{
    cv::Mat src1 = cv::cvarrToMat(srcarr1), src2 = cv::cvarrToMat(srcarr2),
        dst = cv::cvarrToMat(dstarr);
    CV_Assert( src1.size == dst.size && src1.channels() == dst.channels() );
    cv::multiply( src1, src2, dst, scale, dst.type() );
}


CV_IMPL void cvDiv( const CvArr* srcarr1, const CvArr* srcarr2,
                    CvArr* dstarr, double scale )
{
    cv::Mat src2 = cv::cvarrToMat(srcarr2),
        dst = cv::cvarrToMat(dstarr), mask;
    CV_Assert( src2.size == dst.size && src2.channels() == dst.channels() );

    if( srcarr1 )
        cv::divide( cv::cvarrToMat(srcarr1), src2, dst, scale, dst.type() );
    else
        cv::divide( scale, src2, dst, dst.type() );
}


CV_IMPL void
cvAddWeighted( const CvArr* srcarr1, double alpha,
               const CvArr* srcarr2, double beta,
               double gamma, CvArr* dstarr )
{
    cv::Mat src1 = cv::cvarrToMat(srcarr1), src2 = cv::cvarrToMat(srcarr2),
        dst = cv::cvarrToMat(dstarr);
    CV_Assert( src1.size == dst.size && src1.channels() == dst.channels() );
    cv::addWeighted( src1, alpha, src2, beta, gamma, dst, dst.type() );
}


CV_IMPL void
cvAbsDiff( const CvArr* srcarr1, const CvArr* srcarr2, CvArr* dstarr )
{
    cv::Mat src1 = cv::cvarrToMat(srcarr1), dst = cv::cvarrToMat(dstarr);
    CV_Assert( src1.size == dst.size && src1.type() == dst.type() );

    cv::absdiff( src1, cv::cvarrToMat(srcarr2), dst );
}


CV_IMPL void
cvAbsDiffS( const CvArr* srcarr1, CvArr* dstarr, CvScalar scalar )
{
    cv::Mat src1 = cv::cvarrToMat(srcarr1), dst = cv::cvarrToMat(dstarr);
    CV_Assert( src1.size == dst.size && src1.type() == dst.type() );

    cv::absdiff( src1, (const cv::Scalar&)scalar, dst );
}


CV_IMPL void
cvInRange( const void* srcarr1, const void* srcarr2,
           const void* srcarr3, void* dstarr )
{
    cv::Mat src1 = cv::cvarrToMat(srcarr1), dst = cv::cvarrToMat(dstarr);
    CV_Assert( src1.size == dst.size && dst.type() == CV_8U );

    cv::inRange( src1, cv::cvarrToMat(srcarr2), cv::cvarrToMat(srcarr3), dst );
}


CV_IMPL void
cvInRangeS( const void* srcarr1, CvScalar lowerb, CvScalar upperb, void* dstarr )
{
    cv::Mat src1 = cv::cvarrToMat(srcarr1), dst = cv::cvarrToMat(dstarr);
    CV_Assert( src1.size == dst.size && dst.type() == CV_8U );

    cv::inRange( src1, (const cv::Scalar&)lowerb, (const cv::Scalar&)upperb, dst );
}
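
// cvCmp / cvCmpS below forward to cv::compare with the given cmp_op (CV_CMP_EQ,
// CV_CMP_GT, ...), producing an 8-bit 0/255 mask, while cvMin/cvMax and their scalar
// variants compute the element-wise minimum/maximum in the preallocated destination.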
CV_IMPL void
cvCmp( const void* srcarr1, const void* srcarr2, void* dstarr, int cmp_op )
{
    cv::Mat src1 = cv::cvarrToMat(srcarr1), dst = cv::cvarrToMat(dstarr);
    CV_Assert( src1.size == dst.size && dst.type() == CV_8U );

    cv::compare( src1, cv::cvarrToMat(srcarr2), dst, cmp_op );
}


CV_IMPL void
cvCmpS( const void* srcarr1, double value, void* dstarr, int cmp_op )
{
    cv::Mat src1 = cv::cvarrToMat(srcarr1), dst = cv::cvarrToMat(dstarr);
    CV_Assert( src1.size == dst.size && dst.type() == CV_8U );

    cv::compare( src1, value, dst, cmp_op );
}


CV_IMPL void
cvMin( const void* srcarr1, const void* srcarr2, void* dstarr )
{
    cv::Mat src1 = cv::cvarrToMat(srcarr1), dst = cv::cvarrToMat(dstarr);
    CV_Assert( src1.size == dst.size && src1.type() == dst.type() );

    cv::min( src1, cv::cvarrToMat(srcarr2), dst );
}


CV_IMPL void
cvMax( const void* srcarr1, const void* srcarr2, void* dstarr )
{
    cv::Mat src1 = cv::cvarrToMat(srcarr1), dst = cv::cvarrToMat(dstarr);
    CV_Assert( src1.size == dst.size && src1.type() == dst.type() );

    cv::max( src1, cv::cvarrToMat(srcarr2), dst );
}


CV_IMPL void
cvMinS( const void* srcarr1, double value, void* dstarr )
{
    cv::Mat src1 = cv::cvarrToMat(srcarr1), dst = cv::cvarrToMat(dstarr);
    CV_Assert( src1.size == dst.size && src1.type() == dst.type() );

    cv::min( src1, value, dst );
}


CV_IMPL void
cvMaxS( const void* srcarr1, double value, void* dstarr )
{
    cv::Mat src1 = cv::cvarrToMat(srcarr1), dst = cv::cvarrToMat(dstarr);
    CV_Assert( src1.size == dst.size && src1.type() == dst.type() );

    cv::max( src1, value, dst );
}

/* End of file. */