/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#include "precomp.hpp"
#include "opencl_kernels_core.hpp"

#ifdef __APPLE__
#undef CV_NEON
#define CV_NEON 0
#endif

namespace cv
{

/****************************************************************************************\
*                                       split & merge                                    *
\****************************************************************************************/

#if CV_NEON
template<typename T> struct VSplit2;
template<typename T> struct VSplit3;
template<typename T> struct VSplit4;

#define SPLIT2_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \
    template<> \
    struct name<data_type> \
    { \
        void operator()(const data_type* src, data_type* dst0, \
                        data_type* dst1) const \
        { \
            reg_type r = load_func(src); \
            store_func(dst0, r.val[0]); \
            store_func(dst1, r.val[1]); \
        } \
    }

#define SPLIT3_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \
    template<> \
    struct name<data_type> \
    { \
        void operator()(const data_type* src, data_type* dst0, data_type* dst1, \
                        data_type* dst2) const \
        { \
            reg_type r = load_func(src); \
            store_func(dst0, r.val[0]); \
            store_func(dst1, r.val[1]); \
            store_func(dst2, r.val[2]); \
        } \
    }

#define SPLIT4_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \
    template<> \
    struct name<data_type> \
    { \
        void operator()(const data_type* src, data_type* dst0, data_type* dst1, \
                        data_type* dst2, data_type* dst3) const \
        { \
            reg_type r = load_func(src); \
            store_func(dst0, r.val[0]); \
            store_func(dst1, r.val[1]); \
            store_func(dst2, r.val[2]); \
            store_func(dst3, r.val[3]); \
        } \
    }

SPLIT2_KERNEL_TEMPLATE(VSplit2, uchar ,  uint8x16x2_t, vld2q_u8 , vst1q_u8 );
SPLIT2_KERNEL_TEMPLATE(VSplit2, ushort,  uint16x8x2_t, vld2q_u16, vst1q_u16);
SPLIT2_KERNEL_TEMPLATE(VSplit2, int   ,   int32x4x2_t, vld2q_s32, vst1q_s32);
SPLIT2_KERNEL_TEMPLATE(VSplit2, int64 ,   int64x1x2_t, vld2_s64 , vst1_s64 );

SPLIT3_KERNEL_TEMPLATE(VSplit3, uchar ,  uint8x16x3_t, vld3q_u8 , vst1q_u8 );
SPLIT3_KERNEL_TEMPLATE(VSplit3, ushort,  uint16x8x3_t, vld3q_u16, vst1q_u16);
SPLIT3_KERNEL_TEMPLATE(VSplit3, int   ,   int32x4x3_t, vld3q_s32, vst1q_s32);
SPLIT3_KERNEL_TEMPLATE(VSplit3, int64 ,   int64x1x3_t, vld3_s64 , vst1_s64 );

SPLIT4_KERNEL_TEMPLATE(VSplit4, uchar ,  uint8x16x4_t, vld4q_u8 , vst1q_u8 );
SPLIT4_KERNEL_TEMPLATE(VSplit4, ushort,  uint16x8x4_t, vld4q_u16, vst1q_u16);
SPLIT4_KERNEL_TEMPLATE(VSplit4, int   ,   int32x4x4_t, vld4q_s32, vst1q_s32);
SPLIT4_KERNEL_TEMPLATE(VSplit4, int64 ,   int64x1x4_t, vld4_s64 , vst1_s64 );

#elif CV_SSE2

template <typename T>
struct VSplit2
{
    VSplit2() : support(false) { }
    void operator()(const T *, T *, T *) const { }

    bool support;
};

template <typename T>
struct VSplit3
{
    VSplit3() : support(false) { }
    void operator()(const T *, T *, T *, T *) const { }

    bool support;
};

template <typename T>
struct VSplit4
{
    VSplit4() : support(false) { }
    void operator()(const T *, T *, T *, T *, T *) const { }

    bool support;
};

#define SPLIT2_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_deinterleave, flavor) \
    template <> \
    struct VSplit2<data_type> \
    { \
        enum \
        { \
            ELEMS_IN_VEC = 16 / sizeof(data_type) \
        }; \
 \
        VSplit2() \
        { \
            support = checkHardwareSupport(CV_CPU_SSE2); \
        } \
 \
        void operator()(const data_type * src, \
                        data_type * dst0, data_type * dst1) const \
        { \
            reg_type v_src0 = _mm_loadu_##flavor((cast_type const *)(src)); \
            reg_type v_src1 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC)); \
            reg_type v_src2 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 2)); \
            reg_type v_src3 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 3)); \
 \
            _mm_deinterleave(v_src0, v_src1, v_src2, v_src3); \
 \
            _mm_storeu_##flavor((cast_type *)(dst0), v_src0); \
            _mm_storeu_##flavor((cast_type *)(dst0 + ELEMS_IN_VEC), v_src1); \
            _mm_storeu_##flavor((cast_type *)(dst1), v_src2); \
            _mm_storeu_##flavor((cast_type *)(dst1 + ELEMS_IN_VEC), v_src3); \
        } \
 \
        bool support; \
    }

#define SPLIT3_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_deinterleave, flavor) \
    template <> \
    struct VSplit3<data_type> \
    { \
        enum \
        { \
            ELEMS_IN_VEC = 16 / sizeof(data_type) \
        }; \
 \
        VSplit3() \
        { \
            support = checkHardwareSupport(CV_CPU_SSE2); \
        } \
 \
        void operator()(const data_type * src, \
                        data_type * dst0, data_type * dst1, data_type * dst2) const \
        { \
            reg_type v_src0 = _mm_loadu_##flavor((cast_type const *)(src)); \
            reg_type v_src1 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC)); \
            reg_type v_src2 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 2)); \
            reg_type v_src3 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 3)); \
            reg_type v_src4 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 4)); \
            reg_type v_src5 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 5)); \
 \
            _mm_deinterleave(v_src0, v_src1, v_src2, \
                             v_src3, v_src4, v_src5); \
 \
            _mm_storeu_##flavor((cast_type *)(dst0), v_src0); \
            _mm_storeu_##flavor((cast_type *)(dst0 + ELEMS_IN_VEC), v_src1); \
            _mm_storeu_##flavor((cast_type *)(dst1), v_src2); \
            _mm_storeu_##flavor((cast_type *)(dst1 + ELEMS_IN_VEC), v_src3); \
            _mm_storeu_##flavor((cast_type *)(dst2), v_src4); \
            _mm_storeu_##flavor((cast_type *)(dst2 + ELEMS_IN_VEC), v_src5); \
        } \
 \
        bool support; \
    }

#define SPLIT4_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_deinterleave, flavor) \
    template <> \
    struct VSplit4<data_type> \
    { \
        enum \
        { \
            ELEMS_IN_VEC = 16 / sizeof(data_type) \
        }; \
 \
        VSplit4() \
        { \
            support = checkHardwareSupport(CV_CPU_SSE2); \
        } \
 \
        void operator()(const data_type * src, data_type * dst0, data_type * dst1, \
                        data_type * dst2, data_type * dst3) const \
        { \
            reg_type v_src0 = _mm_loadu_##flavor((cast_type const *)(src)); \
            reg_type v_src1 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC)); \
            reg_type v_src2 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 2)); \
            reg_type v_src3 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 3)); \
            reg_type v_src4 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 4)); \
            reg_type v_src5 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 5)); \
            reg_type v_src6 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 6)); \
            reg_type v_src7 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 7)); \
 \
            _mm_deinterleave(v_src0, v_src1, v_src2, v_src3, \
                             v_src4, v_src5, v_src6, v_src7); \
 \
            _mm_storeu_##flavor((cast_type *)(dst0), v_src0); \
            _mm_storeu_##flavor((cast_type *)(dst0 + ELEMS_IN_VEC), v_src1); \
            _mm_storeu_##flavor((cast_type *)(dst1), v_src2); \
            _mm_storeu_##flavor((cast_type *)(dst1 + ELEMS_IN_VEC), v_src3); \
            _mm_storeu_##flavor((cast_type *)(dst2), v_src4); \
            _mm_storeu_##flavor((cast_type *)(dst2 + ELEMS_IN_VEC), v_src5); \
            _mm_storeu_##flavor((cast_type *)(dst3), v_src6); \
            _mm_storeu_##flavor((cast_type *)(dst3 + ELEMS_IN_VEC), v_src7); \
        } \
 \
        bool support; \
    }

SPLIT2_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_deinterleave_epi8, si128);
SPLIT2_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_deinterleave_epi16, si128);
SPLIT2_KERNEL_TEMPLATE(   int,  __m128,   float, _mm_deinterleave_ps, ps);

SPLIT3_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_deinterleave_epi8, si128);
SPLIT3_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_deinterleave_epi16, si128);
SPLIT3_KERNEL_TEMPLATE(   int,  __m128,   float, _mm_deinterleave_ps, ps);

SPLIT4_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_deinterleave_epi8, si128);
SPLIT4_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_deinterleave_epi16, si128);
SPLIT4_KERNEL_TEMPLATE(   int,  __m128,   float, _mm_deinterleave_ps, ps);

#endif

template<typename T> static void
split_( const T* src, T** dst, int len, int cn )
{
    int k = cn % 4 ? cn % 4 : 4;
    int i, j;
    if( k == 1 )
    {
        T* dst0 = dst[0];

        if(cn == 1)
        {
            memcpy(dst0, src, len * sizeof(T));
        }
        else
        {
            for( i = 0, j = 0 ; i < len; i++, j += cn )
                dst0[i] = src[j];
        }
    }
    else if( k == 2 )
    {
        T *dst0 = dst[0], *dst1 = dst[1];
        i = j = 0;

#if CV_NEON
        if(cn == 2)
        {
            int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T);
            int inc_j = 2 * inc_i;

            VSplit2<T> vsplit;
            for( ; i < len - inc_i; i += inc_i, j += inc_j)
                vsplit(src + j, dst0 + i, dst1 + i);
        }
#elif CV_SSE2
        if (cn == 2)
        {
            int inc_i = 32/sizeof(T);
            int inc_j = 2 * inc_i;

            VSplit2<T> vsplit;
            if (vsplit.support)
            {
                for( ; i <= len - inc_i; i += inc_i, j += inc_j)
                    vsplit(src + j, dst0 + i, dst1 + i);
            }
        }
#endif
        for( ; i < len; i++, j += cn )
        {
            dst0[i] = src[j];
            dst1[i] = src[j+1];
        }
    }
    else if( k == 3 )
    {
        T *dst0 = dst[0], *dst1 = dst[1], *dst2 = dst[2];
        i = j = 0;

#if CV_NEON
        if(cn == 3)
        {
            int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T);
            int inc_j = 3 * inc_i;

            VSplit3<T> vsplit;
            for( ; i <= len - inc_i; i += inc_i, j += inc_j)
                vsplit(src + j, dst0 + i, dst1 + i, dst2 + i);
        }
#elif CV_SSE2
        if (cn == 3)
        {
            int inc_i = 32/sizeof(T);
            int inc_j = 3 * inc_i;

            VSplit3<T> vsplit;

            if (vsplit.support)
            {
                for( ; i <= len - inc_i; i += inc_i, j += inc_j)
                    vsplit(src + j, dst0 + i, dst1 + i, dst2 + i);
            }
        }
#endif
        for( ; i < len; i++, j += cn )
        {
            dst0[i] = src[j];
            dst1[i] = src[j+1];
            dst2[i] = src[j+2];
        }
    }
    else
    {
        T *dst0 = dst[0], *dst1 = dst[1], *dst2 = dst[2], *dst3 = dst[3];
        i = j = 0;

#if CV_NEON
        if(cn == 4)
        {
            int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T);
            int inc_j = 4 * inc_i;

            VSplit4<T> vsplit;
            for( ; i <= len - inc_i; i += inc_i, j += inc_j)
                vsplit(src + j, dst0 + i, dst1 + i, dst2 + i, dst3 + i);
        }
#elif CV_SSE2
        if (cn == 4)
        {
            int inc_i = 32/sizeof(T);
            int inc_j = 4 * inc_i;

            VSplit4<T> vsplit;
            if (vsplit.support)
            {
                for( ; i <= len - inc_i; i += inc_i, j += inc_j)
                    vsplit(src + j, dst0 + i, dst1 + i, dst2 + i, dst3 + i);
            }
        }
#endif
        for( ; i < len; i++, j += cn )
        {
            dst0[i] = src[j]; dst1[i] = src[j+1];
            dst2[i] = src[j+2]; dst3[i] = src[j+3];
        }
    }

    for( ; k < cn; k += 4 )
    {
        T *dst0 = dst[k], *dst1 = dst[k+1], *dst2 = dst[k+2], *dst3 = dst[k+3];
        for( i = 0, j = k; i < len; i++, j += cn )
        {
            dst0[i] = src[j]; dst1[i] = src[j+1];
            dst2[i] = src[j+2]; dst3[i] = src[j+3];
        }
    }
}

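// Illustrative sketch (not part of the library): split_ deinterleaves a packed
// cn-channel buffer into per-channel planes. Assuming uchar data and cn == 3,
// the call below would behave as the comments indicate.
//
//     uchar src[] = { 10,20,30,  11,21,31,  12,22,32 };   // 3 interleaved pixels
//     uchar b[3], g[3], r[3];
//     uchar* planes[] = { b, g, r };
//     split_(src, planes, 3, 3);   // b = {10,11,12}, g = {20,21,22}, r = {30,31,32}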

#if CV_NEON
template<typename T> struct VMerge2;
template<typename T> struct VMerge3;
template<typename T> struct VMerge4;

#define MERGE2_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \
    template<> \
    struct name<data_type>{ \
        void operator()(const data_type* src0, const data_type* src1, \
                        data_type* dst){ \
            reg_type r; \
            r.val[0] = load_func(src0); \
            r.val[1] = load_func(src1); \
            store_func(dst, r); \
        } \
    }

#define MERGE3_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \
    template<> \
    struct name<data_type>{ \
        void operator()(const data_type* src0, const data_type* src1, \
                        const data_type* src2, data_type* dst){ \
            reg_type r; \
            r.val[0] = load_func(src0); \
            r.val[1] = load_func(src1); \
            r.val[2] = load_func(src2); \
            store_func(dst, r); \
        } \
    }

#define MERGE4_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \
    template<> \
    struct name<data_type>{ \
        void operator()(const data_type* src0, const data_type* src1, \
                        const data_type* src2, const data_type* src3, \
                        data_type* dst){ \
            reg_type r; \
            r.val[0] = load_func(src0); \
            r.val[1] = load_func(src1); \
            r.val[2] = load_func(src2); \
            r.val[3] = load_func(src3); \
            store_func(dst, r); \
        } \
    }

MERGE2_KERNEL_TEMPLATE(VMerge2, uchar ,  uint8x16x2_t, vld1q_u8 , vst2q_u8 );
MERGE2_KERNEL_TEMPLATE(VMerge2, ushort,  uint16x8x2_t, vld1q_u16, vst2q_u16);
MERGE2_KERNEL_TEMPLATE(VMerge2, int   ,   int32x4x2_t, vld1q_s32, vst2q_s32);
MERGE2_KERNEL_TEMPLATE(VMerge2, int64 ,   int64x1x2_t, vld1_s64 , vst2_s64 );

MERGE3_KERNEL_TEMPLATE(VMerge3, uchar ,  uint8x16x3_t, vld1q_u8 , vst3q_u8 );
MERGE3_KERNEL_TEMPLATE(VMerge3, ushort,  uint16x8x3_t, vld1q_u16, vst3q_u16);
MERGE3_KERNEL_TEMPLATE(VMerge3, int   ,   int32x4x3_t, vld1q_s32, vst3q_s32);
MERGE3_KERNEL_TEMPLATE(VMerge3, int64 ,   int64x1x3_t, vld1_s64 , vst3_s64 );

MERGE4_KERNEL_TEMPLATE(VMerge4, uchar ,  uint8x16x4_t, vld1q_u8 , vst4q_u8 );
MERGE4_KERNEL_TEMPLATE(VMerge4, ushort,  uint16x8x4_t, vld1q_u16, vst4q_u16);
MERGE4_KERNEL_TEMPLATE(VMerge4, int   ,   int32x4x4_t, vld1q_s32, vst4q_s32);
MERGE4_KERNEL_TEMPLATE(VMerge4, int64 ,   int64x1x4_t, vld1_s64 , vst4_s64 );

#elif CV_SSE2

template <typename T>
struct VMerge2
{
    VMerge2() : support(false) { }
    void operator()(const T *, const T *, T *) const { }

    bool support;
};

template <typename T>
struct VMerge3
{
    VMerge3() : support(false) { }
    void operator()(const T *, const T *, const T *, T *) const { }

    bool support;
};

template <typename T>
struct VMerge4
{
    VMerge4() : support(false) { }
    void operator()(const T *, const T *, const T *, const T *, T *) const { }

    bool support;
};

#define MERGE2_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor, se) \
    template <> \
    struct VMerge2<data_type> \
    { \
        enum \
        { \
            ELEMS_IN_VEC = 16 / sizeof(data_type) \
        }; \
 \
        VMerge2() \
        { \
            support = checkHardwareSupport(se); \
        } \
 \
        void operator()(const data_type * src0, const data_type * src1, \
                        data_type * dst) const \
        { \
            reg_type v_src0 = _mm_loadu_##flavor((const cast_type *)(src0)); \
            reg_type v_src1 = _mm_loadu_##flavor((const cast_type *)(src0 + ELEMS_IN_VEC)); \
            reg_type v_src2 = _mm_loadu_##flavor((const cast_type *)(src1)); \
            reg_type v_src3 = _mm_loadu_##flavor((const cast_type *)(src1 + ELEMS_IN_VEC)); \
 \
            _mm_interleave(v_src0, v_src1, v_src2, v_src3); \
 \
            _mm_storeu_##flavor((cast_type *)(dst), v_src0); \
            _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC), v_src1); \
            _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 2), v_src2); \
            _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 3), v_src3); \
        } \
 \
        bool support; \
    }

#define MERGE3_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor, se) \
    template <> \
    struct VMerge3<data_type> \
    { \
        enum \
        { \
            ELEMS_IN_VEC = 16 / sizeof(data_type) \
        }; \
 \
        VMerge3() \
        { \
            support = checkHardwareSupport(se); \
        } \
 \
        void operator()(const data_type * src0, const data_type * src1, const data_type * src2,\
                        data_type * dst) const \
        { \
            reg_type v_src0 = _mm_loadu_##flavor((const cast_type *)(src0)); \
            reg_type v_src1 = _mm_loadu_##flavor((const cast_type *)(src0 + ELEMS_IN_VEC)); \
            reg_type v_src2 = _mm_loadu_##flavor((const cast_type *)(src1)); \
            reg_type v_src3 = _mm_loadu_##flavor((const cast_type *)(src1 + ELEMS_IN_VEC)); \
            reg_type v_src4 = _mm_loadu_##flavor((const cast_type *)(src2)); \
            reg_type v_src5 = _mm_loadu_##flavor((const cast_type *)(src2 + ELEMS_IN_VEC)); \
 \
            _mm_interleave(v_src0, v_src1, v_src2, \
                           v_src3, v_src4, v_src5); \
 \
            _mm_storeu_##flavor((cast_type *)(dst), v_src0); \
            _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC), v_src1); \
            _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 2), v_src2); \
            _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 3), v_src3); \
            _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 4), v_src4); \
            _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 5), v_src5); \
        } \
 \
        bool support; \
    }

#define MERGE4_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor, se) \
    template <> \
    struct VMerge4<data_type> \
    { \
        enum \
        { \
            ELEMS_IN_VEC = 16 / sizeof(data_type) \
        }; \
 \
        VMerge4() \
        { \
            support = checkHardwareSupport(se); \
        } \
 \
        void operator()(const data_type * src0, const data_type * src1, \
                        const data_type * src2, const data_type * src3, \
                        data_type * dst) const \
        { \
            reg_type v_src0 = _mm_loadu_##flavor((const cast_type *)(src0)); \
            reg_type v_src1 = _mm_loadu_##flavor((const cast_type *)(src0 + ELEMS_IN_VEC)); \
            reg_type v_src2 = _mm_loadu_##flavor((const cast_type *)(src1)); \
            reg_type v_src3 = _mm_loadu_##flavor((const cast_type *)(src1 + ELEMS_IN_VEC)); \
            reg_type v_src4 = _mm_loadu_##flavor((const cast_type *)(src2)); \
            reg_type v_src5 = _mm_loadu_##flavor((const cast_type *)(src2 + ELEMS_IN_VEC)); \
            reg_type v_src6 = _mm_loadu_##flavor((const cast_type *)(src3)); \
            reg_type v_src7 = _mm_loadu_##flavor((const cast_type *)(src3 + ELEMS_IN_VEC)); \
 \
            _mm_interleave(v_src0, v_src1, v_src2, v_src3, \
                           v_src4, v_src5, v_src6, v_src7); \
 \
            _mm_storeu_##flavor((cast_type *)(dst), v_src0); \
            _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC), v_src1); \
            _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 2), v_src2); \
            _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 3), v_src3); \
            _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 4), v_src4); \
            _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 5), v_src5); \
            _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 6), v_src6); \
            _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 7), v_src7); \
        } \
 \
        bool support; \
    }

MERGE2_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128, CV_CPU_SSE2);
MERGE3_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128, CV_CPU_SSE2);
MERGE4_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128, CV_CPU_SSE2);

#if CV_SSE4_1
MERGE2_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128, CV_CPU_SSE4_1);
MERGE3_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128, CV_CPU_SSE4_1);
MERGE4_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128, CV_CPU_SSE4_1);
#endif

MERGE2_KERNEL_TEMPLATE(   int,  __m128,   float, _mm_interleave_ps, ps, CV_CPU_SSE2);
MERGE3_KERNEL_TEMPLATE(   int,  __m128,   float, _mm_interleave_ps, ps, CV_CPU_SSE2);
MERGE4_KERNEL_TEMPLATE(   int,  __m128,   float, _mm_interleave_ps, ps, CV_CPU_SSE2);

#endif

template<typename T> static void
merge_( const T** src, T* dst, int len, int cn )
{
    int k = cn % 4 ? cn % 4 : 4;
    int i, j;
    if( k == 1 )
    {
        const T* src0 = src[0];
        for( i = j = 0; i < len; i++, j += cn )
            dst[j] = src0[i];
    }
    else if( k == 2 )
    {
        const T *src0 = src[0], *src1 = src[1];
        i = j = 0;
#if CV_NEON
        if(cn == 2)
        {
            int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T);
            int inc_j = 2 * inc_i;

            VMerge2<T> vmerge;
            for( ; i < len - inc_i; i += inc_i, j += inc_j)
                vmerge(src0 + i, src1 + i, dst + j);
        }
#elif CV_SSE2
        if(cn == 2)
        {
            int inc_i = 32/sizeof(T);
            int inc_j = 2 * inc_i;

            VMerge2<T> vmerge;
            if (vmerge.support)
                for( ; i < len - inc_i; i += inc_i, j += inc_j)
                    vmerge(src0 + i, src1 + i, dst + j);
        }
#endif
        for( ; i < len; i++, j += cn )
        {
            dst[j] = src0[i];
            dst[j+1] = src1[i];
        }
    }
    else if( k == 3 )
    {
        const T *src0 = src[0], *src1 = src[1], *src2 = src[2];
        i = j = 0;
#if CV_NEON
        if(cn == 3)
        {
            int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T);
            int inc_j = 3 * inc_i;

            VMerge3<T> vmerge;
            for( ; i < len - inc_i; i += inc_i, j += inc_j)
                vmerge(src0 + i, src1 + i, src2 + i, dst + j);
        }
#elif CV_SSE2
        if(cn == 3)
        {
            int inc_i = 32/sizeof(T);
            int inc_j = 3 * inc_i;

            VMerge3<T> vmerge;
            if (vmerge.support)
                for( ; i < len - inc_i; i += inc_i, j += inc_j)
                    vmerge(src0 + i, src1 + i, src2 + i, dst + j);
        }
#endif
        for( ; i < len; i++, j += cn )
        {
            dst[j] = src0[i];
            dst[j+1] = src1[i];
            dst[j+2] = src2[i];
        }
    }
    else
    {
        const T *src0 = src[0], *src1 = src[1], *src2 = src[2], *src3 = src[3];
        i = j = 0;
#if CV_NEON
        if(cn == 4)
        {
            int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T);
            int inc_j = 4 * inc_i;

            VMerge4<T> vmerge;
            for( ; i < len - inc_i; i += inc_i, j += inc_j)
                vmerge(src0 + i, src1 + i, src2 + i, src3 + i, dst + j);
        }
#elif CV_SSE2
        if(cn == 4)
        {
            int inc_i = 32/sizeof(T);
            int inc_j = 4 * inc_i;

            VMerge4<T> vmerge;
            if (vmerge.support)
                for( ; i < len - inc_i; i += inc_i, j += inc_j)
                    vmerge(src0 + i, src1 + i, src2 + i, src3 + i, dst + j);
        }
#endif
        for( ; i < len; i++, j += cn )
        {
            dst[j] = src0[i]; dst[j+1] = src1[i];
            dst[j+2] = src2[i]; dst[j+3] = src3[i];
        }
    }

    for( ; k < cn; k += 4 )
    {
        const T *src0 = src[k], *src1 = src[k+1], *src2 = src[k+2], *src3 = src[k+3];
        for( i = 0, j = k; i < len; i++, j += cn )
        {
            dst[j] = src0[i]; dst[j+1] = src1[i];
            dst[j+2] = src2[i]; dst[j+3] = src3[i];
        }
    }
}

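// Illustrative sketch (not part of the library): merge_ is the inverse of split_,
// interleaving per-channel planes back into a packed buffer. Assuming uchar data
// and cn == 3:
//
//     uchar b[] = { 10,11,12 }, g[] = { 20,21,22 }, r[] = { 30,31,32 };
//     const uchar* planes[] = { b, g, r };
//     uchar dst[9];
//     merge_(planes, dst, 3, 3);   // dst = { 10,20,30, 11,21,31, 12,22,32 }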

static void split8u(const uchar* src, uchar** dst, int len, int cn )
{
    split_(src, dst, len, cn);
}

static void split16u(const ushort* src, ushort** dst, int len, int cn )
{
    split_(src, dst, len, cn);
}

static void split32s(const int* src, int** dst, int len, int cn )
{
    split_(src, dst, len, cn);
}

static void split64s(const int64* src, int64** dst, int len, int cn )
{
    split_(src, dst, len, cn);
}

static void merge8u(const uchar** src, uchar* dst, int len, int cn )
{
    merge_(src, dst, len, cn);
}

static void merge16u(const ushort** src, ushort* dst, int len, int cn )
{
    merge_(src, dst, len, cn);
}

static void merge32s(const int** src, int* dst, int len, int cn )
{
    merge_(src, dst, len, cn);
}

static void merge64s(const int64** src, int64* dst, int len, int cn )
{
    merge_(src, dst, len, cn);
}

typedef void (*SplitFunc)(const uchar* src, uchar** dst, int len, int cn);
typedef void (*MergeFunc)(const uchar** src, uchar* dst, int len, int cn);

static SplitFunc getSplitFunc(int depth)
{
    static SplitFunc splitTab[] =
    {
        (SplitFunc)GET_OPTIMIZED(split8u), (SplitFunc)GET_OPTIMIZED(split8u), (SplitFunc)GET_OPTIMIZED(split16u), (SplitFunc)GET_OPTIMIZED(split16u),
        (SplitFunc)GET_OPTIMIZED(split32s), (SplitFunc)GET_OPTIMIZED(split32s), (SplitFunc)GET_OPTIMIZED(split64s), 0
    };

    return splitTab[depth];
}

static MergeFunc getMergeFunc(int depth)
{
    static MergeFunc mergeTab[] =
    {
        (MergeFunc)GET_OPTIMIZED(merge8u), (MergeFunc)GET_OPTIMIZED(merge8u), (MergeFunc)GET_OPTIMIZED(merge16u), (MergeFunc)GET_OPTIMIZED(merge16u),
        (MergeFunc)GET_OPTIMIZED(merge32s), (MergeFunc)GET_OPTIMIZED(merge32s), (MergeFunc)GET_OPTIMIZED(merge64s), 0
    };

    return mergeTab[depth];
}

}

void cv::split(const Mat& src, Mat* mv)
{
    int k, depth = src.depth(), cn = src.channels();
    if( cn == 1 )
    {
        src.copyTo(mv[0]);
        return;
    }

    SplitFunc func = getSplitFunc(depth);
    CV_Assert( func != 0 );

    int esz = (int)src.elemSize(), esz1 = (int)src.elemSize1();
    int blocksize0 = (BLOCK_SIZE + esz-1)/esz;
    AutoBuffer<uchar> _buf((cn+1)*(sizeof(Mat*) + sizeof(uchar*)) + 16);
    const Mat** arrays = (const Mat**)(uchar*)_buf;
    uchar** ptrs = (uchar**)alignPtr(arrays + cn + 1, 16);

    arrays[0] = &src;
    for( k = 0; k < cn; k++ )
    {
        mv[k].create(src.dims, src.size, depth);
        arrays[k+1] = &mv[k];
    }

    NAryMatIterator it(arrays, ptrs, cn+1);
    int total = (int)it.size, blocksize = cn <= 4 ? total : std::min(total, blocksize0);

    for( size_t i = 0; i < it.nplanes; i++, ++it )
    {
        for( int j = 0; j < total; j += blocksize )
        {
            int bsz = std::min(total - j, blocksize);
            func( ptrs[0], &ptrs[1], bsz, cn );

            if( j + blocksize < total )
            {
                ptrs[0] += bsz*esz;
                for( k = 0; k < cn; k++ )
                    ptrs[k+1] += bsz*esz1;
            }
        }
    }
}

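// Usage sketch (illustrative only): splitting a 3-channel matrix into three
// single-channel planes through this overload.
//
//     cv::Mat bgr(480, 640, CV_8UC3, cv::Scalar(1, 2, 3));
//     cv::Mat planes[3];
//     cv::split(bgr, planes);   // planes[0] == 1, planes[1] == 2, planes[2] == 3
//     CV_Assert(planes[0].type() == CV_8UC1);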

#ifdef HAVE_OPENCL

namespace cv {

static bool ocl_split( InputArray _m, OutputArrayOfArrays _mv )
{
    int type = _m.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type),
            rowsPerWI = ocl::Device::getDefault().isIntel() ? 4 : 1;

    String dstargs, processelem, indexdecl;
    for (int i = 0; i < cn; ++i)
    {
        dstargs += format("DECLARE_DST_PARAM(%d)", i);
        indexdecl += format("DECLARE_INDEX(%d)", i);
        processelem += format("PROCESS_ELEM(%d)", i);
    }

    ocl::Kernel k("split", ocl::core::split_merge_oclsrc,
                  format("-D T=%s -D OP_SPLIT -D cn=%d -D DECLARE_DST_PARAMS=%s"
                         " -D PROCESS_ELEMS_N=%s -D DECLARE_INDEX_N=%s",
                         ocl::memopTypeToStr(depth), cn, dstargs.c_str(),
                         processelem.c_str(), indexdecl.c_str()));
    if (k.empty())
        return false;

    Size size = _m.size();
    _mv.create(cn, 1, depth);
    for (int i = 0; i < cn; ++i)
        _mv.create(size, depth, i);

    std::vector<UMat> dst;
    _mv.getUMatVector(dst);

    int argidx = k.set(0, ocl::KernelArg::ReadOnly(_m.getUMat()));
    for (int i = 0; i < cn; ++i)
        argidx = k.set(argidx, ocl::KernelArg::WriteOnlyNoSize(dst[i]));
    k.set(argidx, rowsPerWI);

    size_t globalsize[2] = { size.width, (size.height + rowsPerWI - 1) / rowsPerWI };
    return k.run(2, globalsize, NULL, false);
}

}

#endif

void cv::split(InputArray _m, OutputArrayOfArrays _mv)
{
    CV_OCL_RUN(_m.dims() <= 2 && _mv.isUMatVector(),
               ocl_split(_m, _mv))

    Mat m = _m.getMat();
    if( m.empty() )
    {
        _mv.release();
        return;
    }

    CV_Assert( !_mv.fixedType() || _mv.empty() || _mv.type() == m.depth() );

    Size size = m.size();
    int depth = m.depth(), cn = m.channels();
    _mv.create(cn, 1, depth);
    for (int i = 0; i < cn; ++i)
        _mv.create(size, depth, i);

    std::vector<Mat> dst;
    _mv.getMatVector(dst);

    split(m, &dst[0]);
}

void cv::merge(const Mat* mv, size_t n, OutputArray _dst)
{
    CV_Assert( mv && n > 0 );

    int depth = mv[0].depth();
    bool allch1 = true;
    int k, cn = 0;
    size_t i;

    for( i = 0; i < n; i++ )
    {
        CV_Assert(mv[i].size == mv[0].size && mv[i].depth() == depth);
        allch1 = allch1 && mv[i].channels() == 1;
        cn += mv[i].channels();
    }

    CV_Assert( 0 < cn && cn <= CV_CN_MAX );
    _dst.create(mv[0].dims, mv[0].size, CV_MAKETYPE(depth, cn));
    Mat dst = _dst.getMat();

    if( n == 1 )
    {
        mv[0].copyTo(dst);
        return;
    }

    if( !allch1 )
    {
        AutoBuffer<int> pairs(cn*2);
        int j, ni=0;

        for( i = 0, j = 0; i < n; i++, j += ni )
        {
            ni = mv[i].channels();
            for( k = 0; k < ni; k++ )
            {
                pairs[(j+k)*2] = j + k;
                pairs[(j+k)*2+1] = j + k;
            }
        }
        mixChannels( mv, n, &dst, 1, &pairs[0], cn );
        return;
    }

    size_t esz = dst.elemSize(), esz1 = dst.elemSize1();
    int blocksize0 = (int)((BLOCK_SIZE + esz-1)/esz);
    AutoBuffer<uchar> _buf((cn+1)*(sizeof(Mat*) + sizeof(uchar*)) + 16);
    const Mat** arrays = (const Mat**)(uchar*)_buf;
    uchar** ptrs = (uchar**)alignPtr(arrays + cn + 1, 16);

    arrays[0] = &dst;
    for( k = 0; k < cn; k++ )
        arrays[k+1] = &mv[k];

    NAryMatIterator it(arrays, ptrs, cn+1);
    int total = (int)it.size, blocksize = cn <= 4 ? total : std::min(total, blocksize0);
    MergeFunc func = getMergeFunc(depth);

    for( i = 0; i < it.nplanes; i++, ++it )
    {
        for( int j = 0; j < total; j += blocksize )
        {
            int bsz = std::min(total - j, blocksize);
            func( (const uchar**)&ptrs[1], ptrs[0], bsz, cn );

            if( j + blocksize < total )
            {
                ptrs[0] += bsz*esz;
                for( int t = 0; t < cn; t++ )
                    ptrs[t+1] += bsz*esz1;
            }
        }
    }
}

#ifdef HAVE_OPENCL

namespace cv {

static bool ocl_merge( InputArrayOfArrays _mv, OutputArray _dst )
{
    std::vector<UMat> src, ksrc;
    _mv.getUMatVector(src);
    CV_Assert(!src.empty());

    int type = src[0].type(), depth = CV_MAT_DEPTH(type),
            rowsPerWI = ocl::Device::getDefault().isIntel() ? 4 : 1;
    Size size = src[0].size();

    for (size_t i = 0, srcsize = src.size(); i < srcsize; ++i)
    {
        int itype = src[i].type(), icn = CV_MAT_CN(itype), idepth = CV_MAT_DEPTH(itype),
                esz1 = CV_ELEM_SIZE1(idepth);
        if (src[i].dims > 2)
            return false;

        CV_Assert(size == src[i].size() && depth == idepth);

        for (int cn = 0; cn < icn; ++cn)
        {
            UMat tsrc = src[i];
            tsrc.offset += cn * esz1;
            ksrc.push_back(tsrc);
        }
    }
    int dcn = (int)ksrc.size();

    String srcargs, processelem, cndecl, indexdecl;
    for (int i = 0; i < dcn; ++i)
    {
        srcargs += format("DECLARE_SRC_PARAM(%d)", i);
        processelem += format("PROCESS_ELEM(%d)", i);
        indexdecl += format("DECLARE_INDEX(%d)", i);
        cndecl += format(" -D scn%d=%d", i, ksrc[i].channels());
    }

    ocl::Kernel k("merge", ocl::core::split_merge_oclsrc,
                  format("-D OP_MERGE -D cn=%d -D T=%s -D DECLARE_SRC_PARAMS_N=%s"
                         " -D DECLARE_INDEX_N=%s -D PROCESS_ELEMS_N=%s%s",
                         dcn, ocl::memopTypeToStr(depth), srcargs.c_str(),
                         indexdecl.c_str(), processelem.c_str(), cndecl.c_str()));
    if (k.empty())
        return false;

    _dst.create(size, CV_MAKE_TYPE(depth, dcn));
    UMat dst = _dst.getUMat();

    int argidx = 0;
    for (int i = 0; i < dcn; ++i)
        argidx = k.set(argidx, ocl::KernelArg::ReadOnlyNoSize(ksrc[i]));
    argidx = k.set(argidx, ocl::KernelArg::WriteOnly(dst));
    k.set(argidx, rowsPerWI);

    size_t globalsize[2] = { dst.cols, (dst.rows + rowsPerWI - 1) / rowsPerWI };
    return k.run(2, globalsize, NULL, false);
}

}

#endif

void cv::merge(InputArrayOfArrays _mv, OutputArray _dst)
{
    CV_OCL_RUN(_mv.isUMatVector() && _dst.isUMat(),
               ocl_merge(_mv, _dst))

    std::vector<Mat> mv;
    _mv.getMatVector(mv);
    merge(!mv.empty() ? &mv[0] : 0, mv.size(), _dst);
}

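// Usage sketch (illustrative only): interleaving three single-channel planes
// into one 3-channel matrix.
//
//     std::vector<cv::Mat> planes(3);
//     for( int c = 0; c < 3; c++ )
//         planes[c] = cv::Mat(100, 100, CV_8UC1, cv::Scalar(c));
//     cv::Mat merged;
//     cv::merge(planes, merged);   // merged is CV_8UC3, every pixel == (0,1,2)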

/****************************************************************************************\
*                       Generalized split/merge: mixing channels                         *
\****************************************************************************************/

namespace cv
{

template<typename T> static void
mixChannels_( const T** src, const int* sdelta,
              T** dst, const int* ddelta,
              int len, int npairs )
{
    int i, k;
    for( k = 0; k < npairs; k++ )
    {
        const T* s = src[k];
        T* d = dst[k];
        int ds = sdelta[k], dd = ddelta[k];
        if( s )
        {
            for( i = 0; i <= len - 2; i += 2, s += ds*2, d += dd*2 )
            {
                T t0 = s[0], t1 = s[ds];
                d[0] = t0; d[dd] = t1;
            }
            if( i < len )
                d[0] = s[0];
        }
        else
        {
            for( i = 0; i <= len - 2; i += 2, d += dd*2 )
                d[0] = d[dd] = 0;
            if( i < len )
                d[0] = 0;
        }
    }
}


static void mixChannels8u( const uchar** src, const int* sdelta,
                           uchar** dst, const int* ddelta,
                           int len, int npairs )
{
    mixChannels_(src, sdelta, dst, ddelta, len, npairs);
}

static void mixChannels16u( const ushort** src, const int* sdelta,
                            ushort** dst, const int* ddelta,
                            int len, int npairs )
{
    mixChannels_(src, sdelta, dst, ddelta, len, npairs);
}

static void mixChannels32s( const int** src, const int* sdelta,
                            int** dst, const int* ddelta,
                            int len, int npairs )
{
    mixChannels_(src, sdelta, dst, ddelta, len, npairs);
}

static void mixChannels64s( const int64** src, const int* sdelta,
                            int64** dst, const int* ddelta,
                            int len, int npairs )
{
    mixChannels_(src, sdelta, dst, ddelta, len, npairs);
}

typedef void (*MixChannelsFunc)( const uchar** src, const int* sdelta,
                                 uchar** dst, const int* ddelta, int len, int npairs );

static MixChannelsFunc getMixchFunc(int depth)
{
    static MixChannelsFunc mixchTab[] =
    {
        (MixChannelsFunc)mixChannels8u, (MixChannelsFunc)mixChannels8u, (MixChannelsFunc)mixChannels16u,
        (MixChannelsFunc)mixChannels16u, (MixChannelsFunc)mixChannels32s, (MixChannelsFunc)mixChannels32s,
        (MixChannelsFunc)mixChannels64s, 0
    };

    return mixchTab[depth];
}

}

void cv::mixChannels( const Mat* src, size_t nsrcs, Mat* dst, size_t ndsts, const int* fromTo, size_t npairs )
{
    if( npairs == 0 )
        return;
    CV_Assert( src && nsrcs > 0 && dst && ndsts > 0 && fromTo && npairs > 0 );

    size_t i, j, k, esz1 = dst[0].elemSize1();
    int depth = dst[0].depth();

    AutoBuffer<uchar> buf((nsrcs + ndsts + 1)*(sizeof(Mat*) + sizeof(uchar*)) + npairs*(sizeof(uchar*)*2 + sizeof(int)*6));
    const Mat** arrays = (const Mat**)(uchar*)buf;
    uchar** ptrs = (uchar**)(arrays + nsrcs + ndsts);
    const uchar** srcs = (const uchar**)(ptrs + nsrcs + ndsts + 1);
    uchar** dsts = (uchar**)(srcs + npairs);
    int* tab = (int*)(dsts + npairs);
    int *sdelta = (int*)(tab + npairs*4), *ddelta = sdelta + npairs;

    for( i = 0; i < nsrcs; i++ )
        arrays[i] = &src[i];
    for( i = 0; i < ndsts; i++ )
        arrays[i + nsrcs] = &dst[i];
    ptrs[nsrcs + ndsts] = 0;

    for( i = 0; i < npairs; i++ )
    {
        int i0 = fromTo[i*2], i1 = fromTo[i*2+1];
        if( i0 >= 0 )
        {
            for( j = 0; j < nsrcs; i0 -= src[j].channels(), j++ )
                if( i0 < src[j].channels() )
                    break;
            CV_Assert(j < nsrcs && src[j].depth() == depth);
            tab[i*4] = (int)j; tab[i*4+1] = (int)(i0*esz1);
            sdelta[i] = src[j].channels();
        }
        else
        {
            tab[i*4] = (int)(nsrcs + ndsts); tab[i*4+1] = 0;
            sdelta[i] = 0;
        }

        for( j = 0; j < ndsts; i1 -= dst[j].channels(), j++ )
            if( i1 < dst[j].channels() )
                break;
        CV_Assert(i1 >= 0 && j < ndsts && dst[j].depth() == depth);
        tab[i*4+2] = (int)(j + nsrcs); tab[i*4+3] = (int)(i1*esz1);
        ddelta[i] = dst[j].channels();
    }

    NAryMatIterator it(arrays, ptrs, (int)(nsrcs + ndsts));
    int total = (int)it.size, blocksize = std::min(total, (int)((BLOCK_SIZE + esz1-1)/esz1));
    MixChannelsFunc func = getMixchFunc(depth);

    for( i = 0; i < it.nplanes; i++, ++it )
    {
        for( k = 0; k < npairs; k++ )
        {
            srcs[k] = ptrs[tab[k*4]] + tab[k*4+1];
            dsts[k] = ptrs[tab[k*4+2]] + tab[k*4+3];
        }

        for( int t = 0; t < total; t += blocksize )
        {
            int bsz = std::min(total - t, blocksize);
            func( srcs, sdelta, dsts, ddelta, bsz, (int)npairs );

            if( t + blocksize < total )
                for( k = 0; k < npairs; k++ )
                {
                    srcs[k] += blocksize*sdelta[k]*esz1;
                    dsts[k] += blocksize*ddelta[k]*esz1;
                }
        }
    }
}

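// Usage sketch (illustrative only): one mixChannels call can split a BGRA image
// into a BGR image plus a separate alpha plane. The destination matrices must be
// preallocated; only headers are copied into out[], not pixel data.
//
//     cv::Mat bgra(100, 100, CV_8UC4, cv::Scalar(255, 0, 0, 255));
//     cv::Mat bgr(bgra.rows, bgra.cols, CV_8UC3);
//     cv::Mat alpha(bgra.rows, bgra.cols, CV_8UC1);
//     cv::Mat out[] = { bgr, alpha };
//     // bgra[0] -> bgr[2], bgra[1] -> bgr[1], bgra[2] -> bgr[0], bgra[3] -> alpha[0]
//     int from_to[] = { 0,2, 1,1, 2,0, 3,3 };
//     cv::mixChannels(&bgra, 1, out, 2, from_to, 4);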

#ifdef HAVE_OPENCL

namespace cv {

static void getUMatIndex(const std::vector<UMat> & um, int cn, int & idx, int & cnidx)
{
    int totalChannels = 0;
    for (size_t i = 0, size = um.size(); i < size; ++i)
    {
        int ccn = um[i].channels();
        totalChannels += ccn;

        if (totalChannels == cn)
        {
            idx = (int)(i + 1);
            cnidx = 0;
            return;
        }
        else if (totalChannels > cn)
        {
            idx = (int)i;
            cnidx = i == 0 ? cn : (cn - totalChannels + ccn);
            return;
        }
    }

    idx = cnidx = -1;
}

static bool ocl_mixChannels(InputArrayOfArrays _src, InputOutputArrayOfArrays _dst,
                            const int* fromTo, size_t npairs)
{
    std::vector<UMat> src, dst;
    _src.getUMatVector(src);
    _dst.getUMatVector(dst);

    size_t nsrc = src.size(), ndst = dst.size();
    CV_Assert(nsrc > 0 && ndst > 0);

    Size size = src[0].size();
    int depth = src[0].depth(), esz = CV_ELEM_SIZE(depth),
            rowsPerWI = ocl::Device::getDefault().isIntel() ? 4 : 1;

    for (size_t i = 1, ssize = src.size(); i < ssize; ++i)
        CV_Assert(src[i].size() == size && src[i].depth() == depth);
    for (size_t i = 0, dsize = dst.size(); i < dsize; ++i)
        CV_Assert(dst[i].size() == size && dst[i].depth() == depth);

    String declsrc, decldst, declproc, declcn, indexdecl;
    std::vector<UMat> srcargs(npairs), dstargs(npairs);

    for (size_t i = 0; i < npairs; ++i)
    {
        int scn = fromTo[i<<1], dcn = fromTo[(i<<1) + 1];
        int src_idx, src_cnidx, dst_idx, dst_cnidx;

        getUMatIndex(src, scn, src_idx, src_cnidx);
        getUMatIndex(dst, dcn, dst_idx, dst_cnidx);

        CV_Assert(dst_idx >= 0 && src_idx >= 0);

        srcargs[i] = src[src_idx];
        srcargs[i].offset += src_cnidx * esz;

        dstargs[i] = dst[dst_idx];
        dstargs[i].offset += dst_cnidx * esz;

        declsrc += format("DECLARE_INPUT_MAT(%d)", i);
        decldst += format("DECLARE_OUTPUT_MAT(%d)", i);
        indexdecl += format("DECLARE_INDEX(%d)", i);
        declproc += format("PROCESS_ELEM(%d)", i);
        declcn += format(" -D scn%d=%d -D dcn%d=%d", i, src[src_idx].channels(), i, dst[dst_idx].channels());
    }

    ocl::Kernel k("mixChannels", ocl::core::mixchannels_oclsrc,
                  format("-D T=%s -D DECLARE_INPUT_MAT_N=%s -D DECLARE_OUTPUT_MAT_N=%s"
                         " -D PROCESS_ELEM_N=%s -D DECLARE_INDEX_N=%s%s",
                         ocl::memopTypeToStr(depth), declsrc.c_str(), decldst.c_str(),
                         declproc.c_str(), indexdecl.c_str(), declcn.c_str()));
    if (k.empty())
        return false;

    int argindex = 0;
    for (size_t i = 0; i < npairs; ++i)
        argindex = k.set(argindex, ocl::KernelArg::ReadOnlyNoSize(srcargs[i]));
    for (size_t i = 0; i < npairs; ++i)
        argindex = k.set(argindex, ocl::KernelArg::WriteOnlyNoSize(dstargs[i]));
    argindex = k.set(argindex, size.height);
    argindex = k.set(argindex, size.width);
    k.set(argindex, rowsPerWI);

    size_t globalsize[2] = { size.width, (size.height + rowsPerWI - 1) / rowsPerWI };
    return k.run(2, globalsize, NULL, false);
}

}

#endif

void cv::mixChannels(InputArrayOfArrays src, InputOutputArrayOfArrays dst,
                     const int* fromTo, size_t npairs)
{
    if (npairs == 0 || fromTo == NULL)
        return;

    CV_OCL_RUN(dst.isUMatVector(),
               ocl_mixChannels(src, dst, fromTo, npairs))

    bool src_is_mat = src.kind() != _InputArray::STD_VECTOR_MAT &&
                      src.kind() != _InputArray::STD_VECTOR_VECTOR &&
                      src.kind() != _InputArray::STD_VECTOR_UMAT;
    bool dst_is_mat = dst.kind() != _InputArray::STD_VECTOR_MAT &&
                      dst.kind() != _InputArray::STD_VECTOR_VECTOR &&
                      dst.kind() != _InputArray::STD_VECTOR_UMAT;
    int i;
    int nsrc = src_is_mat ? 1 : (int)src.total();
    int ndst = dst_is_mat ? 1 : (int)dst.total();

    CV_Assert(nsrc > 0 && ndst > 0);
    cv::AutoBuffer<Mat> _buf(nsrc + ndst);
    Mat* buf = _buf;
    for( i = 0; i < nsrc; i++ )
        buf[i] = src.getMat(src_is_mat ? -1 : i);
    for( i = 0; i < ndst; i++ )
        buf[nsrc + i] = dst.getMat(dst_is_mat ? -1 : i);
    mixChannels(&buf[0], nsrc, &buf[nsrc], ndst, fromTo, npairs);
}

void cv::mixChannels(InputArrayOfArrays src, InputOutputArrayOfArrays dst,
                     const std::vector<int>& fromTo)
{
    if (fromTo.empty())
        return;

    CV_OCL_RUN(dst.isUMatVector(),
               ocl_mixChannels(src, dst, &fromTo[0], fromTo.size()>>1))

    bool src_is_mat = src.kind() != _InputArray::STD_VECTOR_MAT &&
                      src.kind() != _InputArray::STD_VECTOR_VECTOR &&
                      src.kind() != _InputArray::STD_VECTOR_UMAT;
    bool dst_is_mat = dst.kind() != _InputArray::STD_VECTOR_MAT &&
                      dst.kind() != _InputArray::STD_VECTOR_VECTOR &&
                      dst.kind() != _InputArray::STD_VECTOR_UMAT;
    int i;
    int nsrc = src_is_mat ? 1 : (int)src.total();
    int ndst = dst_is_mat ? 1 : (int)dst.total();

    CV_Assert(fromTo.size()%2 == 0 && nsrc > 0 && ndst > 0);
    cv::AutoBuffer<Mat> _buf(nsrc + ndst);
    Mat* buf = _buf;
    for( i = 0; i < nsrc; i++ )
        buf[i] = src.getMat(src_is_mat ? -1 : i);
    for( i = 0; i < ndst; i++ )
        buf[nsrc + i] = dst.getMat(dst_is_mat ? -1 : i);
    mixChannels(&buf[0], nsrc, &buf[nsrc], ndst, &fromTo[0], fromTo.size()/2);
}

void cv::extractChannel(InputArray _src, OutputArray _dst, int coi)
{
    int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
    CV_Assert( 0 <= coi && coi < cn );
    int ch[] = { coi, 0 };

    if (ocl::useOpenCL() && _src.dims() <= 2 && _dst.isUMat())
    {
        UMat src = _src.getUMat();
        _dst.create(src.dims, &src.size[0], depth);
        UMat dst = _dst.getUMat();
        mixChannels(std::vector<UMat>(1, src), std::vector<UMat>(1, dst), ch, 1);
        return;
    }

    Mat src = _src.getMat();
    _dst.create(src.dims, &src.size[0], depth);
    Mat dst = _dst.getMat();
    mixChannels(&src, 1, &dst, 1, ch, 1);
}

void cv::insertChannel(InputArray _src, InputOutputArray _dst, int coi)
{
    int stype = _src.type(), sdepth = CV_MAT_DEPTH(stype), scn = CV_MAT_CN(stype);
    int dtype = _dst.type(), ddepth = CV_MAT_DEPTH(dtype), dcn = CV_MAT_CN(dtype);
    CV_Assert( _src.sameSize(_dst) && sdepth == ddepth );
    CV_Assert( 0 <= coi && coi < dcn && scn == 1 );

    int ch[] = { 0, coi };
    if (ocl::useOpenCL() && _src.dims() <= 2 && _dst.isUMat())
    {
        UMat src = _src.getUMat(), dst = _dst.getUMat();
        mixChannels(std::vector<UMat>(1, src), std::vector<UMat>(1, dst), ch, 1);
        return;
    }

    Mat src = _src.getMat(), dst = _dst.getMat();
    mixChannels(&src, 1, &dst, 1, ch, 1);
}

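// Usage sketch (illustrative only): extractChannel and insertChannel are thin
// wrappers over mixChannels with a single { coi, 0 } (resp. { 0, coi }) pair.
//
//     cv::Mat rgb(64, 64, CV_8UC3, cv::Scalar(10, 20, 30));
//     cv::Mat green;
//     cv::extractChannel(rgb, green, 1);   // green is CV_8UC1, all values == 20
//     cv::insertChannel(green, rgb, 2);    // overwrite channel 2 of rgb with green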

/****************************************************************************************\
*                                convertScale[Abs]                                       *
\****************************************************************************************/

namespace cv
{

template<typename T, typename DT, typename WT>
struct cvtScaleAbs_SIMD
{
    int operator () (const T *, DT *, int, WT, WT) const
    {
        return 0;
    }
};

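// Scalar reference for the SIMD specializations below (illustrative sketch only;
// cvtScaleAbs_scalar_ref is a hypothetical helper, not part of the library):
// each element is computed as saturate_cast<DT>(|src[i]*scale + shift|).
//
//     template<typename T, typename DT, typename WT> static inline
//     void cvtScaleAbs_scalar_ref(const T* src, DT* dst, int len, WT scale, WT shift)
//     {
//         for( int i = 0; i < len; i++ )
//             dst[i] = saturate_cast<DT>(std::abs(src[i]*scale + shift));
//     }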

#if CV_SSE2

template <>
struct cvtScaleAbs_SIMD<uchar, uchar, float>
{
    int operator () (const uchar * src, uchar * dst, int width,
                     float scale, float shift) const
    {
        int x = 0;

        if (USE_SSE2)
        {
            __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift),
                v_zero_f = _mm_setzero_ps();
            __m128i v_zero_i = _mm_setzero_si128();

            for ( ; x <= width - 16; x += 16)
            {
                __m128i v_src = _mm_loadu_si128((const __m128i *)(src + x));
                __m128i v_src12 = _mm_unpacklo_epi8(v_src, v_zero_i), v_src_34 = _mm_unpackhi_epi8(v_src, v_zero_i);
                __m128 v_dst1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src12, v_zero_i)), v_scale), v_shift);
                v_dst1 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst1), v_dst1);
                __m128 v_dst2 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src12, v_zero_i)), v_scale), v_shift);
                v_dst2 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst2), v_dst2);
                __m128 v_dst3 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src_34, v_zero_i)), v_scale), v_shift);
                v_dst3 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst3), v_dst3);
                __m128 v_dst4 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src_34, v_zero_i)), v_scale), v_shift);
                v_dst4 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst4), v_dst4);

                __m128i v_dst_i = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2)),
                                                   _mm_packs_epi32(_mm_cvtps_epi32(v_dst3), _mm_cvtps_epi32(v_dst4)));
                _mm_storeu_si128((__m128i *)(dst + x), v_dst_i);
            }
        }

        return x;
    }
};

template <>
struct cvtScaleAbs_SIMD<schar, uchar, float>
{
    int operator () (const schar * src, uchar * dst, int width,
                     float scale, float shift) const
    {
        int x = 0;

        if (USE_SSE2)
        {
            __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift),
                v_zero_f = _mm_setzero_ps();
            __m128i v_zero_i = _mm_setzero_si128();

            for ( ; x <= width - 16; x += 16)
            {
                __m128i v_src = _mm_loadu_si128((const __m128i *)(src + x));
                __m128i v_src_12 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero_i, v_src), 8),
                        v_src_34 = _mm_srai_epi16(_mm_unpackhi_epi8(v_zero_i, v_src), 8);
                __m128 v_dst1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(
                    _mm_srai_epi32(_mm_unpacklo_epi16(v_zero_i, v_src_12), 16)), v_scale), v_shift);
                v_dst1 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst1), v_dst1);
                __m128 v_dst2 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(
                    _mm_srai_epi32(_mm_unpackhi_epi16(v_zero_i, v_src_12), 16)), v_scale), v_shift);
                v_dst2 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst2), v_dst2);
                __m128 v_dst3 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(
                    _mm_srai_epi32(_mm_unpacklo_epi16(v_zero_i, v_src_34), 16)), v_scale), v_shift);
                v_dst3 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst3), v_dst3);
                __m128 v_dst4 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(
                    _mm_srai_epi32(_mm_unpackhi_epi16(v_zero_i, v_src_34), 16)), v_scale), v_shift);
                v_dst4 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst4), v_dst4);

                __m128i v_dst_i = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2)),
                                                   _mm_packs_epi32(_mm_cvtps_epi32(v_dst3), _mm_cvtps_epi32(v_dst4)));
                _mm_storeu_si128((__m128i *)(dst + x), v_dst_i);
            }
        }

        return x;
    }
};

template <>
struct cvtScaleAbs_SIMD<ushort, uchar, float>
{
    int operator () (const ushort * src, uchar * dst, int width,
                     float scale, float shift) const
    {
        int x = 0;

        if (USE_SSE2)
        {
            __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift),
                v_zero_f = _mm_setzero_ps();
            __m128i v_zero_i = _mm_setzero_si128();

            for ( ; x <= width - 8; x += 8)
            {
                __m128i v_src = _mm_loadu_si128((const __m128i *)(src + x));
                __m128 v_dst1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero_i)), v_scale), v_shift);
                v_dst1 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst1), v_dst1);
                __m128 v_dst2 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero_i)), v_scale), v_shift);
                v_dst2 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst2), v_dst2);

                __m128i v_dst_i = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2)), v_zero_i);
                _mm_storel_epi64((__m128i *)(dst + x), v_dst_i);
            }
        }

        return x;
    }
};

template <>
struct cvtScaleAbs_SIMD<short, uchar, float>
{
    int operator () (const short * src, uchar * dst, int width,
                     float scale, float shift) const
    {
        int x = 0;

        if (USE_SSE2)
        {
            __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift),
                v_zero_f = _mm_setzero_ps();
            __m128i v_zero_i = _mm_setzero_si128();

            for ( ; x <= width - 8; x += 8)
            {
                __m128i v_src = _mm_loadu_si128((const __m128i *)(src + x));
                __m128 v_dst1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_src, v_src), 16)), v_scale), v_shift);
                v_dst1 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst1), v_dst1);
                __m128 v_dst2 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_src, v_src), 16)), v_scale), v_shift);
                v_dst2 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst2), v_dst2);

                __m128i v_dst_i = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2)), v_zero_i);
                _mm_storel_epi64((__m128i *)(dst + x), v_dst_i);
            }
        }

        return x;
    }
};

template <>
struct cvtScaleAbs_SIMD<int, uchar, float>
{
    int operator () (const int * src, uchar * dst, int width,
                     float scale, float shift) const
    {
        int x = 0;

        if (USE_SSE2)
        {
            __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift),
                v_zero_f = _mm_setzero_ps();
            __m128i v_zero_i = _mm_setzero_si128();

            for ( ; x <= width - 8; x += 4)
            {
                __m128i v_src = _mm_loadu_si128((const __m128i *)(src + x));
                __m128 v_dst1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift);
                v_dst1 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst1), v_dst1);

                __m128i v_dst_i = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(v_dst1), v_zero_i), v_zero_i);
                _mm_storel_epi64((__m128i *)(dst + x), v_dst_i);
            }
        }

        return x;
    }
};

template <>
struct cvtScaleAbs_SIMD<float, uchar, float>
{
    int operator () (const float * src, uchar * dst, int width,
                     float scale, float shift) const
    {
        int x = 0;

        if (USE_SSE2)
        {
            __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift),
                v_zero_f = _mm_setzero_ps();
            __m128i v_zero_i = _mm_setzero_si128();

            for ( ; x <= width - 8; x += 4)
            {
                __m128 v_dst = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src + x), v_scale), v_shift);
                v_dst = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst), v_dst);

                __m128i v_dst_i = _mm_packs_epi32(_mm_cvtps_epi32(v_dst), v_zero_i);
                _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst_i, v_zero_i));
            }
        }

        return x;
    }
};

template <>
struct cvtScaleAbs_SIMD<double, uchar, float>
{
    int operator () (const double * src, uchar * dst, int width,
                     float scale, float shift) const
    {
        int x = 0;

        if (USE_SSE2)
        {
            __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift),
                v_zero_f = _mm_setzero_ps();
            __m128i v_zero_i = _mm_setzero_si128();

            for ( ; x <= width - 8; x += 8)
            {
                __m128 v_src1 = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x)),
                                              _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2)));
                __m128 v_src2 = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x + 4)),
                                              _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6)));

                __m128 v_dst1 = _mm_add_ps(_mm_mul_ps(v_src1, v_scale), v_shift);
                v_dst1 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst1), v_dst1);

                __m128 v_dst2 = _mm_add_ps(_mm_mul_ps(v_src2, v_scale), v_shift);
                v_dst2 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst2), v_dst2);

                __m128i v_dst_i = _mm_packs_epi32(_mm_cvtps_epi32(v_dst1),
                                                  _mm_cvtps_epi32(v_dst2));

                _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst_i, v_zero_i));
            }
        }

        return x;
    }
};

vmulq_n_f32(vcvtq_f32_s32(v_quat), scale); 1777 v_dst_3 = vabsq_f32(vaddq_f32(v_dst_3, v_shift)); 1778 1779 uint16x8_t v_dsti_0 = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst_0)), 1780 vqmovn_u32(cv_vrndq_u32_f32(v_dst_1))); 1781 uint16x8_t v_dsti_1 = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst_2)), 1782 vqmovn_u32(cv_vrndq_u32_f32(v_dst_3))); 1783 1784 vst1q_u8(dst + x, vcombine_u8(vqmovn_u16(v_dsti_0), vqmovn_u16(v_dsti_1))); 1785 } 1786 1787 return x; 1788 } 1789 }; 1790 1791 template <> 1792 struct cvtScaleAbs_SIMD<ushort, uchar, float> 1793 { 1794 int operator () (const ushort * src, uchar * dst, int width, 1795 float scale, float shift) const 1796 { 1797 int x = 0; 1798 float32x4_t v_shift = vdupq_n_f32(shift); 1799 1800 for ( ; x <= width - 8; x += 8) 1801 { 1802 uint16x8_t v_src = vld1q_u16(src + x); 1803 1804 uint32x4_t v_half = vmovl_u16(vget_low_u16(v_src)); 1805 float32x4_t v_dst_0 = vmulq_n_f32(vcvtq_f32_u32(v_half), scale); 1806 v_dst_0 = vabsq_f32(vaddq_f32(v_dst_0, v_shift)); 1807 1808 v_half = vmovl_u16(vget_high_u16(v_src)); 1809 float32x4_t v_dst_1 = vmulq_n_f32(vcvtq_f32_u32(v_half), scale); 1810 v_dst_1 = vabsq_f32(vaddq_f32(v_dst_1, v_shift)); 1811 1812 uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst_0)), 1813 vqmovn_u32(cv_vrndq_u32_f32(v_dst_1))); 1814 1815 vst1_u8(dst + x, vqmovn_u16(v_dst)); 1816 } 1817 1818 return x; 1819 } 1820 }; 1821 1822 template <> 1823 struct cvtScaleAbs_SIMD<short, uchar, float> 1824 { 1825 int operator () (const short * src, uchar * dst, int width, 1826 float scale, float shift) const 1827 { 1828 int x = 0; 1829 float32x4_t v_shift = vdupq_n_f32(shift); 1830 1831 for ( ; x <= width - 8; x += 8) 1832 { 1833 int16x8_t v_src = vld1q_s16(src + x); 1834 1835 int32x4_t v_half = vmovl_s16(vget_low_s16(v_src)); 1836 float32x4_t v_dst_0 = vmulq_n_f32(vcvtq_f32_s32(v_half), scale); 1837 v_dst_0 = vabsq_f32(vaddq_f32(v_dst_0, v_shift)); 1838 1839 v_half = vmovl_s16(vget_high_s16(v_src)); 1840 float32x4_t v_dst_1 = vmulq_n_f32(vcvtq_f32_s32(v_half), scale); 1841 v_dst_1 = vabsq_f32(vaddq_f32(v_dst_1, v_shift)); 1842 1843 uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst_0)), 1844 vqmovn_u32(cv_vrndq_u32_f32(v_dst_1))); 1845 1846 vst1_u8(dst + x, vqmovn_u16(v_dst)); 1847 } 1848 1849 return x; 1850 } 1851 }; 1852 1853 template <> 1854 struct cvtScaleAbs_SIMD<int, uchar, float> 1855 { 1856 int operator () (const int * src, uchar * dst, int width, 1857 float scale, float shift) const 1858 { 1859 int x = 0; 1860 float32x4_t v_shift = vdupq_n_f32(shift); 1861 1862 for ( ; x <= width - 8; x += 8) 1863 { 1864 float32x4_t v_dst_0 = vmulq_n_f32(vcvtq_f32_s32(vld1q_s32(src + x)), scale); 1865 v_dst_0 = vabsq_f32(vaddq_f32(v_dst_0, v_shift)); 1866 uint16x4_t v_dsti_0 = vqmovn_u32(cv_vrndq_u32_f32(v_dst_0)); 1867 1868 float32x4_t v_dst_1 = vmulq_n_f32(vcvtq_f32_s32(vld1q_s32(src + x + 4)), scale); 1869 v_dst_1 = vabsq_f32(vaddq_f32(v_dst_1, v_shift)); 1870 uint16x4_t v_dsti_1 = vqmovn_u32(cv_vrndq_u32_f32(v_dst_1)); 1871 1872 uint16x8_t v_dst = vcombine_u16(v_dsti_0, v_dsti_1); 1873 vst1_u8(dst + x, vqmovn_u16(v_dst)); 1874 } 1875 1876 return x; 1877 } 1878 }; 1879 1880 template <> 1881 struct cvtScaleAbs_SIMD<float, uchar, float> 1882 { 1883 int operator () (const float * src, uchar * dst, int width, 1884 float scale, float shift) const 1885 { 1886 int x = 0; 1887 float32x4_t v_shift = vdupq_n_f32(shift); 1888 1889 for ( ; x <= width - 8; x += 8) 1890 { 1891 float32x4_t v_dst_0 = vmulq_n_f32(vld1q_f32(src + x), scale); 1892 
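            // Unlike the SSE2 branch above, which has no absolute-value intrinsic and
            // builds |v| as _mm_max_ps(0 - v, v), NEON can take vabsq_f32 directly on
            // v*scale + shift; cv_vrndq_u32_f32 + vqmovn then perform the rounding and
            // saturating narrowing that implement saturate_cast<uchar>.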
v_dst_0 = vabsq_f32(vaddq_f32(v_dst_0, v_shift)); 1893 uint16x4_t v_dsti_0 = vqmovn_u32(cv_vrndq_u32_f32(v_dst_0)); 1894 1895 float32x4_t v_dst_1 = vmulq_n_f32(vld1q_f32(src + x + 4), scale); 1896 v_dst_1 = vabsq_f32(vaddq_f32(v_dst_1, v_shift)); 1897 uint16x4_t v_dsti_1 = vqmovn_u32(cv_vrndq_u32_f32(v_dst_1)); 1898 1899 uint16x8_t v_dst = vcombine_u16(v_dsti_0, v_dsti_1); 1900 vst1_u8(dst + x, vqmovn_u16(v_dst)); 1901 } 1902 1903 return x; 1904 } 1905 }; 1906 1907 #endif 1908 1909 template<typename T, typename DT, typename WT> static void 1910 cvtScaleAbs_( const T* src, size_t sstep, 1911 DT* dst, size_t dstep, Size size, 1912 WT scale, WT shift ) 1913 { 1914 sstep /= sizeof(src[0]); 1915 dstep /= sizeof(dst[0]); 1916 cvtScaleAbs_SIMD<T, DT, WT> vop; 1917 1918 for( ; size.height--; src += sstep, dst += dstep ) 1919 { 1920 int x = vop(src, dst, size.width, scale, shift); 1921 1922 #if CV_ENABLE_UNROLLED 1923 for( ; x <= size.width - 4; x += 4 ) 1924 { 1925 DT t0, t1; 1926 t0 = saturate_cast<DT>(std::abs(src[x]*scale + shift)); 1927 t1 = saturate_cast<DT>(std::abs(src[x+1]*scale + shift)); 1928 dst[x] = t0; dst[x+1] = t1; 1929 t0 = saturate_cast<DT>(std::abs(src[x+2]*scale + shift)); 1930 t1 = saturate_cast<DT>(std::abs(src[x+3]*scale + shift)); 1931 dst[x+2] = t0; dst[x+3] = t1; 1932 } 1933 #endif 1934 for( ; x < size.width; x++ ) 1935 dst[x] = saturate_cast<DT>(std::abs(src[x]*scale + shift)); 1936 } 1937 } 1938 1939 template <typename T, typename DT, typename WT> 1940 struct cvtScale_SIMD 1941 { 1942 int operator () (const T *, DT *, int, WT, WT) const 1943 { 1944 return 0; 1945 } 1946 }; 1947 1948 #if CV_SSE2 1949 1950 // from uchar 1951 1952 template <> 1953 struct cvtScale_SIMD<uchar, uchar, float> 1954 { 1955 int operator () (const uchar * src, uchar * dst, int width, float scale, float shift) const 1956 { 1957 int x = 0; 1958 1959 if (!USE_SSE2) 1960 return x; 1961 1962 __m128i v_zero = _mm_setzero_si128(); 1963 __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); 1964 1965 for ( ; x <= width - 8; x += 8) 1966 { 1967 __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src + x)), v_zero); 1968 __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero)); 1969 __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); 1970 1971 v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero)); 1972 __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); 1973 1974 __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), 1975 _mm_cvtps_epi32(v_dst_1)); 1976 _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst, v_zero)); 1977 } 1978 1979 return x; 1980 } 1981 }; 1982 1983 template <> 1984 struct cvtScale_SIMD<uchar, schar, float> 1985 { 1986 int operator () (const uchar * src, schar * dst, int width, float scale, float shift) const 1987 { 1988 int x = 0; 1989 1990 if (!USE_SSE2) 1991 return x; 1992 1993 __m128i v_zero = _mm_setzero_si128(); 1994 __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); 1995 1996 for ( ; x <= width - 8; x += 8) 1997 { 1998 __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src + x)), v_zero); 1999 __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero)); 2000 __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); 2001 2002 v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero)); 2003 __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); 2004 2005 __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), 2006 
_mm_cvtps_epi32(v_dst_1)); 2007 _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst, v_zero)); 2008 } 2009 2010 return x; 2011 } 2012 }; 2013 2014 #if CV_SSE4_1 2015 2016 template <> 2017 struct cvtScale_SIMD<uchar, ushort, float> 2018 { 2019 cvtScale_SIMD() 2020 { 2021 haveSSE = checkHardwareSupport(CV_CPU_SSE4_1); 2022 } 2023 2024 int operator () (const uchar * src, ushort * dst, int width, float scale, float shift) const 2025 { 2026 int x = 0; 2027 2028 if (!haveSSE) 2029 return x; 2030 2031 __m128i v_zero = _mm_setzero_si128(); 2032 __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); 2033 2034 for ( ; x <= width - 8; x += 8) 2035 { 2036 __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src + x)), v_zero); 2037 __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero)); 2038 __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); 2039 2040 v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero)); 2041 __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); 2042 2043 __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0), 2044 _mm_cvtps_epi32(v_dst_1)); 2045 _mm_storeu_si128((__m128i *)(dst + x), v_dst); 2046 } 2047 2048 return x; 2049 } 2050 2051 bool haveSSE; 2052 }; 2053 2054 #endif 2055 2056 template <> 2057 struct cvtScale_SIMD<uchar, short, float> 2058 { 2059 int operator () (const uchar * src, short * dst, int width, float scale, float shift) const 2060 { 2061 int x = 0; 2062 2063 if (!USE_SSE2) 2064 return x; 2065 2066 __m128i v_zero = _mm_setzero_si128(); 2067 __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); 2068 2069 for ( ; x <= width - 8; x += 8) 2070 { 2071 __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src + x)), v_zero); 2072 __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero)); 2073 __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); 2074 2075 v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero)); 2076 __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); 2077 2078 __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), 2079 _mm_cvtps_epi32(v_dst_1)); 2080 _mm_storeu_si128((__m128i *)(dst + x), v_dst); 2081 } 2082 2083 return x; 2084 } 2085 }; 2086 2087 template <> 2088 struct cvtScale_SIMD<uchar, int, float> 2089 { 2090 int operator () (const uchar * src, int * dst, int width, float scale, float shift) const 2091 { 2092 int x = 0; 2093 2094 if (!USE_SSE2) 2095 return x; 2096 2097 __m128i v_zero = _mm_setzero_si128(); 2098 __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); 2099 2100 for ( ; x <= width - 8; x += 8) 2101 { 2102 __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src + x)), v_zero); 2103 __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero)); 2104 __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); 2105 2106 v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero)); 2107 __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); 2108 2109 _mm_storeu_si128((__m128i *)(dst + x), _mm_cvtps_epi32(v_dst_0)); 2110 _mm_storeu_si128((__m128i *)(dst + x + 4), _mm_cvtps_epi32(v_dst_1)); 2111 } 2112 2113 return x; 2114 } 2115 }; 2116 2117 template <> 2118 struct cvtScale_SIMD<uchar, float, float> 2119 { 2120 int operator () (const uchar * src, float * dst, int width, float scale, float shift) const 2121 { 2122 int x = 0; 2123 2124 if (!USE_SSE2) 2125 return x; 2126 2127 __m128i v_zero = _mm_setzero_si128(); 2128 __m128 
v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); 2129 2130 for ( ; x <= width - 8; x += 8) 2131 { 2132 __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src + x)), v_zero); 2133 __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero)); 2134 __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); 2135 2136 v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero)); 2137 __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); 2138 2139 _mm_storeu_ps(dst + x, v_dst_0); 2140 _mm_storeu_ps(dst + x + 4, v_dst_1); 2141 } 2142 2143 return x; 2144 } 2145 }; 2146 2147 template <> 2148 struct cvtScale_SIMD<uchar, double, double> 2149 { 2150 int operator () (const uchar * src, double * dst, int width, double scale, double shift) const 2151 { 2152 int x = 0; 2153 2154 if (!USE_SSE2) 2155 return x; 2156 2157 __m128i v_zero = _mm_setzero_si128(); 2158 __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift); 2159 2160 for ( ; x <= width - 8; x += 8) 2161 { 2162 __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src + x)), v_zero); 2163 2164 __m128i v_src_s32 = _mm_unpacklo_epi16(v_src, v_zero); 2165 __m128d v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src_s32), v_scale), v_shift); 2166 __m128d v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(v_src_s32, 8)), v_scale), v_shift); 2167 _mm_storeu_pd(dst + x, v_dst_0); 2168 _mm_storeu_pd(dst + x + 2, v_dst_1); 2169 2170 v_src_s32 = _mm_unpackhi_epi16(v_src, v_zero); 2171 v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src_s32), v_scale), v_shift); 2172 v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(v_src_s32, 8)), v_scale), v_shift); 2173 _mm_storeu_pd(dst + x + 4, v_dst_0); 2174 _mm_storeu_pd(dst + x + 6, v_dst_1); 2175 } 2176 2177 return x; 2178 } 2179 }; 2180 2181 // from schar 2182 2183 template <> 2184 struct cvtScale_SIMD<schar, uchar, float> 2185 { 2186 int operator () (const schar * src, uchar * dst, int width, float scale, float shift) const 2187 { 2188 int x = 0; 2189 2190 if (!USE_SSE2) 2191 return x; 2192 2193 __m128i v_zero = _mm_setzero_si128(); 2194 __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); 2195 2196 for ( ; x <= width - 8; x += 8) 2197 { 2198 __m128i v_src = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src + x))), 8); 2199 __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16)); 2200 __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); 2201 2202 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16)); 2203 __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); 2204 2205 __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), 2206 _mm_cvtps_epi32(v_dst_1)); 2207 _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst, v_zero)); 2208 } 2209 2210 return x; 2211 } 2212 }; 2213 2214 template <> 2215 struct cvtScale_SIMD<schar, schar, float> 2216 { 2217 int operator () (const schar * src, schar * dst, int width, float scale, float shift) const 2218 { 2219 int x = 0; 2220 2221 if (!USE_SSE2) 2222 return x; 2223 2224 __m128i v_zero = _mm_setzero_si128(); 2225 __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); 2226 2227 for ( ; x <= width - 8; x += 8) 2228 { 2229 __m128i v_src = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src + x))), 8); 2230 __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16)); 
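            // SSE2 has no sign-extending byte conversion (_mm_cvtepi8_epi32 arrives
            // only with SSE4.1), so the schar paths widen by unpacking the source
            // bytes into the high half of each wider lane and shifting back with an
            // arithmetic right shift (_mm_srai_epi16 / _mm_srai_epi32), as in the two
            // conversions above.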
            __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);

            v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16));
            __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);

            __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
                                            _mm_cvtps_epi32(v_dst_1));
            _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst, v_zero));
        }

        return x;
    }
};

#if CV_SSE4_1

template <>
struct cvtScale_SIMD<schar, ushort, float>
{
    cvtScale_SIMD()
    {
        haveSSE = checkHardwareSupport(CV_CPU_SSE4_1);
    }

    int operator () (const schar * src, ushort * dst, int width, float scale, float shift) const
    {
        int x = 0;

        if (!haveSSE)
            return x;

        __m128i v_zero = _mm_setzero_si128();
        __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);

        for ( ; x <= width - 8; x += 8)
        {
            __m128i v_src = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src + x))), 8);
            __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16));
            __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);

            v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16));
            __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);

            __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0),
                                             _mm_cvtps_epi32(v_dst_1));
            _mm_storeu_si128((__m128i *)(dst + x), v_dst);
        }

        return x;
    }

    bool haveSSE;
};

#endif

template <>
struct cvtScale_SIMD<schar, short, float>
{
    int operator () (const schar * src, short * dst, int width, float scale, float shift) const
    {
        int x = 0;

        if (!USE_SSE2)
            return x;

        __m128i v_zero = _mm_setzero_si128();
        __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);

        for ( ; x <= width - 8; x += 8)
        {
            __m128i v_src = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src + x))), 8);
            __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16));
            __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);

            v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16));
            __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);

            __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
                                            _mm_cvtps_epi32(v_dst_1));
            _mm_storeu_si128((__m128i *)(dst + x), v_dst);
        }

        return x;
    }
};

template <>
struct cvtScale_SIMD<schar, int, float>
{
    int operator () (const schar * src, int * dst, int width, float scale, float shift) const
    {
        int x = 0;

        if (!USE_SSE2)
            return x;

        __m128i v_zero = _mm_setzero_si128();
        __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);

        for ( ; x <= width - 8; x += 8)
        {
            __m128i v_src = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src + x))), 8);
            __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16));
            __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);

            v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16));
            __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
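            // A 32-bit destination needs no packing step: the two rounded halves are
            // stored as-is.  _mm_cvtps_epi32 rounds with the current MXCSR mode
            // (round-to-nearest-even by default), matching the cvRound()-based
            // saturate_cast<> of the scalar tail.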
2340 _mm_storeu_si128((__m128i *)(dst + x), _mm_cvtps_epi32(v_dst_0)); 2341 _mm_storeu_si128((__m128i *)(dst + x + 4), _mm_cvtps_epi32(v_dst_1)); 2342 } 2343 2344 return x; 2345 } 2346 }; 2347 2348 template <> 2349 struct cvtScale_SIMD<schar, float, float> 2350 { 2351 int operator () (const schar * src, float * dst, int width, float scale, float shift) const 2352 { 2353 int x = 0; 2354 2355 if (!USE_SSE2) 2356 return x; 2357 2358 __m128i v_zero = _mm_setzero_si128(); 2359 __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); 2360 2361 for ( ; x <= width - 8; x += 8) 2362 { 2363 __m128i v_src = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src + x))), 8); 2364 __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16)); 2365 __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); 2366 2367 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16)); 2368 __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); 2369 2370 _mm_storeu_ps(dst + x, v_dst_0); 2371 _mm_storeu_ps(dst + x + 4, v_dst_1); 2372 } 2373 2374 return x; 2375 } 2376 }; 2377 2378 template <> 2379 struct cvtScale_SIMD<schar, double, double> 2380 { 2381 int operator () (const schar * src, double * dst, int width, double scale, double shift) const 2382 { 2383 int x = 0; 2384 2385 if (!USE_SSE2) 2386 return x; 2387 2388 __m128i v_zero = _mm_setzero_si128(); 2389 __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift); 2390 2391 for ( ; x <= width - 8; x += 8) 2392 { 2393 __m128i v_src = _mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src + x))); 2394 v_src = _mm_srai_epi16(v_src, 8); 2395 2396 __m128i v_src_s32 = _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16); 2397 __m128d v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src_s32), v_scale), v_shift); 2398 __m128d v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(v_src_s32, 8)), v_scale), v_shift); 2399 _mm_storeu_pd(dst + x, v_dst_0); 2400 _mm_storeu_pd(dst + x + 2, v_dst_1); 2401 2402 v_src_s32 = _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16); 2403 v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src_s32), v_scale), v_shift); 2404 v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(v_src_s32, 8)), v_scale), v_shift); 2405 _mm_storeu_pd(dst + x + 4, v_dst_0); 2406 _mm_storeu_pd(dst + x + 6, v_dst_1); 2407 } 2408 2409 return x; 2410 } 2411 }; 2412 2413 // from ushort 2414 2415 template <> 2416 struct cvtScale_SIMD<ushort, uchar, float> 2417 { 2418 int operator () (const ushort * src, uchar * dst, int width, float scale, float shift) const 2419 { 2420 int x = 0; 2421 2422 if (!USE_SSE2) 2423 return x; 2424 2425 __m128i v_zero = _mm_setzero_si128(); 2426 __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); 2427 2428 for ( ; x <= width - 8; x += 8) 2429 { 2430 __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); 2431 __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero)); 2432 __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); 2433 2434 v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero)); 2435 __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); 2436 2437 __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), 2438 _mm_cvtps_epi32(v_dst_1)); 2439 _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst, v_zero)); 2440 } 2441 2442 return x; 2443 } 2444 }; 2445 2446 template <> 2447 struct cvtScale_SIMD<ushort, schar, float> 2448 
{ 2449 int operator () (const ushort * src, schar * dst, int width, float scale, float shift) const 2450 { 2451 int x = 0; 2452 2453 if (!USE_SSE2) 2454 return x; 2455 2456 __m128i v_zero = _mm_setzero_si128(); 2457 __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); 2458 2459 for ( ; x <= width - 8; x += 8) 2460 { 2461 __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); 2462 __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero)); 2463 __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); 2464 2465 v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero)); 2466 __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); 2467 2468 __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), 2469 _mm_cvtps_epi32(v_dst_1)); 2470 _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst, v_zero)); 2471 } 2472 2473 return x; 2474 } 2475 }; 2476 2477 #if CV_SSE4_1 2478 2479 template <> 2480 struct cvtScale_SIMD<ushort, ushort, float> 2481 { 2482 cvtScale_SIMD() 2483 { 2484 haveSSE = checkHardwareSupport(CV_CPU_SSE4_1); 2485 } 2486 2487 int operator () (const ushort * src, ushort * dst, int width, float scale, float shift) const 2488 { 2489 int x = 0; 2490 2491 if (!haveSSE) 2492 return x; 2493 2494 __m128i v_zero = _mm_setzero_si128(); 2495 __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); 2496 2497 for ( ; x <= width - 8; x += 8) 2498 { 2499 __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); 2500 __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero)); 2501 __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); 2502 2503 v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero)); 2504 __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); 2505 2506 __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0), 2507 _mm_cvtps_epi32(v_dst_1)); 2508 _mm_storeu_si128((__m128i *)(dst + x), v_dst); 2509 } 2510 2511 return x; 2512 } 2513 2514 bool haveSSE; 2515 }; 2516 2517 #endif 2518 2519 template <> 2520 struct cvtScale_SIMD<ushort, short, float> 2521 { 2522 int operator () (const ushort * src, short * dst, int width, float scale, float shift) const 2523 { 2524 int x = 0; 2525 2526 if (!USE_SSE2) 2527 return x; 2528 2529 __m128i v_zero = _mm_setzero_si128(); 2530 __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); 2531 2532 for ( ; x <= width - 8; x += 8) 2533 { 2534 __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); 2535 __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero)); 2536 __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); 2537 2538 v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero)); 2539 __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); 2540 2541 __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), 2542 _mm_cvtps_epi32(v_dst_1)); 2543 _mm_storeu_si128((__m128i *)(dst + x), v_dst); 2544 } 2545 2546 return x; 2547 } 2548 }; 2549 2550 template <> 2551 struct cvtScale_SIMD<ushort, int, float> 2552 { 2553 int operator () (const ushort * src, int * dst, int width, float scale, float shift) const 2554 { 2555 int x = 0; 2556 2557 if (!USE_SSE2) 2558 return x; 2559 2560 __m128i v_zero = _mm_setzero_si128(); 2561 __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); 2562 2563 for ( ; x <= width - 8; x += 8) 2564 { 2565 __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); 2566 __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero)); 2567 __m128 v_dst_0 = 
_mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); 2568 2569 v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero)); 2570 __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); 2571 2572 _mm_storeu_si128((__m128i *)(dst + x), _mm_cvtps_epi32(v_dst_0)); 2573 _mm_storeu_si128((__m128i *)(dst + x + 4), _mm_cvtps_epi32(v_dst_1)); 2574 } 2575 2576 return x; 2577 } 2578 }; 2579 2580 template <> 2581 struct cvtScale_SIMD<ushort, float, float> 2582 { 2583 int operator () (const ushort * src, float * dst, int width, float scale, float shift) const 2584 { 2585 int x = 0; 2586 2587 if (!USE_SSE2) 2588 return x; 2589 2590 __m128i v_zero = _mm_setzero_si128(); 2591 __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); 2592 2593 for ( ; x <= width - 8; x += 8) 2594 { 2595 __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); 2596 __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero)); 2597 __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); 2598 2599 v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero)); 2600 __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); 2601 2602 _mm_storeu_ps(dst + x, v_dst_0); 2603 _mm_storeu_ps(dst + x + 4, v_dst_1); 2604 } 2605 2606 return x; 2607 } 2608 }; 2609 2610 template <> 2611 struct cvtScale_SIMD<ushort, double, double> 2612 { 2613 int operator () (const ushort * src, double * dst, int width, double scale, double shift) const 2614 { 2615 int x = 0; 2616 2617 if (!USE_SSE2) 2618 return x; 2619 2620 __m128i v_zero = _mm_setzero_si128(); 2621 __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift); 2622 2623 for ( ; x <= width - 8; x += 8) 2624 { 2625 __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); 2626 2627 __m128i v_src_s32 = _mm_unpacklo_epi16(v_src, v_zero); 2628 __m128d v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src_s32), v_scale), v_shift); 2629 __m128d v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(v_src_s32, 8)), v_scale), v_shift); 2630 _mm_storeu_pd(dst + x, v_dst_0); 2631 _mm_storeu_pd(dst + x + 2, v_dst_1); 2632 2633 v_src_s32 = _mm_unpackhi_epi16(v_src, v_zero); 2634 v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src_s32), v_scale), v_shift); 2635 v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(v_src_s32, 8)), v_scale), v_shift); 2636 _mm_storeu_pd(dst + x + 4, v_dst_0); 2637 _mm_storeu_pd(dst + x + 6, v_dst_1); 2638 } 2639 2640 return x; 2641 } 2642 }; 2643 2644 // from short 2645 2646 template <> 2647 struct cvtScale_SIMD<short, uchar, float> 2648 { 2649 int operator () (const short * src, uchar * dst, int width, float scale, float shift) const 2650 { 2651 int x = 0; 2652 2653 if (!USE_SSE2) 2654 return x; 2655 2656 __m128i v_zero = _mm_setzero_si128(); 2657 __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); 2658 2659 for ( ; x <= width - 8; x += 8) 2660 { 2661 __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); 2662 __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16)); 2663 __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); 2664 2665 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16)); 2666 __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); 2667 2668 __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), 2669 _mm_cvtps_epi32(v_dst_1)); 2670 _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst, v_zero)); 2671 } 2672 2673 return x; 2674 } 2675 }; 2676 2677 template <> 
2678 struct cvtScale_SIMD<short, schar, float> 2679 { 2680 int operator () (const short * src, schar * dst, int width, float scale, float shift) const 2681 { 2682 int x = 0; 2683 2684 if (!USE_SSE2) 2685 return x; 2686 2687 __m128i v_zero = _mm_setzero_si128(); 2688 __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); 2689 2690 for ( ; x <= width - 8; x += 8) 2691 { 2692 __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); 2693 __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16)); 2694 __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); 2695 2696 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16)); 2697 __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); 2698 2699 __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), 2700 _mm_cvtps_epi32(v_dst_1)); 2701 _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst, v_zero)); 2702 } 2703 2704 return x; 2705 } 2706 }; 2707 2708 #if CV_SSE4_1 2709 2710 template <> 2711 struct cvtScale_SIMD<short, ushort, float> 2712 { 2713 cvtScale_SIMD() 2714 { 2715 haveSSE = checkHardwareSupport(CV_CPU_SSE4_1); 2716 } 2717 2718 int operator () (const short * src, ushort * dst, int width, float scale, float shift) const 2719 { 2720 int x = 0; 2721 2722 if (!haveSSE) 2723 return x; 2724 2725 __m128i v_zero = _mm_setzero_si128(); 2726 __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); 2727 2728 for ( ; x <= width - 8; x += 8) 2729 { 2730 __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); 2731 __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16)); 2732 __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); 2733 2734 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16)); 2735 __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); 2736 2737 __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0), 2738 _mm_cvtps_epi32(v_dst_1)); 2739 _mm_storeu_si128((__m128i *)(dst + x), v_dst); 2740 } 2741 2742 return x; 2743 } 2744 2745 bool haveSSE; 2746 }; 2747 2748 #endif 2749 2750 template <> 2751 struct cvtScale_SIMD<short, short, float> 2752 { 2753 int operator () (const short * src, short * dst, int width, float scale, float shift) const 2754 { 2755 int x = 0; 2756 2757 if (!USE_SSE2) 2758 return x; 2759 2760 __m128i v_zero = _mm_setzero_si128(); 2761 __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); 2762 2763 for ( ; x <= width - 8; x += 8) 2764 { 2765 __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); 2766 __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16)); 2767 __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); 2768 2769 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16)); 2770 __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); 2771 2772 __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), 2773 _mm_cvtps_epi32(v_dst_1)); 2774 _mm_storeu_si128((__m128i *)(dst + x), v_dst); 2775 } 2776 2777 return x; 2778 } 2779 }; 2780 2781 template <> 2782 struct cvtScale_SIMD<short, int, float> 2783 { 2784 int operator () (const short * src, int * dst, int width, float scale, float shift) const 2785 { 2786 int x = 0; 2787 2788 if (!USE_SSE2) 2789 return x; 2790 2791 __m128i v_zero = _mm_setzero_si128(); 2792 __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); 2793 2794 for ( ; x <= width - 8; x += 8) 2795 { 
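            // Per iteration: load 8 shorts, sign-extend each half to int32 by
            // unpacking into the high 16 bits and shifting right arithmetically,
            // convert to float, apply v*scale + shift, and store the two rounded
            // int32 quads.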
2796 __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); 2797 __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16)); 2798 __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); 2799 2800 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16)); 2801 __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); 2802 2803 _mm_storeu_si128((__m128i *)(dst + x), _mm_cvtps_epi32(v_dst_0)); 2804 _mm_storeu_si128((__m128i *)(dst + x + 4), _mm_cvtps_epi32(v_dst_1)); 2805 } 2806 2807 return x; 2808 } 2809 }; 2810 2811 template <> 2812 struct cvtScale_SIMD<short, float, float> 2813 { 2814 int operator () (const short * src, float * dst, int width, float scale, float shift) const 2815 { 2816 int x = 0; 2817 2818 if (!USE_SSE2) 2819 return x; 2820 2821 __m128i v_zero = _mm_setzero_si128(); 2822 __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); 2823 2824 for ( ; x <= width - 8; x += 8) 2825 { 2826 __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); 2827 __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16)); 2828 __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); 2829 2830 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16)); 2831 __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); 2832 2833 _mm_storeu_ps(dst + x, v_dst_0); 2834 _mm_storeu_ps(dst + x + 4, v_dst_1); 2835 } 2836 2837 return x; 2838 } 2839 }; 2840 2841 template <> 2842 struct cvtScale_SIMD<short, double, double> 2843 { 2844 int operator () (const short * src, double * dst, int width, double scale, double shift) const 2845 { 2846 int x = 0; 2847 2848 if (!USE_SSE2) 2849 return x; 2850 2851 __m128i v_zero = _mm_setzero_si128(); 2852 __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift); 2853 2854 for ( ; x <= width - 8; x += 8) 2855 { 2856 __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); 2857 2858 __m128i v_src_s32 = _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16); 2859 __m128d v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src_s32), v_scale), v_shift); 2860 __m128d v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(v_src_s32, 8)), v_scale), v_shift); 2861 _mm_storeu_pd(dst + x, v_dst_0); 2862 _mm_storeu_pd(dst + x + 2, v_dst_1); 2863 2864 v_src_s32 = _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16); 2865 v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src_s32), v_scale), v_shift); 2866 v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(v_src_s32, 8)), v_scale), v_shift); 2867 _mm_storeu_pd(dst + x + 4, v_dst_0); 2868 _mm_storeu_pd(dst + x + 6, v_dst_1); 2869 } 2870 2871 return x; 2872 } 2873 }; 2874 2875 // from int 2876 2877 template <> 2878 struct cvtScale_SIMD<int, uchar, float> 2879 { 2880 int operator () (const int * src, uchar * dst, int width, float scale, float shift) const 2881 { 2882 int x = 0; 2883 2884 if (!USE_SSE2) 2885 return x; 2886 2887 __m128i v_zero = _mm_setzero_si128(); 2888 __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); 2889 2890 for ( ; x <= width - 8; x += 8) 2891 { 2892 __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); 2893 __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift); 2894 2895 v_src = _mm_loadu_si128((__m128i const *)(src + x + 4)); 2896 __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift); 2897 2898 __m128i v_dst = 
_mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), 2899 _mm_cvtps_epi32(v_dst_1)); 2900 _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst, v_zero)); 2901 } 2902 2903 return x; 2904 } 2905 }; 2906 2907 template <> 2908 struct cvtScale_SIMD<int, schar, float> 2909 { 2910 int operator () (const int * src, schar * dst, int width, float scale, float shift) const 2911 { 2912 int x = 0; 2913 2914 if (!USE_SSE2) 2915 return x; 2916 2917 __m128i v_zero = _mm_setzero_si128(); 2918 __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); 2919 2920 for ( ; x <= width - 8; x += 8) 2921 { 2922 __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); 2923 __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift); 2924 2925 v_src = _mm_loadu_si128((__m128i const *)(src + x + 4)); 2926 __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift); 2927 2928 __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), 2929 _mm_cvtps_epi32(v_dst_1)); 2930 _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst, v_zero)); 2931 } 2932 2933 return x; 2934 } 2935 }; 2936 2937 #if CV_SSE4_1 2938 2939 template <> 2940 struct cvtScale_SIMD<int, ushort, float> 2941 { 2942 cvtScale_SIMD() 2943 { 2944 haveSSE = checkHardwareSupport(CV_CPU_SSE4_1); 2945 } 2946 2947 int operator () (const int * src, ushort * dst, int width, float scale, float shift) const 2948 { 2949 int x = 0; 2950 2951 if (!haveSSE) 2952 return x; 2953 2954 __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); 2955 2956 for ( ; x <= width - 8; x += 8) 2957 { 2958 __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); 2959 __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift); 2960 2961 v_src = _mm_loadu_si128((__m128i const *)(src + x + 4)); 2962 __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift); 2963 2964 __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0), 2965 _mm_cvtps_epi32(v_dst_1)); 2966 _mm_storeu_si128((__m128i *)(dst + x), v_dst); 2967 } 2968 2969 return x; 2970 } 2971 2972 bool haveSSE; 2973 }; 2974 2975 #endif 2976 2977 template <> 2978 struct cvtScale_SIMD<int, short, float> 2979 { 2980 int operator () (const int * src, short * dst, int width, float scale, float shift) const 2981 { 2982 int x = 0; 2983 2984 if (!USE_SSE2) 2985 return x; 2986 2987 __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); 2988 2989 for ( ; x <= width - 8; x += 8) 2990 { 2991 __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); 2992 __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift); 2993 2994 v_src = _mm_loadu_si128((__m128i const *)(src + x + 4)); 2995 __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift); 2996 2997 __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), 2998 _mm_cvtps_epi32(v_dst_1)); 2999 _mm_storeu_si128((__m128i *)(dst + x), v_dst); 3000 } 3001 3002 return x; 3003 } 3004 }; 3005 3006 template <> 3007 struct cvtScale_SIMD<int, int, double> 3008 { 3009 int operator () (const int * src, int * dst, int width, double scale, double shift) const 3010 { 3011 int x = 0; 3012 3013 if (!USE_SSE2) 3014 return x; 3015 3016 __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift); 3017 3018 for ( ; x <= width - 4; x += 4) 3019 { 3020 __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); 3021 __m128d v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src), v_scale), v_shift); 3022 3023 v_src = 
_mm_srli_si128(v_src, 8); 3024 __m128d v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src), v_scale), v_shift); 3025 3026 __m128 v_dst = _mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_dst_0)), 3027 _mm_castsi128_ps(_mm_cvtpd_epi32(v_dst_1))); 3028 3029 _mm_storeu_si128((__m128i *)(dst + x), _mm_castps_si128(v_dst)); 3030 } 3031 3032 return x; 3033 } 3034 }; 3035 3036 template <> 3037 struct cvtScale_SIMD<int, float, double> 3038 { 3039 int operator () (const int * src, float * dst, int width, double scale, double shift) const 3040 { 3041 int x = 0; 3042 3043 if (!USE_SSE2) 3044 return x; 3045 3046 __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift); 3047 3048 for ( ; x <= width - 4; x += 4) 3049 { 3050 __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); 3051 __m128d v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src), v_scale), v_shift); 3052 3053 v_src = _mm_srli_si128(v_src, 8); 3054 __m128d v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src), v_scale), v_shift); 3055 3056 _mm_storeu_ps(dst + x, _mm_movelh_ps(_mm_cvtpd_ps(v_dst_0), 3057 _mm_cvtpd_ps(v_dst_1))); 3058 } 3059 3060 return x; 3061 } 3062 }; 3063 3064 template <> 3065 struct cvtScale_SIMD<int, double, double> 3066 { 3067 int operator () (const int * src, double * dst, int width, double scale, double shift) const 3068 { 3069 int x = 0; 3070 3071 if (!USE_SSE2) 3072 return x; 3073 3074 __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift); 3075 3076 for ( ; x <= width - 4; x += 4) 3077 { 3078 __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); 3079 __m128d v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src), v_scale), v_shift); 3080 3081 v_src = _mm_srli_si128(v_src, 8); 3082 __m128d v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src), v_scale), v_shift); 3083 3084 _mm_storeu_pd(dst + x, v_dst_0); 3085 _mm_storeu_pd(dst + x + 2, v_dst_1); 3086 } 3087 3088 return x; 3089 } 3090 }; 3091 3092 // from float 3093 3094 template <> 3095 struct cvtScale_SIMD<float, uchar, float> 3096 { 3097 int operator () (const float * src, uchar * dst, int width, float scale, float shift) const 3098 { 3099 int x = 0; 3100 3101 if (!USE_SSE2) 3102 return x; 3103 3104 __m128i v_zero = _mm_setzero_si128(); 3105 __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); 3106 3107 for ( ; x <= width - 8; x += 8) 3108 { 3109 __m128 v_src = _mm_loadu_ps(src + x); 3110 __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); 3111 3112 v_src = _mm_loadu_ps(src + x + 4); 3113 __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); 3114 3115 __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), 3116 _mm_cvtps_epi32(v_dst_1)); 3117 _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst, v_zero)); 3118 } 3119 3120 return x; 3121 } 3122 }; 3123 3124 template <> 3125 struct cvtScale_SIMD<float, schar, float> 3126 { 3127 int operator () (const float * src, schar * dst, int width, float scale, float shift) const 3128 { 3129 int x = 0; 3130 3131 if (!USE_SSE2) 3132 return x; 3133 3134 __m128i v_zero = _mm_setzero_si128(); 3135 __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); 3136 3137 for ( ; x <= width - 8; x += 8) 3138 { 3139 __m128 v_src = _mm_loadu_ps(src + x); 3140 __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); 3141 3142 v_src = _mm_loadu_ps(src + x + 4); 3143 __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); 3144 3145 __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), 3146 _mm_cvtps_epi32(v_dst_1)); 3147 
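            // The pair of saturating packs implements saturate_cast<schar>:
            // _mm_packs_epi32 clamps the rounded values to [-32768, 32767] and
            // _mm_packs_epi16 below clamps to [-128, 127]; only the low 8 bytes
            // of the result are written out.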
_mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst, v_zero)); 3148 } 3149 3150 return x; 3151 } 3152 }; 3153 3154 #if CV_SSE4_1 3155 3156 template <> 3157 struct cvtScale_SIMD<float, ushort, float> 3158 { 3159 cvtScale_SIMD() 3160 { 3161 haveSSE = checkHardwareSupport(CV_CPU_SSE4_1); 3162 } 3163 3164 int operator () (const float * src, ushort * dst, int width, float scale, float shift) const 3165 { 3166 int x = 0; 3167 3168 if (!haveSSE) 3169 return x; 3170 3171 __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); 3172 3173 for ( ; x <= width - 8; x += 8) 3174 { 3175 __m128 v_src = _mm_loadu_ps(src + x); 3176 __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); 3177 3178 v_src = _mm_loadu_ps(src + x + 4); 3179 __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); 3180 3181 __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0), 3182 _mm_cvtps_epi32(v_dst_1)); 3183 _mm_storeu_si128((__m128i *)(dst + x), v_dst); 3184 } 3185 3186 return x; 3187 } 3188 3189 bool haveSSE; 3190 }; 3191 3192 #endif 3193 3194 template <> 3195 struct cvtScale_SIMD<float, short, float> 3196 { 3197 int operator () (const float * src, short * dst, int width, float scale, float shift) const 3198 { 3199 int x = 0; 3200 3201 if (!USE_SSE2) 3202 return x; 3203 3204 __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); 3205 3206 for ( ; x <= width - 8; x += 8) 3207 { 3208 __m128 v_src = _mm_loadu_ps(src + x); 3209 __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); 3210 3211 v_src = _mm_loadu_ps(src + x + 4); 3212 __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); 3213 3214 __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), 3215 _mm_cvtps_epi32(v_dst_1)); 3216 _mm_storeu_si128((__m128i *)(dst + x), v_dst); 3217 } 3218 3219 return x; 3220 } 3221 }; 3222 3223 template <> 3224 struct cvtScale_SIMD<float, int, float> 3225 { 3226 int operator () (const float * src, int * dst, int width, float scale, float shift) const 3227 { 3228 int x = 0; 3229 3230 if (!USE_SSE2) 3231 return x; 3232 3233 __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); 3234 3235 for ( ; x <= width - 8; x += 8) 3236 { 3237 __m128 v_src = _mm_loadu_ps(src + x); 3238 __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); 3239 3240 v_src = _mm_loadu_ps(src + x + 4); 3241 __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); 3242 3243 _mm_storeu_si128((__m128i *)(dst + x), _mm_cvtps_epi32(v_dst_0)); 3244 _mm_storeu_si128((__m128i *)(dst + x + 4), _mm_cvtps_epi32(v_dst_1)); 3245 } 3246 3247 return x; 3248 } 3249 }; 3250 3251 template <> 3252 struct cvtScale_SIMD<float, float, float> 3253 { 3254 int operator () (const float * src, float * dst, int width, float scale, float shift) const 3255 { 3256 int x = 0; 3257 3258 if (!USE_SSE2) 3259 return x; 3260 3261 __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); 3262 3263 for ( ; x <= width - 4; x += 4) 3264 { 3265 __m128 v_src = _mm_loadu_ps(src + x); 3266 __m128 v_dst = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); 3267 _mm_storeu_ps(dst + x, v_dst); 3268 } 3269 3270 return x; 3271 } 3272 }; 3273 3274 template <> 3275 struct cvtScale_SIMD<float, double, double> 3276 { 3277 int operator () (const float * src, double * dst, int width, double scale, double shift) const 3278 { 3279 int x = 0; 3280 3281 if (!USE_SSE2) 3282 return x; 3283 3284 __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift); 3285 3286 for ( ; x <= width - 4; x += 4) 3287 { 3288 
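            // _mm_cvtps_pd widens only the two low float lanes, so the upper pair is
            // reached by shifting the raw bits right by 8 bytes (_mm_srli_si128)
            // before converting again; each iteration therefore emits 4 doubles.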
__m128 v_src = _mm_loadu_ps(src + x); 3289 __m128d v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtps_pd(v_src), v_scale), v_shift); 3290 v_src = _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_src), 8)); 3291 __m128d v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtps_pd(v_src), v_scale), v_shift); 3292 3293 _mm_storeu_pd(dst + x, v_dst_0); 3294 _mm_storeu_pd(dst + x + 2, v_dst_1); 3295 } 3296 3297 return x; 3298 } 3299 }; 3300 3301 // from double 3302 3303 template <> 3304 struct cvtScale_SIMD<double, uchar, float> 3305 { 3306 int operator () (const double * src, uchar * dst, int width, float scale, float shift) const 3307 { 3308 int x = 0; 3309 3310 if (!USE_SSE2) 3311 return x; 3312 3313 __m128i v_zero = _mm_setzero_si128(); 3314 __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); 3315 3316 for ( ; x <= width - 8; x += 8) 3317 { 3318 __m128 v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x)), 3319 _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2))); 3320 __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); 3321 3322 v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x + 4)), 3323 _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6))); 3324 __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); 3325 3326 __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), 3327 _mm_cvtps_epi32(v_dst_1)); 3328 _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst, v_zero)); 3329 } 3330 3331 return x; 3332 } 3333 }; 3334 3335 template <> 3336 struct cvtScale_SIMD<double, schar, float> 3337 { 3338 int operator () (const double * src, schar * dst, int width, float scale, float shift) const 3339 { 3340 int x = 0; 3341 3342 if (!USE_SSE2) 3343 return x; 3344 3345 __m128i v_zero = _mm_setzero_si128(); 3346 __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); 3347 3348 for ( ; x <= width - 8; x += 8) 3349 { 3350 __m128 v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x)), 3351 _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2))); 3352 __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); 3353 3354 v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x + 4)), 3355 _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6))); 3356 __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); 3357 3358 __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), 3359 _mm_cvtps_epi32(v_dst_1)); 3360 _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst, v_zero)); 3361 } 3362 3363 return x; 3364 } 3365 }; 3366 3367 #if CV_SSE4_1 3368 3369 template <> 3370 struct cvtScale_SIMD<double, ushort, float> 3371 { 3372 cvtScale_SIMD() 3373 { 3374 haveSSE = checkHardwareSupport(CV_CPU_SSE4_1); 3375 } 3376 3377 int operator () (const double * src, ushort * dst, int width, float scale, float shift) const 3378 { 3379 int x = 0; 3380 3381 if (!haveSSE) 3382 return x; 3383 3384 __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); 3385 3386 for ( ; x <= width - 8; x += 8) 3387 { 3388 __m128 v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x)), 3389 _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2))); 3390 __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); 3391 3392 v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x + 4)), 3393 _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6))); 3394 __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); 3395 3396 __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0), 3397 _mm_cvtps_epi32(v_dst_1)); 3398 _mm_storeu_si128((__m128i *)(dst + x), v_dst); 3399 } 3400 3401 return x; 3402 } 3403 3404 bool haveSSE; 3405 }; 3406 3407 #endif 3408 
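// Editorial note: each specialization in this SSE block vectorizes the same
// per-element contract as the scalar tails in this file,
//     dst[i] = saturate_cast<DT>(src[i]*scale + shift);
// The *-> ushort variants are additionally guarded by CV_SSE4_1 because
// _mm_packus_epi32 (signed 32-bit -> unsigned 16-bit saturating pack) only
// exists from SSE4.1 on; without it they fall back to the scalar loop.
// A plain-SSE2 emulation is possible by biasing the values so that the signed
// pack can be reused.  The helper below is an illustrative sketch only: it is
// not used anywhere in this file, its name is made up, and it saturates
// correctly only while the inputs stay above INT_MIN + 32768 (in particular it
// mishandles the INT_MIN "invalid" value _mm_cvtps_epi32 produces on overflow).
#if 0
static inline __m128i packus_epi32_sse2_sketch(__m128i a, __m128i b)
{
    const __m128i v_bias32 = _mm_set1_epi32(32768);          // 0x00008000
    const __m128i v_bias16 = _mm_set1_epi16((short)0x8000);
    // Shift the unsigned target range [0, 65535] onto the signed range
    // [-32768, 32767], pack with signed saturation, then flip the sign bit back.
    a = _mm_sub_epi32(a, v_bias32);
    b = _mm_sub_epi32(b, v_bias32);
    return _mm_xor_si128(_mm_packs_epi32(a, b), v_bias16);
}
#endif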
3409 template <> 3410 struct cvtScale_SIMD<double, short, float> 3411 { 3412 int operator () (const double * src, short * dst, int width, float scale, float shift) const 3413 { 3414 int x = 0; 3415 3416 if (!USE_SSE2) 3417 return x; 3418 3419 __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); 3420 3421 for ( ; x <= width - 8; x += 8) 3422 { 3423 __m128 v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x)), 3424 _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2))); 3425 __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); 3426 3427 v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x + 4)), 3428 _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6))); 3429 __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); 3430 3431 __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), 3432 _mm_cvtps_epi32(v_dst_1)); 3433 _mm_storeu_si128((__m128i *)(dst + x), v_dst); 3434 } 3435 3436 return x; 3437 } 3438 }; 3439 3440 template <> 3441 struct cvtScale_SIMD<double, int, double> 3442 { 3443 int operator () (const double * src, int * dst, int width, double scale, double shift) const 3444 { 3445 int x = 0; 3446 3447 if (!USE_SSE2) 3448 return x; 3449 3450 __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift); 3451 3452 for ( ; x <= width - 4; x += 4) 3453 { 3454 __m128d v_src = _mm_loadu_pd(src + x); 3455 __m128d v_dst0 = _mm_add_pd(_mm_mul_pd(v_src, v_scale), v_shift); 3456 3457 v_src = _mm_loadu_pd(src + x + 2); 3458 __m128d v_dst1 = _mm_add_pd(_mm_mul_pd(v_src, v_scale), v_shift); 3459 3460 __m128 v_dst = _mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_dst0)), 3461 _mm_castsi128_ps(_mm_cvtpd_epi32(v_dst1))); 3462 3463 _mm_storeu_si128((__m128i *)(dst + x), _mm_castps_si128(v_dst)); 3464 } 3465 3466 return x; 3467 } 3468 }; 3469 3470 template <> 3471 struct cvtScale_SIMD<double, float, double> 3472 { 3473 int operator () (const double * src, float * dst, int width, double scale, double shift) const 3474 { 3475 int x = 0; 3476 3477 if (!USE_SSE2) 3478 return x; 3479 3480 __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift); 3481 3482 for ( ; x <= width - 4; x += 4) 3483 { 3484 __m128d v_src = _mm_loadu_pd(src + x); 3485 __m128d v_dst0 = _mm_add_pd(_mm_mul_pd(v_src, v_scale), v_shift); 3486 3487 v_src = _mm_loadu_pd(src + x + 2); 3488 __m128d v_dst1 = _mm_add_pd(_mm_mul_pd(v_src, v_scale), v_shift); 3489 3490 __m128 v_dst = _mm_movelh_ps(_mm_cvtpd_ps(v_dst0), 3491 _mm_cvtpd_ps(v_dst1)); 3492 3493 _mm_storeu_ps(dst + x, v_dst); 3494 } 3495 3496 return x; 3497 } 3498 }; 3499 3500 template <> 3501 struct cvtScale_SIMD<double, double, double> 3502 { 3503 int operator () (const double * src, double * dst, int width, double scale, double shift) const 3504 { 3505 int x = 0; 3506 3507 if (!USE_SSE2) 3508 return x; 3509 3510 __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift); 3511 3512 for ( ; x <= width - 2; x += 2) 3513 { 3514 __m128d v_src = _mm_loadu_pd(src + x); 3515 __m128d v_dst = _mm_add_pd(_mm_mul_pd(v_src, v_scale), v_shift); 3516 _mm_storeu_pd(dst + x, v_dst); 3517 } 3518 3519 return x; 3520 } 3521 }; 3522 3523 #elif CV_NEON 3524 3525 // from uchar 3526 3527 template <> 3528 struct cvtScale_SIMD<uchar, uchar, float> 3529 { 3530 int operator () (const uchar * src, uchar * dst, int width, float scale, float shift) const 3531 { 3532 int x = 0; 3533 float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); 3534 3535 for ( ; x <= width - 8; x += 8) 3536 { 3537 uint16x8_t v_src = vmovl_u8(vld1_u8(src + x)); 3538 float32x4_t v_dst1 
= vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift); 3539 float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift); 3540 3541 uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), 3542 vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); 3543 vst1_u8(dst + x, vqmovn_u16(v_dst)); 3544 } 3545 3546 return x; 3547 } 3548 }; 3549 3550 template <> 3551 struct cvtScale_SIMD<uchar, schar, float> 3552 { 3553 int operator () (const uchar * src, schar * dst, int width, float scale, float shift) const 3554 { 3555 int x = 0; 3556 float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); 3557 3558 for ( ; x <= width - 8; x += 8) 3559 { 3560 uint16x8_t v_src = vmovl_u8(vld1_u8(src + x)); 3561 float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift); 3562 float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift); 3563 3564 int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), 3565 vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); 3566 vst1_s8(dst + x, vqmovn_s16(v_dst)); 3567 } 3568 3569 return x; 3570 } 3571 }; 3572 3573 template <> 3574 struct cvtScale_SIMD<uchar, ushort, float> 3575 { 3576 int operator () (const uchar * src, ushort * dst, int width, float scale, float shift) const 3577 { 3578 int x = 0; 3579 float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); 3580 3581 for ( ; x <= width - 8; x += 8) 3582 { 3583 uint16x8_t v_src = vmovl_u8(vld1_u8(src + x)); 3584 float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift); 3585 float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift); 3586 3587 uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), 3588 vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); 3589 vst1q_u16(dst + x, v_dst); 3590 } 3591 3592 return x; 3593 } 3594 }; 3595 3596 template <> 3597 struct cvtScale_SIMD<uchar, short, float> 3598 { 3599 int operator () (const uchar * src, short * dst, int width, float scale, float shift) const 3600 { 3601 int x = 0; 3602 float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); 3603 3604 for ( ; x <= width - 8; x += 8) 3605 { 3606 uint16x8_t v_src = vmovl_u8(vld1_u8(src + x)); 3607 float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift); 3608 float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift); 3609 3610 int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), 3611 vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); 3612 vst1q_s16(dst + x, v_dst); 3613 } 3614 3615 return x; 3616 } 3617 }; 3618 3619 template <> 3620 struct cvtScale_SIMD<uchar, int, float> 3621 { 3622 int operator () (const uchar * src, int * dst, int width, float scale, float shift) const 3623 { 3624 int x = 0; 3625 float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); 3626 3627 for ( ; x <= width - 8; x += 8) 3628 { 3629 uint16x8_t v_src = vmovl_u8(vld1_u8(src + x)); 3630 float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift); 3631 float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift); 3632 3633 vst1q_s32(dst + x, cv_vrndq_s32_f32(v_dst1)); 3634 vst1q_s32(dst + x + 4, cv_vrndq_s32_f32(v_dst2)); 3635 } 3636 3637 return x; 
3638 } 3639 }; 3640 3641 template <> 3642 struct cvtScale_SIMD<uchar, float, float> 3643 { 3644 int operator () (const uchar * src, float * dst, int width, float scale, float shift) const 3645 { 3646 int x = 0; 3647 float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); 3648 3649 for ( ; x <= width - 8; x += 8) 3650 { 3651 uint16x8_t v_src = vmovl_u8(vld1_u8(src + x)); 3652 vst1q_f32(dst + x, vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift)); 3653 vst1q_f32(dst + x + 4, vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift)); 3654 } 3655 3656 return x; 3657 } 3658 }; 3659 3660 // from schar 3661 3662 template <> 3663 struct cvtScale_SIMD<schar, uchar, float> 3664 { 3665 int operator () (const schar * src, uchar * dst, int width, float scale, float shift) const 3666 { 3667 int x = 0; 3668 float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); 3669 3670 for ( ; x <= width - 8; x += 8) 3671 { 3672 int16x8_t v_src = vmovl_s8(vld1_s8(src + x)); 3673 float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift); 3674 float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift); 3675 3676 uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), 3677 vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); 3678 vst1_u8(dst + x, vqmovn_u16(v_dst)); 3679 } 3680 3681 return x; 3682 } 3683 }; 3684 3685 template <> 3686 struct cvtScale_SIMD<schar, schar, float> 3687 { 3688 int operator () (const schar * src, schar * dst, int width, float scale, float shift) const 3689 { 3690 int x = 0; 3691 float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); 3692 3693 for ( ; x <= width - 8; x += 8) 3694 { 3695 int16x8_t v_src = vmovl_s8(vld1_s8(src + x)); 3696 float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift); 3697 float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift); 3698 3699 int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), 3700 vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); 3701 vst1_s8(dst + x, vqmovn_s16(v_dst)); 3702 } 3703 3704 return x; 3705 } 3706 }; 3707 3708 template <> 3709 struct cvtScale_SIMD<schar, ushort, float> 3710 { 3711 int operator () (const schar * src, ushort * dst, int width, float scale, float shift) const 3712 { 3713 int x = 0; 3714 float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); 3715 3716 for ( ; x <= width - 8; x += 8) 3717 { 3718 int16x8_t v_src = vmovl_s8(vld1_s8(src + x)); 3719 float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift); 3720 float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift); 3721 3722 uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), 3723 vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); 3724 vst1q_u16(dst + x, v_dst); 3725 } 3726 3727 return x; 3728 } 3729 }; 3730 3731 template <> 3732 struct cvtScale_SIMD<schar, short, float> 3733 { 3734 int operator () (const schar * src, short * dst, int width, float scale, float shift) const 3735 { 3736 int x = 0; 3737 float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); 3738 3739 for ( ; x <= width - 8; x += 8) 3740 { 3741 int16x8_t v_src = vmovl_s8(vld1_s8(src + x)); 3742 float32x4_t v_dst1 = 
vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift); 3743 float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift); 3744 3745 int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), 3746 vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); 3747 vst1q_s16(dst + x, v_dst); 3748 } 3749 3750 return x; 3751 } 3752 }; 3753 3754 template <> 3755 struct cvtScale_SIMD<schar, int, float> 3756 { 3757 int operator () (const schar * src, int * dst, int width, float scale, float shift) const 3758 { 3759 int x = 0; 3760 float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); 3761 3762 for ( ; x <= width - 8; x += 8) 3763 { 3764 int16x8_t v_src = vmovl_s8(vld1_s8(src + x)); 3765 float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift); 3766 float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift); 3767 3768 vst1q_s32(dst + x, cv_vrndq_s32_f32(v_dst1)); 3769 vst1q_s32(dst + x + 4, cv_vrndq_s32_f32(v_dst2)); 3770 } 3771 3772 return x; 3773 } 3774 }; 3775 3776 template <> 3777 struct cvtScale_SIMD<schar, float, float> 3778 { 3779 int operator () (const schar * src, float * dst, int width, float scale, float shift) const 3780 { 3781 int x = 0; 3782 float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); 3783 3784 for ( ; x <= width - 8; x += 8) 3785 { 3786 int16x8_t v_src = vmovl_s8(vld1_s8(src + x)); 3787 vst1q_f32(dst + x, vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift)); 3788 vst1q_f32(dst + x + 4, vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift)); 3789 } 3790 3791 return x; 3792 } 3793 }; 3794 3795 // from ushort 3796 3797 template <> 3798 struct cvtScale_SIMD<ushort, uchar, float> 3799 { 3800 int operator () (const ushort * src, uchar * dst, int width, float scale, float shift) const 3801 { 3802 int x = 0; 3803 float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); 3804 3805 for ( ; x <= width - 8; x += 8) 3806 { 3807 uint16x8_t v_src = vld1q_u16(src + x); 3808 float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift); 3809 float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift); 3810 3811 uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), 3812 vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); 3813 vst1_u8(dst + x, vqmovn_u16(v_dst)); 3814 } 3815 3816 return x; 3817 } 3818 }; 3819 3820 template <> 3821 struct cvtScale_SIMD<ushort, schar, float> 3822 { 3823 int operator () (const ushort * src, schar * dst, int width, float scale, float shift) const 3824 { 3825 int x = 0; 3826 float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); 3827 3828 for ( ; x <= width - 8; x += 8) 3829 { 3830 uint16x8_t v_src = vld1q_u16(src + x); 3831 float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift); 3832 float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift); 3833 3834 int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), 3835 vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); 3836 vst1_s8(dst + x, vqmovn_s16(v_dst)); 3837 } 3838 3839 return x; 3840 } 3841 }; 3842 3843 template <> 3844 struct cvtScale_SIMD<ushort, ushort, float> 3845 { 3846 int operator () (const ushort * src, ushort * dst, 
int width, float scale, float shift) const 3847 { 3848 int x = 0; 3849 float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); 3850 3851 for ( ; x <= width - 8; x += 8) 3852 { 3853 uint16x8_t v_src = vld1q_u16(src + x); 3854 float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift); 3855 float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift); 3856 3857 uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), 3858 vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); 3859 vst1q_u16(dst + x, v_dst); 3860 } 3861 3862 return x; 3863 } 3864 }; 3865 3866 template <> 3867 struct cvtScale_SIMD<ushort, short, float> 3868 { 3869 int operator () (const ushort * src, short * dst, int width, float scale, float shift) const 3870 { 3871 int x = 0; 3872 float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); 3873 3874 for ( ; x <= width - 8; x += 8) 3875 { 3876 uint16x8_t v_src = vld1q_u16(src + x); 3877 float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift); 3878 float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift); 3879 3880 int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), 3881 vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); 3882 vst1q_s16(dst + x, v_dst); 3883 } 3884 3885 return x; 3886 } 3887 }; 3888 3889 template <> 3890 struct cvtScale_SIMD<ushort, int, float> 3891 { 3892 int operator () (const ushort * src, int * dst, int width, float scale, float shift) const 3893 { 3894 int x = 0; 3895 float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); 3896 3897 for ( ; x <= width - 8; x += 8) 3898 { 3899 uint16x8_t v_src = vld1q_u16(src + x); 3900 float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift); 3901 float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift); 3902 3903 vst1q_s32(dst + x, cv_vrndq_s32_f32(v_dst1)); 3904 vst1q_s32(dst + x + 4, cv_vrndq_s32_f32(v_dst2)); 3905 } 3906 3907 return x; 3908 } 3909 }; 3910 3911 template <> 3912 struct cvtScale_SIMD<ushort, float, float> 3913 { 3914 int operator () (const ushort * src, float * dst, int width, float scale, float shift) const 3915 { 3916 int x = 0; 3917 float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); 3918 3919 for ( ; x <= width - 8; x += 8) 3920 { 3921 uint16x8_t v_src = vld1q_u16(src + x); 3922 vst1q_f32(dst + x, vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift)); 3923 vst1q_f32(dst + x + 4, vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift)); 3924 } 3925 3926 return x; 3927 } 3928 }; 3929 3930 // from short 3931 3932 template <> 3933 struct cvtScale_SIMD<short, uchar, float> 3934 { 3935 int operator () (const short * src, uchar * dst, int width, float scale, float shift) const 3936 { 3937 int x = 0; 3938 float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); 3939 3940 for ( ; x <= width - 8; x += 8) 3941 { 3942 int16x8_t v_src = vld1q_s16(src + x); 3943 float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift); 3944 float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift); 3945 3946 uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), 3947 
vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); 3948 vst1_u8(dst + x, vqmovn_u16(v_dst)); 3949 } 3950 3951 return x; 3952 } 3953 }; 3954 3955 template <> 3956 struct cvtScale_SIMD<short, schar, float> 3957 { 3958 int operator () (const short * src, schar * dst, int width, float scale, float shift) const 3959 { 3960 int x = 0; 3961 float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); 3962 3963 for ( ; x <= width - 8; x += 8) 3964 { 3965 int16x8_t v_src = vld1q_s16(src + x); 3966 float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift); 3967 float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift); 3968 3969 int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), 3970 vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); 3971 vst1_s8(dst + x, vqmovn_s16(v_dst)); 3972 } 3973 3974 return x; 3975 } 3976 }; 3977 3978 template <> 3979 struct cvtScale_SIMD<short, ushort, float> 3980 { 3981 int operator () (const short * src, ushort * dst, int width, float scale, float shift) const 3982 { 3983 int x = 0; 3984 float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); 3985 3986 for ( ; x <= width - 8; x += 8) 3987 { 3988 int16x8_t v_src = vld1q_s16(src + x); 3989 float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift); 3990 float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift); 3991 3992 uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), 3993 vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); 3994 vst1q_u16(dst + x, v_dst); 3995 } 3996 3997 return x; 3998 } 3999 }; 4000 4001 template <> 4002 struct cvtScale_SIMD<short, float, float> 4003 { 4004 int operator () (const short * src, float * dst, int width, float scale, float shift) const 4005 { 4006 int x = 0; 4007 float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); 4008 4009 for ( ; x <= width - 8; x += 8) 4010 { 4011 int16x8_t v_src = vld1q_s16(src + x); 4012 vst1q_f32(dst + x, vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift)); 4013 vst1q_f32(dst + x + 4, vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift)); 4014 } 4015 4016 return x; 4017 } 4018 }; 4019 4020 // from int 4021 4022 template <> 4023 struct cvtScale_SIMD<int, uchar, float> 4024 { 4025 int operator () (const int * src, uchar * dst, int width, float scale, float shift) const 4026 { 4027 int x = 0; 4028 float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); 4029 4030 for ( ; x <= width - 8; x += 8) 4031 { 4032 float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x)), v_scale), v_shift); 4033 float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x + 4)), v_scale), v_shift); 4034 4035 uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), 4036 vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); 4037 vst1_u8(dst + x, vqmovn_u16(v_dst)); 4038 } 4039 4040 return x; 4041 } 4042 }; 4043 4044 template <> 4045 struct cvtScale_SIMD<int, schar, float> 4046 { 4047 int operator () (const int * src, schar * dst, int width, float scale, float shift) const 4048 { 4049 int x = 0; 4050 float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); 4051 4052 for ( ; x <= width - 8; x += 8) 4053 { 4054 float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x)), v_scale), v_shift); 4055 float32x4_t v_dst2 = 
vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x + 4)), v_scale), v_shift); 4056 4057 int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), 4058 vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); 4059 vst1_s8(dst + x, vqmovn_s16(v_dst)); 4060 } 4061 4062 return x; 4063 } 4064 }; 4065 4066 template <> 4067 struct cvtScale_SIMD<int, ushort, float> 4068 { 4069 int operator () (const int * src, ushort * dst, int width, float scale, float shift) const 4070 { 4071 int x = 0; 4072 float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); 4073 4074 for ( ; x <= width - 8; x += 8) 4075 { 4076 float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x)), v_scale), v_shift); 4077 float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x + 4)), v_scale), v_shift); 4078 4079 uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), 4080 vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); 4081 vst1q_u16(dst + x, v_dst); 4082 } 4083 4084 return x; 4085 } 4086 }; 4087 4088 template <> 4089 struct cvtScale_SIMD<int, short, float> 4090 { 4091 int operator () (const int * src, short * dst, int width, float scale, float shift) const 4092 { 4093 int x = 0; 4094 float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); 4095 4096 for ( ; x <= width - 8; x += 8) 4097 { 4098 float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x)), v_scale), v_shift); 4099 float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x + 4)), v_scale), v_shift); 4100 4101 int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), 4102 vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); 4103 vst1q_s16(dst + x, v_dst); 4104 } 4105 4106 return x; 4107 } 4108 }; 4109 4110 // from float 4111 4112 template <> 4113 struct cvtScale_SIMD<float, uchar, float> 4114 { 4115 int operator () (const float * src, uchar * dst, int width, float scale, float shift) const 4116 { 4117 int x = 0; 4118 float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); 4119 4120 for ( ; x <= width - 8; x += 8) 4121 { 4122 float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vld1q_f32(src + x), v_scale), v_shift); 4123 float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vld1q_f32(src + x + 4), v_scale), v_shift); 4124 4125 uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), 4126 vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); 4127 vst1_u8(dst + x, vqmovn_u16(v_dst)); 4128 } 4129 4130 return x; 4131 } 4132 }; 4133 4134 template <> 4135 struct cvtScale_SIMD<float, schar, float> 4136 { 4137 int operator () (const float * src, schar * dst, int width, float scale, float shift) const 4138 { 4139 int x = 0; 4140 float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); 4141 4142 for ( ; x <= width - 8; x += 8) 4143 { 4144 float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vld1q_f32(src + x), v_scale), v_shift); 4145 float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vld1q_f32(src + x + 4), v_scale), v_shift); 4146 4147 int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), 4148 vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); 4149 vst1_s8(dst + x, vqmovn_s16(v_dst)); 4150 } 4151 4152 return x; 4153 } 4154 }; 4155 4156 template <> 4157 struct cvtScale_SIMD<float, ushort, float> 4158 { 4159 int operator () (const float * src, ushort * dst, int width, float scale, float shift) const 4160 { 4161 int x = 0; 4162 float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); 4163 4164 for ( ; x <= width - 8; x += 8) 4165 { 4166 float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vld1q_f32(src + 
x), v_scale), v_shift); 4167 float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vld1q_f32(src + x + 4), v_scale), v_shift); 4168 4169 uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), 4170 vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); 4171 vst1q_u16(dst + x, v_dst); 4172 } 4173 4174 return x; 4175 } 4176 }; 4177 4178 template <> 4179 struct cvtScale_SIMD<float, short, float> 4180 { 4181 int operator () (const float * src, short * dst, int width, float scale, float shift) const 4182 { 4183 int x = 0; 4184 float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); 4185 4186 for ( ; x <= width - 8; x += 8) 4187 { 4188 float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vld1q_f32(src + x), v_scale), v_shift); 4189 float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vld1q_f32(src + x + 4), v_scale), v_shift); 4190 4191 int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), 4192 vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); 4193 vst1q_s16(dst + x, v_dst); 4194 } 4195 4196 return x; 4197 } 4198 }; 4199 4200 template <> 4201 struct cvtScale_SIMD<float, int, float> 4202 { 4203 int operator () (const float * src, int * dst, int width, float scale, float shift) const 4204 { 4205 int x = 0; 4206 float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); 4207 4208 for ( ; x <= width - 4; x += 4) 4209 vst1q_s32(dst + x, cv_vrndq_s32_f32(vaddq_f32(vmulq_f32(vld1q_f32(src + x), v_scale), v_shift))); 4210 4211 return x; 4212 } 4213 }; 4214 4215 template <> 4216 struct cvtScale_SIMD<float, float, float> 4217 { 4218 int operator () (const float * src, float * dst, int width, float scale, float shift) const 4219 { 4220 int x = 0; 4221 float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); 4222 4223 for ( ; x <= width - 4; x += 4) 4224 vst1q_f32(dst + x, vaddq_f32(vmulq_f32(vld1q_f32(src + x), v_scale), v_shift)); 4225 4226 return x; 4227 } 4228 }; 4229 4230 #endif 4231 4232 template<typename T, typename DT, typename WT> static void 4233 cvtScale_( const T* src, size_t sstep, 4234 DT* dst, size_t dstep, Size size, 4235 WT scale, WT shift ) 4236 { 4237 sstep /= sizeof(src[0]); 4238 dstep /= sizeof(dst[0]); 4239 4240 cvtScale_SIMD<T, DT, WT> vop; 4241 4242 for( ; size.height--; src += sstep, dst += dstep ) 4243 { 4244 int x = vop(src, dst, size.width, scale, shift); 4245 4246 #if CV_ENABLE_UNROLLED 4247 for( ; x <= size.width - 4; x += 4 ) 4248 { 4249 DT t0, t1; 4250 t0 = saturate_cast<DT>(src[x]*scale + shift); 4251 t1 = saturate_cast<DT>(src[x+1]*scale + shift); 4252 dst[x] = t0; dst[x+1] = t1; 4253 t0 = saturate_cast<DT>(src[x+2]*scale + shift); 4254 t1 = saturate_cast<DT>(src[x+3]*scale + shift); 4255 dst[x+2] = t0; dst[x+3] = t1; 4256 } 4257 #endif 4258 4259 for( ; x < size.width; x++ ) 4260 dst[x] = saturate_cast<DT>(src[x]*scale + shift); 4261 } 4262 } 4263 4264 //vz optimized template specialization 4265 template<> void 4266 cvtScale_<short, short, float>( const short* src, size_t sstep, 4267 short* dst, size_t dstep, Size size, 4268 float scale, float shift ) 4269 { 4270 sstep /= sizeof(src[0]); 4271 dstep /= sizeof(dst[0]); 4272 4273 for( ; size.height--; src += sstep, dst += dstep ) 4274 { 4275 int x = 0; 4276 #if CV_SSE2 4277 if(USE_SSE2) 4278 { 4279 __m128 scale128 = _mm_set1_ps (scale); 4280 __m128 shift128 = _mm_set1_ps (shift); 4281 for(; x <= size.width - 8; x += 8 ) 4282 { 4283 __m128i r0 = _mm_loadl_epi64((const __m128i*)(src + x)); 4284 __m128i r1 = _mm_loadl_epi64((const __m128i*)(src + x + 4)); 4285 __m128 rf0 
=_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(r0, r0), 16)); 4286 __m128 rf1 =_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(r1, r1), 16)); 4287 rf0 = _mm_add_ps(_mm_mul_ps(rf0, scale128), shift128); 4288 rf1 = _mm_add_ps(_mm_mul_ps(rf1, scale128), shift128); 4289 r0 = _mm_cvtps_epi32(rf0); 4290 r1 = _mm_cvtps_epi32(rf1); 4291 r0 = _mm_packs_epi32(r0, r1); 4292 _mm_storeu_si128((__m128i*)(dst + x), r0); 4293 } 4294 } 4295 #elif CV_NEON 4296 float32x4_t v_shift = vdupq_n_f32(shift); 4297 for(; x <= size.width - 8; x += 8 ) 4298 { 4299 int16x8_t v_src = vld1q_s16(src + x); 4300 float32x4_t v_tmp1 = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))); 4301 float32x4_t v_tmp2 = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))); 4302 4303 v_tmp1 = vaddq_f32(vmulq_n_f32(v_tmp1, scale), v_shift); 4304 v_tmp2 = vaddq_f32(vmulq_n_f32(v_tmp2, scale), v_shift); 4305 4306 vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_tmp1)), 4307 vqmovn_s32(cv_vrndq_s32_f32(v_tmp2)))); 4308 } 4309 #endif 4310 4311 for(; x < size.width; x++ ) 4312 dst[x] = saturate_cast<short>(src[x]*scale + shift); 4313 } 4314 } 4315 4316 template<> void 4317 cvtScale_<short, int, float>( const short* src, size_t sstep, 4318 int* dst, size_t dstep, Size size, 4319 float scale, float shift ) 4320 { 4321 sstep /= sizeof(src[0]); 4322 dstep /= sizeof(dst[0]); 4323 4324 for( ; size.height--; src += sstep, dst += dstep ) 4325 { 4326 int x = 0; 4327 4328 #if CV_AVX2 4329 if (USE_AVX2) 4330 { 4331 __m256 scale256 = _mm256_set1_ps(scale); 4332 __m256 shift256 = _mm256_set1_ps(shift); 4333 const int shuffle = 0xD8; 4334 4335 for ( ; x <= size.width - 16; x += 16) 4336 { 4337 __m256i v_src = _mm256_loadu_si256((const __m256i *)(src + x)); 4338 v_src = _mm256_permute4x64_epi64(v_src, shuffle); 4339 __m256i v_src_lo = _mm256_srai_epi32(_mm256_unpacklo_epi16(v_src, v_src), 16); 4340 __m256i v_src_hi = _mm256_srai_epi32(_mm256_unpackhi_epi16(v_src, v_src), 16); 4341 __m256 v_dst0 = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(v_src_lo), scale256), shift256); 4342 __m256 v_dst1 = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(v_src_hi), scale256), shift256); 4343 _mm256_storeu_si256((__m256i *)(dst + x), _mm256_cvtps_epi32(v_dst0)); 4344 _mm256_storeu_si256((__m256i *)(dst + x + 8), _mm256_cvtps_epi32(v_dst1)); 4345 } 4346 } 4347 #endif 4348 #if CV_SSE2 4349 if (USE_SSE2)//~5X 4350 { 4351 __m128 scale128 = _mm_set1_ps (scale); 4352 __m128 shift128 = _mm_set1_ps (shift); 4353 for(; x <= size.width - 8; x += 8 ) 4354 { 4355 __m128i r0 = _mm_loadu_si128((const __m128i*)(src + x)); 4356 4357 __m128 rf0 =_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(r0, r0), 16)); 4358 __m128 rf1 =_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(r0, r0), 16)); 4359 rf0 = _mm_add_ps(_mm_mul_ps(rf0, scale128), shift128); 4360 rf1 = _mm_add_ps(_mm_mul_ps(rf1, scale128), shift128); 4361 4362 _mm_storeu_si128((__m128i*)(dst + x), _mm_cvtps_epi32(rf0)); 4363 _mm_storeu_si128((__m128i*)(dst + x + 4), _mm_cvtps_epi32(rf1)); 4364 } 4365 } 4366 #elif CV_NEON 4367 float32x4_t v_shift = vdupq_n_f32(shift); 4368 for(; x <= size.width - 8; x += 8 ) 4369 { 4370 int16x8_t v_src = vld1q_s16(src + x); 4371 float32x4_t v_tmp1 = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))); 4372 float32x4_t v_tmp2 = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))); 4373 4374 v_tmp1 = vaddq_f32(vmulq_n_f32(v_tmp1, scale), v_shift); 4375 v_tmp2 = vaddq_f32(vmulq_n_f32(v_tmp2, scale), v_shift); 4376 4377 vst1q_s32(dst + x, cv_vrndq_s32_f32(v_tmp1)); 4378 vst1q_s32(dst + x + 4, 
cv_vrndq_s32_f32(v_tmp2)); 4379 } 4380 #endif 4381 4382 for(; x < size.width; x++ ) 4383 dst[x] = saturate_cast<int>(src[x]*scale + shift); 4384 } 4385 } 4386 4387 template <typename T, typename DT> 4388 struct Cvt_SIMD 4389 { 4390 int operator() (const T *, DT *, int) const 4391 { 4392 return 0; 4393 } 4394 }; 4395 4396 #if CV_SSE2 4397 4398 // from double 4399 4400 template <> 4401 struct Cvt_SIMD<double, uchar> 4402 { 4403 int operator() (const double * src, uchar * dst, int width) const 4404 { 4405 int x = 0; 4406 4407 if (!USE_SSE2) 4408 return x; 4409 4410 for ( ; x <= width - 8; x += 8) 4411 { 4412 __m128 v_src0 = _mm_cvtpd_ps(_mm_loadu_pd(src + x)); 4413 __m128 v_src1 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2)); 4414 __m128 v_src2 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 4)); 4415 __m128 v_src3 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6)); 4416 4417 v_src0 = _mm_movelh_ps(v_src0, v_src1); 4418 v_src1 = _mm_movelh_ps(v_src2, v_src3); 4419 4420 __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_src0), 4421 _mm_cvtps_epi32(v_src1)); 4422 _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst, v_dst)); 4423 } 4424 4425 return x; 4426 } 4427 }; 4428 4429 template <> 4430 struct Cvt_SIMD<double, schar> 4431 { 4432 int operator() (const double * src, schar * dst, int width) const 4433 { 4434 int x = 0; 4435 4436 if (!USE_SSE2) 4437 return x; 4438 4439 for ( ; x <= width - 8; x += 8) 4440 { 4441 __m128 v_src0 = _mm_cvtpd_ps(_mm_loadu_pd(src + x)); 4442 __m128 v_src1 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2)); 4443 __m128 v_src2 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 4)); 4444 __m128 v_src3 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6)); 4445 4446 v_src0 = _mm_movelh_ps(v_src0, v_src1); 4447 v_src1 = _mm_movelh_ps(v_src2, v_src3); 4448 4449 __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_src0), 4450 _mm_cvtps_epi32(v_src1)); 4451 _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst, v_dst)); 4452 } 4453 4454 return x; 4455 } 4456 }; 4457 4458 #if CV_SSE4_1 4459 4460 template <> 4461 struct Cvt_SIMD<double, ushort> 4462 { 4463 bool haveSIMD; 4464 Cvt_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE4_1); } 4465 4466 int operator() (const double * src, ushort * dst, int width) const 4467 { 4468 int x = 0; 4469 4470 if (!haveSIMD) 4471 return x; 4472 4473 for ( ; x <= width - 8; x += 8) 4474 { 4475 __m128 v_src0 = _mm_cvtpd_ps(_mm_loadu_pd(src + x)); 4476 __m128 v_src1 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2)); 4477 __m128 v_src2 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 4)); 4478 __m128 v_src3 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6)); 4479 4480 v_src0 = _mm_movelh_ps(v_src0, v_src1); 4481 v_src1 = _mm_movelh_ps(v_src2, v_src3); 4482 4483 __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_src0), 4484 _mm_cvtps_epi32(v_src1)); 4485 _mm_storeu_si128((__m128i *)(dst + x), v_dst); 4486 } 4487 4488 return x; 4489 } 4490 }; 4491 4492 #endif // CV_SSE4_1 4493 4494 template <> 4495 struct Cvt_SIMD<double, short> 4496 { 4497 int operator() (const double * src, short * dst, int width) const 4498 { 4499 int x = 0; 4500 4501 if (!USE_SSE2) 4502 return x; 4503 4504 for ( ; x <= width - 8; x += 8) 4505 { 4506 __m128 v_src0 = _mm_cvtpd_ps(_mm_loadu_pd(src + x)); 4507 __m128 v_src1 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2)); 4508 __m128 v_src2 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 4)); 4509 __m128 v_src3 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6)); 4510 4511 v_src0 = _mm_movelh_ps(v_src0, v_src1); 4512 v_src1 = _mm_movelh_ps(v_src2, v_src3); 4513 4514 __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_src0), 4515 
_mm_cvtps_epi32(v_src1)); 4516 _mm_storeu_si128((__m128i *)(dst + x), v_dst); 4517 } 4518 4519 return x; 4520 } 4521 }; 4522 4523 template <> 4524 struct Cvt_SIMD<double, int> 4525 { 4526 int operator() (const double * src, int * dst, int width) const 4527 { 4528 int x = 0; 4529 4530 if (!USE_SSE2) 4531 return x; 4532 4533 for ( ; x <= width - 4; x += 4) 4534 { 4535 __m128 v_src0 = _mm_cvtpd_ps(_mm_loadu_pd(src + x)); 4536 __m128 v_src1 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2)); 4537 v_src0 = _mm_movelh_ps(v_src0, v_src1); 4538 4539 _mm_storeu_si128((__m128i *)(dst + x), _mm_cvtps_epi32(v_src0)); 4540 } 4541 4542 return x; 4543 } 4544 }; 4545 4546 template <> 4547 struct Cvt_SIMD<double, float> 4548 { 4549 int operator() (const double * src, float * dst, int width) const 4550 { 4551 int x = 0; 4552 4553 if (!USE_SSE2) 4554 return x; 4555 4556 for ( ; x <= width - 4; x += 4) 4557 { 4558 __m128 v_src0 = _mm_cvtpd_ps(_mm_loadu_pd(src + x)); 4559 __m128 v_src1 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2)); 4560 4561 _mm_storeu_ps(dst + x, _mm_movelh_ps(v_src0, v_src1)); 4562 } 4563 4564 return x; 4565 } 4566 }; 4567 4568 4569 #elif CV_NEON 4570 4571 // from uchar 4572 4573 template <> 4574 struct Cvt_SIMD<uchar, schar> 4575 { 4576 int operator() (const uchar * src, schar * dst, int width) const 4577 { 4578 int x = 0; 4579 4580 for ( ; x <= width - 8; x += 8) 4581 vst1_s8(dst + x, vqmovn_s16(vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src + x))))); 4582 4583 return x; 4584 } 4585 }; 4586 4587 4588 template <> 4589 struct Cvt_SIMD<uchar, ushort> 4590 { 4591 int operator() (const uchar * src, ushort * dst, int width) const 4592 { 4593 int x = 0; 4594 4595 for ( ; x <= width - 8; x += 8) 4596 vst1q_u16(dst + x, vmovl_u8(vld1_u8(src + x))); 4597 4598 return x; 4599 } 4600 }; 4601 4602 template <> 4603 struct Cvt_SIMD<uchar, short> 4604 { 4605 int operator() (const uchar * src, short * dst, int width) const 4606 { 4607 int x = 0; 4608 4609 for ( ; x <= width - 8; x += 8) 4610 vst1q_s16(dst + x, vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src + x)))); 4611 4612 return x; 4613 } 4614 }; 4615 4616 template <> 4617 struct Cvt_SIMD<uchar, int> 4618 { 4619 int operator() (const uchar * src, int * dst, int width) const 4620 { 4621 int x = 0; 4622 4623 for ( ; x <= width - 8; x += 8) 4624 { 4625 uint16x8_t v_src = vmovl_u8(vld1_u8(src + x)); 4626 vst1q_s32(dst + x, vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src)))); 4627 vst1q_s32(dst + x + 4, vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src)))); 4628 } 4629 4630 return x; 4631 } 4632 }; 4633 4634 template <> 4635 struct Cvt_SIMD<uchar, float> 4636 { 4637 int operator() (const uchar * src, float * dst, int width) const 4638 { 4639 int x = 0; 4640 4641 for ( ; x <= width - 8; x += 8) 4642 { 4643 uint16x8_t v_src = vmovl_u8(vld1_u8(src + x)); 4644 vst1q_f32(dst + x, vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src)))); 4645 vst1q_f32(dst + x + 4, vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src)))); 4646 } 4647 4648 return x; 4649 } 4650 }; 4651 4652 // from schar 4653 4654 template <> 4655 struct Cvt_SIMD<schar, uchar> 4656 { 4657 int operator() (const schar * src, uchar * dst, int width) const 4658 { 4659 int x = 0; 4660 4661 for ( ; x <= width - 8; x += 8) 4662 vst1_u8(dst + x, vqmovun_s16(vmovl_s8(vld1_s8(src + x)))); 4663 4664 return x; 4665 } 4666 }; 4667 4668 template <> 4669 struct Cvt_SIMD<schar, short> 4670 { 4671 int operator() (const schar * src, short * dst, int width) const 4672 { 4673 int x = 0; 4674 4675 for ( ; x <= width - 8; x += 8) 4676 vst1q_s16(dst + x, 
vmovl_s8(vld1_s8(src + x))); 4677 4678 return x; 4679 } 4680 }; 4681 4682 template <> 4683 struct Cvt_SIMD<schar, ushort> 4684 { 4685 int operator() (const schar * src, ushort * dst, int width) const 4686 { 4687 int x = 0; 4688 4689 for ( ; x <= width - 8; x += 8) 4690 { 4691 int16x8_t v_src = vmovl_s8(vld1_s8(src + x)); 4692 vst1q_u16(dst + x, vcombine_u16(vqmovun_s32(vmovl_s16(vget_low_s16(v_src))), 4693 vqmovun_s32(vmovl_s16(vget_high_s16(v_src))))); 4694 } 4695 4696 return x; 4697 } 4698 }; 4699 4700 4701 template <> 4702 struct Cvt_SIMD<schar, int> 4703 { 4704 int operator() (const schar * src, int * dst, int width) const 4705 { 4706 int x = 0; 4707 4708 for ( ; x <= width - 8; x += 8) 4709 { 4710 int16x8_t v_src = vmovl_s8(vld1_s8(src + x)); 4711 vst1q_s32(dst + x, vmovl_s16(vget_low_s16(v_src))); 4712 vst1q_s32(dst + x + 4, vmovl_s16(vget_high_s16(v_src))); 4713 } 4714 4715 return x; 4716 } 4717 }; 4718 4719 template <> 4720 struct Cvt_SIMD<schar, float> 4721 { 4722 int operator() (const schar * src, float * dst, int width) const 4723 { 4724 int x = 0; 4725 4726 for ( ; x <= width - 8; x += 8) 4727 { 4728 int16x8_t v_src = vmovl_s8(vld1_s8(src + x)); 4729 vst1q_f32(dst + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src)))); 4730 vst1q_f32(dst + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src)))); 4731 } 4732 4733 return x; 4734 } 4735 }; 4736 4737 // from ushort 4738 4739 template <> 4740 struct Cvt_SIMD<ushort, uchar> 4741 { 4742 int operator() (const ushort * src, uchar * dst, int width) const 4743 { 4744 int x = 0; 4745 4746 for ( ; x <= width - 16; x += 16) 4747 { 4748 uint16x8_t v_src1 = vld1q_u16(src + x), v_src2 = vld1q_u16(src + x + 8); 4749 vst1q_u8(dst + x, vcombine_u8(vqmovn_u16(v_src1), vqmovn_u16(v_src2))); 4750 } 4751 4752 return x; 4753 } 4754 }; 4755 4756 template <> 4757 struct Cvt_SIMD<ushort, schar> 4758 { 4759 int operator() (const ushort * src, schar * dst, int width) const 4760 { 4761 int x = 0; 4762 4763 for ( ; x <= width - 16; x += 16) 4764 { 4765 uint16x8_t v_src1 = vld1q_u16(src + x), v_src2 = vld1q_u16(src + x + 8); 4766 int32x4_t v_dst10 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src1))); 4767 int32x4_t v_dst11 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src1))); 4768 int32x4_t v_dst20 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src2))); 4769 int32x4_t v_dst21 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src2))); 4770 4771 vst1q_s8(dst + x, vcombine_s8(vqmovn_s16(vcombine_s16(vqmovn_s32(v_dst10), vqmovn_s32(v_dst11))), 4772 vqmovn_s16(vcombine_s16(vqmovn_s32(v_dst20), vqmovn_s32(v_dst21))))); 4773 } 4774 4775 return x; 4776 } 4777 }; 4778 4779 template <> 4780 struct Cvt_SIMD<ushort, short> 4781 { 4782 int operator() (const ushort * src, short * dst, int width) const 4783 { 4784 int x = 0; 4785 4786 for ( ; x <= width - 8; x += 8) 4787 { 4788 uint16x8_t v_src = vld1q_u16(src + x); 4789 int32x4_t v_dst0 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src))); 4790 int32x4_t v_dst1 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src))); 4791 4792 vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(v_dst0), vqmovn_s32(v_dst1))); 4793 } 4794 4795 return x; 4796 } 4797 }; 4798 4799 template <> 4800 struct Cvt_SIMD<ushort, int> 4801 { 4802 int operator() (const ushort * src, int * dst, int width) const 4803 { 4804 int x = 0; 4805 4806 for ( ; x <= width - 8; x += 8) 4807 { 4808 uint16x8_t v_src = vld1q_u16(src + x); 4809 vst1q_s32(dst + x, vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src)))); 4810 vst1q_s32(dst + x + 4, 
vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src)))); 4811 } 4812 4813 return x; 4814 } 4815 }; 4816 4817 template <> 4818 struct Cvt_SIMD<ushort, float> 4819 { 4820 int operator() (const ushort * src, float * dst, int width) const 4821 { 4822 int x = 0; 4823 4824 for ( ; x <= width - 8; x += 8) 4825 { 4826 uint16x8_t v_src = vld1q_u16(src + x); 4827 vst1q_f32(dst + x, vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src)))); 4828 vst1q_f32(dst + x + 4, vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src)))); 4829 } 4830 4831 return x; 4832 } 4833 }; 4834 4835 // from short 4836 4837 template <> 4838 struct Cvt_SIMD<short, uchar> 4839 { 4840 int operator() (const short * src, uchar * dst, int width) const 4841 { 4842 int x = 0; 4843 4844 for ( ; x <= width - 16; x += 16) 4845 { 4846 int16x8_t v_src1 = vld1q_s16(src + x), v_src2 = vld1q_s16(src + x + 8); 4847 vst1q_u8(dst + x, vcombine_u8(vqmovun_s16(v_src1), vqmovun_s16(v_src2))); 4848 } 4849 4850 return x; 4851 } 4852 }; 4853 4854 template <> 4855 struct Cvt_SIMD<short, schar> 4856 { 4857 int operator() (const short * src, schar * dst, int width) const 4858 { 4859 int x = 0; 4860 4861 for ( ; x <= width - 16; x += 16) 4862 { 4863 int16x8_t v_src1 = vld1q_s16(src + x), v_src2 = vld1q_s16(src + x + 8); 4864 vst1q_s8(dst + x, vcombine_s8(vqmovn_s16(v_src1), vqmovn_s16(v_src2))); 4865 } 4866 4867 return x; 4868 } 4869 }; 4870 4871 template <> 4872 struct Cvt_SIMD<short, ushort> 4873 { 4874 int operator() (const short * src, ushort * dst, int width) const 4875 { 4876 int x = 0; 4877 4878 for ( ; x <= width - 8; x += 8) 4879 { 4880 int16x8_t v_src = vld1q_s16(src + x); 4881 uint16x4_t v_dst1 = vqmovun_s32(vmovl_s16(vget_low_s16(v_src))); 4882 uint16x4_t v_dst2 = vqmovun_s32(vmovl_s16(vget_high_s16(v_src))); 4883 vst1q_u16(dst + x, vcombine_u16(v_dst1, v_dst2)); 4884 } 4885 4886 return x; 4887 } 4888 }; 4889 4890 template <> 4891 struct Cvt_SIMD<short, int> 4892 { 4893 int operator() (const short * src, int * dst, int width) const 4894 { 4895 int x = 0; 4896 4897 for ( ; x <= width - 8; x += 8) 4898 { 4899 int16x8_t v_src = vld1q_s16(src + x); 4900 vst1q_s32(dst + x, vmovl_s16(vget_low_s16(v_src))); 4901 vst1q_s32(dst + x + 4, vmovl_s16(vget_high_s16(v_src))); 4902 } 4903 4904 return x; 4905 } 4906 }; 4907 4908 template <> 4909 struct Cvt_SIMD<short, float> 4910 { 4911 int operator() (const short * src, float * dst, int width) const 4912 { 4913 int x = 0; 4914 4915 for ( ; x <= width - 8; x += 8) 4916 { 4917 int16x8_t v_src = vld1q_s16(src + x); 4918 vst1q_f32(dst + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src)))); 4919 vst1q_f32(dst + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src)))); 4920 } 4921 4922 return x; 4923 } 4924 }; 4925 4926 // from int 4927 4928 template <> 4929 struct Cvt_SIMD<int, uchar> 4930 { 4931 int operator() (const int * src, uchar * dst, int width) const 4932 { 4933 int x = 0; 4934 4935 for ( ; x <= width - 16; x += 16) 4936 { 4937 int32x4_t v_src1 = vld1q_s32(src + x), v_src2 = vld1q_s32(src + x + 4); 4938 int32x4_t v_src3 = vld1q_s32(src + x + 8), v_src4 = vld1q_s32(src + x + 12); 4939 uint8x8_t v_dst1 = vqmovn_u16(vcombine_u16(vqmovun_s32(v_src1), vqmovun_s32(v_src2))); 4940 uint8x8_t v_dst2 = vqmovn_u16(vcombine_u16(vqmovun_s32(v_src3), vqmovun_s32(v_src4))); 4941 vst1q_u8(dst + x, vcombine_u8(v_dst1, v_dst2)); 4942 } 4943 4944 return x; 4945 } 4946 }; 4947 4948 template <> 4949 struct Cvt_SIMD<int, schar> 4950 { 4951 int operator() (const int * src, schar * dst, int width) const 4952 { 4953 int x = 0; 4954 4955 for ( ; x <= 
width - 16; x += 16) 4956 { 4957 int32x4_t v_src1 = vld1q_s32(src + x), v_src2 = vld1q_s32(src + x + 4); 4958 int32x4_t v_src3 = vld1q_s32(src + x + 8), v_src4 = vld1q_s32(src + x + 12); 4959 int8x8_t v_dst1 = vqmovn_s16(vcombine_s16(vqmovn_s32(v_src1), vqmovn_s32(v_src2))); 4960 int8x8_t v_dst2 = vqmovn_s16(vcombine_s16(vqmovn_s32(v_src3), vqmovn_s32(v_src4))); 4961 vst1q_s8(dst + x, vcombine_s8(v_dst1, v_dst2)); 4962 } 4963 4964 return x; 4965 } 4966 }; 4967 4968 4969 template <> 4970 struct Cvt_SIMD<int, ushort> 4971 { 4972 int operator() (const int * src, ushort * dst, int width) const 4973 { 4974 int x = 0; 4975 4976 for ( ; x <= width - 8; x += 8) 4977 { 4978 int32x4_t v_src1 = vld1q_s32(src + x), v_src2 = vld1q_s32(src + x + 4); 4979 vst1q_u16(dst + x, vcombine_u16(vqmovun_s32(v_src1), vqmovun_s32(v_src2))); 4980 } 4981 4982 return x; 4983 } 4984 }; 4985 4986 template <> 4987 struct Cvt_SIMD<int, short> 4988 { 4989 int operator() (const int * src, short * dst, int width) const 4990 { 4991 int x = 0; 4992 4993 for ( ; x <= width - 8; x += 8) 4994 { 4995 int32x4_t v_src1 = vld1q_s32(src + x), v_src2 = vld1q_s32(src + x + 4); 4996 vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(v_src1), vqmovn_s32(v_src2))); 4997 } 4998 4999 return x; 5000 } 5001 }; 5002 5003 template <> 5004 struct Cvt_SIMD<int, float> 5005 { 5006 int operator() (const int * src, float * dst, int width) const 5007 { 5008 int x = 0; 5009 5010 for ( ; x <= width - 4; x += 4) 5011 vst1q_f32(dst + x, vcvtq_f32_s32(vld1q_s32(src + x))); 5012 5013 return x; 5014 } 5015 }; 5016 5017 // from float 5018 5019 template <> 5020 struct Cvt_SIMD<float, uchar> 5021 { 5022 int operator() (const float * src, uchar * dst, int width) const 5023 { 5024 int x = 0; 5025 5026 for ( ; x <= width - 16; x += 16) 5027 { 5028 uint32x4_t v_src1 = cv_vrndq_u32_f32(vld1q_f32(src + x)); 5029 uint32x4_t v_src2 = cv_vrndq_u32_f32(vld1q_f32(src + x + 4)); 5030 uint32x4_t v_src3 = cv_vrndq_u32_f32(vld1q_f32(src + x + 8)); 5031 uint32x4_t v_src4 = cv_vrndq_u32_f32(vld1q_f32(src + x + 12)); 5032 uint8x8_t v_dst1 = vqmovn_u16(vcombine_u16(vqmovn_u32(v_src1), vqmovn_u32(v_src2))); 5033 uint8x8_t v_dst2 = vqmovn_u16(vcombine_u16(vqmovn_u32(v_src3), vqmovn_u32(v_src4))); 5034 vst1q_u8(dst + x, vcombine_u8(v_dst1, v_dst2)); 5035 } 5036 5037 return x; 5038 } 5039 }; 5040 5041 template <> 5042 struct Cvt_SIMD<float, schar> 5043 { 5044 int operator() (const float * src, schar * dst, int width) const 5045 { 5046 int x = 0; 5047 5048 for ( ; x <= width - 16; x += 16) 5049 { 5050 int32x4_t v_src1 = cv_vrndq_s32_f32(vld1q_f32(src + x)); 5051 int32x4_t v_src2 = cv_vrndq_s32_f32(vld1q_f32(src + x + 4)); 5052 int32x4_t v_src3 = cv_vrndq_s32_f32(vld1q_f32(src + x + 8)); 5053 int32x4_t v_src4 = cv_vrndq_s32_f32(vld1q_f32(src + x + 12)); 5054 int8x8_t v_dst1 = vqmovn_s16(vcombine_s16(vqmovn_s32(v_src1), vqmovn_s32(v_src2))); 5055 int8x8_t v_dst2 = vqmovn_s16(vcombine_s16(vqmovn_s32(v_src3), vqmovn_s32(v_src4))); 5056 vst1q_s8(dst + x, vcombine_s8(v_dst1, v_dst2)); 5057 } 5058 5059 return x; 5060 } 5061 }; 5062 5063 5064 template <> 5065 struct Cvt_SIMD<float, ushort> 5066 { 5067 int operator() (const float * src, ushort * dst, int width) const 5068 { 5069 int x = 0; 5070 5071 for ( ; x <= width - 8; x += 8) 5072 { 5073 uint32x4_t v_src1 = cv_vrndq_u32_f32(vld1q_f32(src + x)); 5074 uint32x4_t v_src2 = cv_vrndq_u32_f32(vld1q_f32(src + x + 4)); 5075 vst1q_u16(dst + x, vcombine_u16(vqmovn_u32(v_src1), vqmovn_u32(v_src2))); 5076 } 5077 5078 return x; 5079 } 5080 }; 5081 5082 
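// Note: this NEON block provides no Cvt_SIMD<float, short> specialization; the
// float -> short path is instead vectorized directly in the cvt_<float, short>
// template specialization defined later in this file (see the "vz optimized"
// cvt_<float, short> below), so the generic Cvt_SIMD fallback is never the hot path.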
template <> 5083 struct Cvt_SIMD<float, int> 5084 { 5085 int operator() (const float * src, int * dst, int width) const 5086 { 5087 int x = 0; 5088 5089 for ( ; x <= width - 4; x += 4) 5090 vst1q_s32(dst + x, cv_vrndq_s32_f32(vld1q_f32(src + x))); 5091 5092 return x; 5093 } 5094 }; 5095 5096 #endif 5097 5098 template<typename T, typename DT> static void 5099 cvt_( const T* src, size_t sstep, 5100 DT* dst, size_t dstep, Size size ) 5101 { 5102 sstep /= sizeof(src[0]); 5103 dstep /= sizeof(dst[0]); 5104 Cvt_SIMD<T, DT> vop; 5105 5106 for( ; size.height--; src += sstep, dst += dstep ) 5107 { 5108 int x = vop(src, dst, size.width); 5109 #if CV_ENABLE_UNROLLED 5110 for( ; x <= size.width - 4; x += 4 ) 5111 { 5112 DT t0, t1; 5113 t0 = saturate_cast<DT>(src[x]); 5114 t1 = saturate_cast<DT>(src[x+1]); 5115 dst[x] = t0; dst[x+1] = t1; 5116 t0 = saturate_cast<DT>(src[x+2]); 5117 t1 = saturate_cast<DT>(src[x+3]); 5118 dst[x+2] = t0; dst[x+3] = t1; 5119 } 5120 #endif 5121 for( ; x < size.width; x++ ) 5122 dst[x] = saturate_cast<DT>(src[x]); 5123 } 5124 } 5125 5126 //vz optimized template specialization, test Core_ConvertScale/ElemWiseTest 5127 template<> void 5128 cvt_<float, short>( const float* src, size_t sstep, 5129 short* dst, size_t dstep, Size size ) 5130 { 5131 sstep /= sizeof(src[0]); 5132 dstep /= sizeof(dst[0]); 5133 5134 for( ; size.height--; src += sstep, dst += dstep ) 5135 { 5136 int x = 0; 5137 #if CV_SSE2 5138 if(USE_SSE2) 5139 { 5140 for( ; x <= size.width - 8; x += 8 ) 5141 { 5142 __m128 src128 = _mm_loadu_ps (src + x); 5143 __m128i src_int128 = _mm_cvtps_epi32 (src128); 5144 5145 src128 = _mm_loadu_ps (src + x + 4); 5146 __m128i src1_int128 = _mm_cvtps_epi32 (src128); 5147 5148 src1_int128 = _mm_packs_epi32(src_int128, src1_int128); 5149 _mm_storeu_si128((__m128i*)(dst + x),src1_int128); 5150 } 5151 } 5152 #elif CV_NEON 5153 for( ; x <= size.width - 8; x += 8 ) 5154 { 5155 float32x4_t v_src1 = vld1q_f32(src + x), v_src2 = vld1q_f32(src + x + 4); 5156 int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_src1)), 5157 vqmovn_s32(cv_vrndq_s32_f32(v_src2))); 5158 vst1q_s16(dst + x, v_dst); 5159 } 5160 #endif 5161 for( ; x < size.width; x++ ) 5162 dst[x] = saturate_cast<short>(src[x]); 5163 } 5164 5165 } 5166 5167 5168 template<typename T> static void 5169 cpy_( const T* src, size_t sstep, T* dst, size_t dstep, Size size ) 5170 { 5171 sstep /= sizeof(src[0]); 5172 dstep /= sizeof(dst[0]); 5173 5174 for( ; size.height--; src += sstep, dst += dstep ) 5175 memcpy(dst, src, size.width*sizeof(src[0])); 5176 } 5177 5178 #define DEF_CVT_SCALE_ABS_FUNC(suffix, tfunc, stype, dtype, wtype) \ 5179 static void cvtScaleAbs##suffix( const stype* src, size_t sstep, const uchar*, size_t, \ 5180 dtype* dst, size_t dstep, Size size, double* scale) \ 5181 { \ 5182 tfunc(src, sstep, dst, dstep, size, (wtype)scale[0], (wtype)scale[1]); \ 5183 } 5184 5185 #define DEF_CVT_SCALE_FUNC(suffix, stype, dtype, wtype) \ 5186 static void cvtScale##suffix( const stype* src, size_t sstep, const uchar*, size_t, \ 5187 dtype* dst, size_t dstep, Size size, double* scale) \ 5188 { \ 5189 cvtScale_(src, sstep, dst, dstep, size, (wtype)scale[0], (wtype)scale[1]); \ 5190 } 5191 5192 #if defined(HAVE_IPP) 5193 #define DEF_CVT_FUNC_F(suffix, stype, dtype, ippFavor) \ 5194 static void cvt##suffix( const stype* src, size_t sstep, const uchar*, size_t, \ 5195 dtype* dst, size_t dstep, Size size, double*) \ 5196 { \ 5197 CV_IPP_CHECK()\ 5198 {\ 5199 if (src && dst)\ 5200 {\ 5201 if (ippiConvert_##ippFavor(src, (int)sstep, 
dst, (int)dstep, ippiSize(size.width, size.height)) >= 0) \ 5202 {\ 5203 CV_IMPL_ADD(CV_IMPL_IPP)\ 5204 return; \ 5205 }\ 5206 setIppErrorStatus(); \ 5207 }\ 5208 }\ 5209 cvt_(src, sstep, dst, dstep, size); \ 5210 } 5211 5212 #define DEF_CVT_FUNC_F2(suffix, stype, dtype, ippFavor) \ 5213 static void cvt##suffix( const stype* src, size_t sstep, const uchar*, size_t, \ 5214 dtype* dst, size_t dstep, Size size, double*) \ 5215 { \ 5216 CV_IPP_CHECK()\ 5217 {\ 5218 if (src && dst)\ 5219 {\ 5220 if (ippiConvert_##ippFavor(src, (int)sstep, dst, (int)dstep, ippiSize(size.width, size.height), ippRndFinancial, 0) >= 0) \ 5221 {\ 5222 CV_IMPL_ADD(CV_IMPL_IPP)\ 5223 return; \ 5224 }\ 5225 setIppErrorStatus(); \ 5226 }\ 5227 }\ 5228 cvt_(src, sstep, dst, dstep, size); \ 5229 } 5230 #else 5231 #define DEF_CVT_FUNC_F(suffix, stype, dtype, ippFavor) \ 5232 static void cvt##suffix( const stype* src, size_t sstep, const uchar*, size_t, \ 5233 dtype* dst, size_t dstep, Size size, double*) \ 5234 { \ 5235 cvt_(src, sstep, dst, dstep, size); \ 5236 } 5237 #define DEF_CVT_FUNC_F2 DEF_CVT_FUNC_F 5238 #endif 5239 5240 #define DEF_CVT_FUNC(suffix, stype, dtype) \ 5241 static void cvt##suffix( const stype* src, size_t sstep, const uchar*, size_t, \ 5242 dtype* dst, size_t dstep, Size size, double*) \ 5243 { \ 5244 cvt_(src, sstep, dst, dstep, size); \ 5245 } 5246 5247 #define DEF_CPY_FUNC(suffix, stype) \ 5248 static void cvt##suffix( const stype* src, size_t sstep, const uchar*, size_t, \ 5249 stype* dst, size_t dstep, Size size, double*) \ 5250 { \ 5251 cpy_(src, sstep, dst, dstep, size); \ 5252 } 5253 5254 5255 DEF_CVT_SCALE_ABS_FUNC(8u, cvtScaleAbs_, uchar, uchar, float) 5256 DEF_CVT_SCALE_ABS_FUNC(8s8u, cvtScaleAbs_, schar, uchar, float) 5257 DEF_CVT_SCALE_ABS_FUNC(16u8u, cvtScaleAbs_, ushort, uchar, float) 5258 DEF_CVT_SCALE_ABS_FUNC(16s8u, cvtScaleAbs_, short, uchar, float) 5259 DEF_CVT_SCALE_ABS_FUNC(32s8u, cvtScaleAbs_, int, uchar, float) 5260 DEF_CVT_SCALE_ABS_FUNC(32f8u, cvtScaleAbs_, float, uchar, float) 5261 DEF_CVT_SCALE_ABS_FUNC(64f8u, cvtScaleAbs_, double, uchar, float) 5262 5263 DEF_CVT_SCALE_FUNC(8u, uchar, uchar, float) 5264 DEF_CVT_SCALE_FUNC(8s8u, schar, uchar, float) 5265 DEF_CVT_SCALE_FUNC(16u8u, ushort, uchar, float) 5266 DEF_CVT_SCALE_FUNC(16s8u, short, uchar, float) 5267 DEF_CVT_SCALE_FUNC(32s8u, int, uchar, float) 5268 DEF_CVT_SCALE_FUNC(32f8u, float, uchar, float) 5269 DEF_CVT_SCALE_FUNC(64f8u, double, uchar, float) 5270 5271 DEF_CVT_SCALE_FUNC(8u8s, uchar, schar, float) 5272 DEF_CVT_SCALE_FUNC(8s, schar, schar, float) 5273 DEF_CVT_SCALE_FUNC(16u8s, ushort, schar, float) 5274 DEF_CVT_SCALE_FUNC(16s8s, short, schar, float) 5275 DEF_CVT_SCALE_FUNC(32s8s, int, schar, float) 5276 DEF_CVT_SCALE_FUNC(32f8s, float, schar, float) 5277 DEF_CVT_SCALE_FUNC(64f8s, double, schar, float) 5278 5279 DEF_CVT_SCALE_FUNC(8u16u, uchar, ushort, float) 5280 DEF_CVT_SCALE_FUNC(8s16u, schar, ushort, float) 5281 DEF_CVT_SCALE_FUNC(16u, ushort, ushort, float) 5282 DEF_CVT_SCALE_FUNC(16s16u, short, ushort, float) 5283 DEF_CVT_SCALE_FUNC(32s16u, int, ushort, float) 5284 DEF_CVT_SCALE_FUNC(32f16u, float, ushort, float) 5285 DEF_CVT_SCALE_FUNC(64f16u, double, ushort, float) 5286 5287 DEF_CVT_SCALE_FUNC(8u16s, uchar, short, float) 5288 DEF_CVT_SCALE_FUNC(8s16s, schar, short, float) 5289 DEF_CVT_SCALE_FUNC(16u16s, ushort, short, float) 5290 DEF_CVT_SCALE_FUNC(16s, short, short, float) 5291 DEF_CVT_SCALE_FUNC(32s16s, int, short, float) 5292 DEF_CVT_SCALE_FUNC(32f16s, float, short, float) 5293 DEF_CVT_SCALE_FUNC(64f16s, 
double, short, float) 5294 5295 DEF_CVT_SCALE_FUNC(8u32s, uchar, int, float) 5296 DEF_CVT_SCALE_FUNC(8s32s, schar, int, float) 5297 DEF_CVT_SCALE_FUNC(16u32s, ushort, int, float) 5298 DEF_CVT_SCALE_FUNC(16s32s, short, int, float) 5299 DEF_CVT_SCALE_FUNC(32s, int, int, double) 5300 DEF_CVT_SCALE_FUNC(32f32s, float, int, float) 5301 DEF_CVT_SCALE_FUNC(64f32s, double, int, double) 5302 5303 DEF_CVT_SCALE_FUNC(8u32f, uchar, float, float) 5304 DEF_CVT_SCALE_FUNC(8s32f, schar, float, float) 5305 DEF_CVT_SCALE_FUNC(16u32f, ushort, float, float) 5306 DEF_CVT_SCALE_FUNC(16s32f, short, float, float) 5307 DEF_CVT_SCALE_FUNC(32s32f, int, float, double) 5308 DEF_CVT_SCALE_FUNC(32f, float, float, float) 5309 DEF_CVT_SCALE_FUNC(64f32f, double, float, double) 5310 5311 DEF_CVT_SCALE_FUNC(8u64f, uchar, double, double) 5312 DEF_CVT_SCALE_FUNC(8s64f, schar, double, double) 5313 DEF_CVT_SCALE_FUNC(16u64f, ushort, double, double) 5314 DEF_CVT_SCALE_FUNC(16s64f, short, double, double) 5315 DEF_CVT_SCALE_FUNC(32s64f, int, double, double) 5316 DEF_CVT_SCALE_FUNC(32f64f, float, double, double) 5317 DEF_CVT_SCALE_FUNC(64f, double, double, double) 5318 5319 DEF_CPY_FUNC(8u, uchar) 5320 DEF_CVT_FUNC_F(8s8u, schar, uchar, 8s8u_C1Rs) 5321 DEF_CVT_FUNC_F(16u8u, ushort, uchar, 16u8u_C1R) 5322 DEF_CVT_FUNC_F(16s8u, short, uchar, 16s8u_C1R) 5323 DEF_CVT_FUNC_F(32s8u, int, uchar, 32s8u_C1R) 5324 DEF_CVT_FUNC_F2(32f8u, float, uchar, 32f8u_C1RSfs) 5325 DEF_CVT_FUNC(64f8u, double, uchar) 5326 5327 DEF_CVT_FUNC_F2(8u8s, uchar, schar, 8u8s_C1RSfs) 5328 DEF_CVT_FUNC_F2(16u8s, ushort, schar, 16u8s_C1RSfs) 5329 DEF_CVT_FUNC_F2(16s8s, short, schar, 16s8s_C1RSfs) 5330 DEF_CVT_FUNC_F(32s8s, int, schar, 32s8s_C1R) 5331 DEF_CVT_FUNC_F2(32f8s, float, schar, 32f8s_C1RSfs) 5332 DEF_CVT_FUNC(64f8s, double, schar) 5333 5334 DEF_CVT_FUNC_F(8u16u, uchar, ushort, 8u16u_C1R) 5335 DEF_CVT_FUNC_F(8s16u, schar, ushort, 8s16u_C1Rs) 5336 DEF_CPY_FUNC(16u, ushort) 5337 DEF_CVT_FUNC_F(16s16u, short, ushort, 16s16u_C1Rs) 5338 DEF_CVT_FUNC_F2(32s16u, int, ushort, 32s16u_C1RSfs) 5339 DEF_CVT_FUNC_F2(32f16u, float, ushort, 32f16u_C1RSfs) 5340 DEF_CVT_FUNC(64f16u, double, ushort) 5341 5342 DEF_CVT_FUNC_F(8u16s, uchar, short, 8u16s_C1R) 5343 DEF_CVT_FUNC_F(8s16s, schar, short, 8s16s_C1R) 5344 DEF_CVT_FUNC_F2(16u16s, ushort, short, 16u16s_C1RSfs) 5345 DEF_CVT_FUNC_F2(32s16s, int, short, 32s16s_C1RSfs) 5346 DEF_CVT_FUNC(32f16s, float, short) 5347 DEF_CVT_FUNC(64f16s, double, short) 5348 5349 DEF_CVT_FUNC_F(8u32s, uchar, int, 8u32s_C1R) 5350 DEF_CVT_FUNC_F(8s32s, schar, int, 8s32s_C1R) 5351 DEF_CVT_FUNC_F(16u32s, ushort, int, 16u32s_C1R) 5352 DEF_CVT_FUNC_F(16s32s, short, int, 16s32s_C1R) 5353 DEF_CPY_FUNC(32s, int) 5354 DEF_CVT_FUNC_F2(32f32s, float, int, 32f32s_C1RSfs) 5355 DEF_CVT_FUNC(64f32s, double, int) 5356 5357 DEF_CVT_FUNC_F(8u32f, uchar, float, 8u32f_C1R) 5358 DEF_CVT_FUNC_F(8s32f, schar, float, 8s32f_C1R) 5359 DEF_CVT_FUNC_F(16u32f, ushort, float, 16u32f_C1R) 5360 DEF_CVT_FUNC_F(16s32f, short, float, 16s32f_C1R) 5361 DEF_CVT_FUNC_F(32s32f, int, float, 32s32f_C1R) 5362 DEF_CVT_FUNC(64f32f, double, float) 5363 5364 DEF_CVT_FUNC(8u64f, uchar, double) 5365 DEF_CVT_FUNC(8s64f, schar, double) 5366 DEF_CVT_FUNC(16u64f, ushort, double) 5367 DEF_CVT_FUNC(16s64f, short, double) 5368 DEF_CVT_FUNC(32s64f, int, double) 5369 DEF_CVT_FUNC(32f64f, float, double) 5370 DEF_CPY_FUNC(64s, int64) 5371 5372 static BinaryFunc getCvtScaleAbsFunc(int depth) 5373 { 5374 static BinaryFunc cvtScaleAbsTab[] = 5375 { 5376 (BinaryFunc)cvtScaleAbs8u, (BinaryFunc)cvtScaleAbs8s8u, 
(BinaryFunc)cvtScaleAbs16u8u, 5377 (BinaryFunc)cvtScaleAbs16s8u, (BinaryFunc)cvtScaleAbs32s8u, (BinaryFunc)cvtScaleAbs32f8u, 5378 (BinaryFunc)cvtScaleAbs64f8u, 0 5379 }; 5380 5381 return cvtScaleAbsTab[depth]; 5382 } 5383 5384 BinaryFunc getConvertFunc(int sdepth, int ddepth) 5385 { 5386 static BinaryFunc cvtTab[][8] = 5387 { 5388 { 5389 (BinaryFunc)(cvt8u), (BinaryFunc)GET_OPTIMIZED(cvt8s8u), (BinaryFunc)GET_OPTIMIZED(cvt16u8u), 5390 (BinaryFunc)GET_OPTIMIZED(cvt16s8u), (BinaryFunc)GET_OPTIMIZED(cvt32s8u), (BinaryFunc)GET_OPTIMIZED(cvt32f8u), 5391 (BinaryFunc)GET_OPTIMIZED(cvt64f8u), 0 5392 }, 5393 { 5394 (BinaryFunc)GET_OPTIMIZED(cvt8u8s), (BinaryFunc)cvt8u, (BinaryFunc)GET_OPTIMIZED(cvt16u8s), 5395 (BinaryFunc)GET_OPTIMIZED(cvt16s8s), (BinaryFunc)GET_OPTIMIZED(cvt32s8s), (BinaryFunc)GET_OPTIMIZED(cvt32f8s), 5396 (BinaryFunc)GET_OPTIMIZED(cvt64f8s), 0 5397 }, 5398 { 5399 (BinaryFunc)GET_OPTIMIZED(cvt8u16u), (BinaryFunc)GET_OPTIMIZED(cvt8s16u), (BinaryFunc)cvt16u, 5400 (BinaryFunc)GET_OPTIMIZED(cvt16s16u), (BinaryFunc)GET_OPTIMIZED(cvt32s16u), (BinaryFunc)GET_OPTIMIZED(cvt32f16u), 5401 (BinaryFunc)GET_OPTIMIZED(cvt64f16u), 0 5402 }, 5403 { 5404 (BinaryFunc)GET_OPTIMIZED(cvt8u16s), (BinaryFunc)GET_OPTIMIZED(cvt8s16s), (BinaryFunc)GET_OPTIMIZED(cvt16u16s), 5405 (BinaryFunc)cvt16u, (BinaryFunc)GET_OPTIMIZED(cvt32s16s), (BinaryFunc)GET_OPTIMIZED(cvt32f16s), 5406 (BinaryFunc)GET_OPTIMIZED(cvt64f16s), 0 5407 }, 5408 { 5409 (BinaryFunc)GET_OPTIMIZED(cvt8u32s), (BinaryFunc)GET_OPTIMIZED(cvt8s32s), (BinaryFunc)GET_OPTIMIZED(cvt16u32s), 5410 (BinaryFunc)GET_OPTIMIZED(cvt16s32s), (BinaryFunc)cvt32s, (BinaryFunc)GET_OPTIMIZED(cvt32f32s), 5411 (BinaryFunc)GET_OPTIMIZED(cvt64f32s), 0 5412 }, 5413 { 5414 (BinaryFunc)GET_OPTIMIZED(cvt8u32f), (BinaryFunc)GET_OPTIMIZED(cvt8s32f), (BinaryFunc)GET_OPTIMIZED(cvt16u32f), 5415 (BinaryFunc)GET_OPTIMIZED(cvt16s32f), (BinaryFunc)GET_OPTIMIZED(cvt32s32f), (BinaryFunc)cvt32s, 5416 (BinaryFunc)GET_OPTIMIZED(cvt64f32f), 0 5417 }, 5418 { 5419 (BinaryFunc)GET_OPTIMIZED(cvt8u64f), (BinaryFunc)GET_OPTIMIZED(cvt8s64f), (BinaryFunc)GET_OPTIMIZED(cvt16u64f), 5420 (BinaryFunc)GET_OPTIMIZED(cvt16s64f), (BinaryFunc)GET_OPTIMIZED(cvt32s64f), (BinaryFunc)GET_OPTIMIZED(cvt32f64f), 5421 (BinaryFunc)(cvt64s), 0 5422 }, 5423 { 5424 0, 0, 0, 0, 0, 0, 0, 0 5425 } 5426 }; 5427 5428 return cvtTab[CV_MAT_DEPTH(ddepth)][CV_MAT_DEPTH(sdepth)]; 5429 } 5430 5431 static BinaryFunc getConvertScaleFunc(int sdepth, int ddepth) 5432 { 5433 static BinaryFunc cvtScaleTab[][8] = 5434 { 5435 { 5436 (BinaryFunc)GET_OPTIMIZED(cvtScale8u), (BinaryFunc)GET_OPTIMIZED(cvtScale8s8u), (BinaryFunc)GET_OPTIMIZED(cvtScale16u8u), 5437 (BinaryFunc)GET_OPTIMIZED(cvtScale16s8u), (BinaryFunc)GET_OPTIMIZED(cvtScale32s8u), (BinaryFunc)GET_OPTIMIZED(cvtScale32f8u), 5438 (BinaryFunc)cvtScale64f8u, 0 5439 }, 5440 { 5441 (BinaryFunc)GET_OPTIMIZED(cvtScale8u8s), (BinaryFunc)GET_OPTIMIZED(cvtScale8s), (BinaryFunc)GET_OPTIMIZED(cvtScale16u8s), 5442 (BinaryFunc)GET_OPTIMIZED(cvtScale16s8s), (BinaryFunc)GET_OPTIMIZED(cvtScale32s8s), (BinaryFunc)GET_OPTIMIZED(cvtScale32f8s), 5443 (BinaryFunc)cvtScale64f8s, 0 5444 }, 5445 { 5446 (BinaryFunc)GET_OPTIMIZED(cvtScale8u16u), (BinaryFunc)GET_OPTIMIZED(cvtScale8s16u), (BinaryFunc)GET_OPTIMIZED(cvtScale16u), 5447 (BinaryFunc)GET_OPTIMIZED(cvtScale16s16u), (BinaryFunc)GET_OPTIMIZED(cvtScale32s16u), (BinaryFunc)GET_OPTIMIZED(cvtScale32f16u), 5448 (BinaryFunc)cvtScale64f16u, 0 5449 }, 5450 { 5451 (BinaryFunc)GET_OPTIMIZED(cvtScale8u16s), (BinaryFunc)GET_OPTIMIZED(cvtScale8s16s), 
(BinaryFunc)GET_OPTIMIZED(cvtScale16u16s), 5452 (BinaryFunc)GET_OPTIMIZED(cvtScale16s), (BinaryFunc)GET_OPTIMIZED(cvtScale32s16s), (BinaryFunc)GET_OPTIMIZED(cvtScale32f16s), 5453 (BinaryFunc)cvtScale64f16s, 0 5454 }, 5455 { 5456 (BinaryFunc)GET_OPTIMIZED(cvtScale8u32s), (BinaryFunc)GET_OPTIMIZED(cvtScale8s32s), (BinaryFunc)GET_OPTIMIZED(cvtScale16u32s), 5457 (BinaryFunc)GET_OPTIMIZED(cvtScale16s32s), (BinaryFunc)GET_OPTIMIZED(cvtScale32s), (BinaryFunc)GET_OPTIMIZED(cvtScale32f32s), 5458 (BinaryFunc)cvtScale64f32s, 0 5459 }, 5460 { 5461 (BinaryFunc)GET_OPTIMIZED(cvtScale8u32f), (BinaryFunc)GET_OPTIMIZED(cvtScale8s32f), (BinaryFunc)GET_OPTIMIZED(cvtScale16u32f), 5462 (BinaryFunc)GET_OPTIMIZED(cvtScale16s32f), (BinaryFunc)GET_OPTIMIZED(cvtScale32s32f), (BinaryFunc)GET_OPTIMIZED(cvtScale32f), 5463 (BinaryFunc)cvtScale64f32f, 0 5464 }, 5465 { 5466 (BinaryFunc)cvtScale8u64f, (BinaryFunc)cvtScale8s64f, (BinaryFunc)cvtScale16u64f, 5467 (BinaryFunc)cvtScale16s64f, (BinaryFunc)cvtScale32s64f, (BinaryFunc)cvtScale32f64f, 5468 (BinaryFunc)cvtScale64f, 0 5469 }, 5470 { 5471 0, 0, 0, 0, 0, 0, 0, 0 5472 } 5473 }; 5474 5475 return cvtScaleTab[CV_MAT_DEPTH(ddepth)][CV_MAT_DEPTH(sdepth)]; 5476 } 5477 5478 #ifdef HAVE_OPENCL 5479 5480 static bool ocl_convertScaleAbs( InputArray _src, OutputArray _dst, double alpha, double beta ) 5481 { 5482 const ocl::Device & d = ocl::Device::getDefault(); 5483 5484 int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); 5485 bool doubleSupport = d.doubleFPConfig() > 0; 5486 if (!doubleSupport && depth == CV_64F) 5487 return false; 5488 5489 _dst.create(_src.size(), CV_8UC(cn)); 5490 int kercn = 1; 5491 if (d.isIntel()) 5492 { 5493 static const int vectorWidths[] = {4, 4, 4, 4, 4, 4, 4, -1}; 5494 kercn = ocl::checkOptimalVectorWidth( vectorWidths, _src, _dst, 5495 noArray(), noArray(), noArray(), 5496 noArray(), noArray(), noArray(), 5497 noArray(), ocl::OCL_VECTOR_MAX); 5498 } 5499 else 5500 kercn = ocl::predictOptimalVectorWidthMax(_src, _dst); 5501 5502 int rowsPerWI = d.isIntel() ? 4 : 1; 5503 char cvt[2][50]; 5504 int wdepth = std::max(depth, CV_32F); 5505 String build_opt = format("-D OP_CONVERT_SCALE_ABS -D UNARY_OP -D dstT=%s -D srcT1=%s" 5506 " -D workT=%s -D wdepth=%d -D convertToWT1=%s -D convertToDT=%s" 5507 " -D workT1=%s -D rowsPerWI=%d%s", 5508 ocl::typeToStr(CV_8UC(kercn)), 5509 ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)), 5510 ocl::typeToStr(CV_MAKE_TYPE(wdepth, kercn)), wdepth, 5511 ocl::convertTypeStr(depth, wdepth, kercn, cvt[0]), 5512 ocl::convertTypeStr(wdepth, CV_8U, kercn, cvt[1]), 5513 ocl::typeToStr(wdepth), rowsPerWI, 5514 doubleSupport ? 
" -D DOUBLE_SUPPORT" : ""); 5515 ocl::Kernel k("KF", ocl::core::arithm_oclsrc, build_opt); 5516 if (k.empty()) 5517 return false; 5518 5519 UMat src = _src.getUMat(); 5520 UMat dst = _dst.getUMat(); 5521 5522 ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src), 5523 dstarg = ocl::KernelArg::WriteOnly(dst, cn, kercn); 5524 5525 if (wdepth == CV_32F) 5526 k.args(srcarg, dstarg, (float)alpha, (float)beta); 5527 else if (wdepth == CV_64F) 5528 k.args(srcarg, dstarg, alpha, beta); 5529 5530 size_t globalsize[2] = { src.cols * cn / kercn, (src.rows + rowsPerWI - 1) / rowsPerWI }; 5531 return k.run(2, globalsize, NULL, false); 5532 } 5533 5534 #endif 5535 5536 } 5537 5538 void cv::convertScaleAbs( InputArray _src, OutputArray _dst, double alpha, double beta ) 5539 { 5540 CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(), 5541 ocl_convertScaleAbs(_src, _dst, alpha, beta)) 5542 5543 Mat src = _src.getMat(); 5544 int cn = src.channels(); 5545 double scale[] = {alpha, beta}; 5546 _dst.create( src.dims, src.size, CV_8UC(cn) ); 5547 Mat dst = _dst.getMat(); 5548 BinaryFunc func = getCvtScaleAbsFunc(src.depth()); 5549 CV_Assert( func != 0 ); 5550 5551 if( src.dims <= 2 ) 5552 { 5553 Size sz = getContinuousSize(src, dst, cn); 5554 func( src.ptr(), src.step, 0, 0, dst.ptr(), dst.step, sz, scale ); 5555 } 5556 else 5557 { 5558 const Mat* arrays[] = {&src, &dst, 0}; 5559 uchar* ptrs[2]; 5560 NAryMatIterator it(arrays, ptrs); 5561 Size sz((int)it.size*cn, 1); 5562 5563 for( size_t i = 0; i < it.nplanes; i++, ++it ) 5564 func( ptrs[0], 0, 0, 0, ptrs[1], 0, sz, scale ); 5565 } 5566 } 5567 5568 void cv::Mat::convertTo(OutputArray _dst, int _type, double alpha, double beta) const 5569 { 5570 bool noScale = fabs(alpha-1) < DBL_EPSILON && fabs(beta) < DBL_EPSILON; 5571 5572 if( _type < 0 ) 5573 _type = _dst.fixedType() ? _dst.type() : type(); 5574 else 5575 _type = CV_MAKETYPE(CV_MAT_DEPTH(_type), channels()); 5576 5577 int sdepth = depth(), ddepth = CV_MAT_DEPTH(_type); 5578 if( sdepth == ddepth && noScale ) 5579 { 5580 copyTo(_dst); 5581 return; 5582 } 5583 5584 Mat src = *this; 5585 5586 BinaryFunc func = noScale ? 
void cv::Mat::convertTo(OutputArray _dst, int _type, double alpha, double beta) const
{
    bool noScale = fabs(alpha-1) < DBL_EPSILON && fabs(beta) < DBL_EPSILON;

    if( _type < 0 )
        _type = _dst.fixedType() ? _dst.type() : type();
    else
        _type = CV_MAKETYPE(CV_MAT_DEPTH(_type), channels());

    int sdepth = depth(), ddepth = CV_MAT_DEPTH(_type);
    if( sdepth == ddepth && noScale )
    {
        copyTo(_dst);
        return;
    }

    Mat src = *this;

    BinaryFunc func = noScale ? getConvertFunc(sdepth, ddepth) : getConvertScaleFunc(sdepth, ddepth);
    double scale[] = {alpha, beta};
    int cn = channels();
    CV_Assert( func != 0 );

    if( dims <= 2 )
    {
        _dst.create( size(), _type );
        Mat dst = _dst.getMat();
        Size sz = getContinuousSize(src, dst, cn);
        func( src.data, src.step, 0, 0, dst.data, dst.step, sz, scale );
    }
    else
    {
        _dst.create( dims, size, _type );
        Mat dst = _dst.getMat();
        const Mat* arrays[] = {&src, &dst, 0};
        uchar* ptrs[2];
        NAryMatIterator it(arrays, ptrs);
        Size sz((int)(it.size*cn), 1);

        for( size_t i = 0; i < it.nplanes; i++, ++it )
            func(ptrs[0], 1, 0, 0, ptrs[1], 1, sz, scale);
    }
}
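// Illustrative usage sketch for Mat::convertTo, kept out of the build with #if 0
// (the matrix name is hypothetical). convertTo picks getConvertFunc when
// alpha == 1 and beta == 0, getConvertScaleFunc otherwise, and degenerates to a
// plain copyTo when both the depth and the scaling are unchanged. The channel
// count is always preserved; only the depth of the requested type is used.
#if 0
static void convertToExample()
{
    cv::Mat m(4, 4, CV_8UC3), f;
    m.convertTo(f, CV_32F, 1.0/255);   // f becomes CV_32FC3 with values in [0, 1]
}
#endif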
/****************************************************************************************\
*                                    LUT Transform                                       *
\****************************************************************************************/

namespace cv
{

template<typename T> static void
LUT8u_( const uchar* src, const T* lut, T* dst, int len, int cn, int lutcn )
{
    if( lutcn == 1 )
    {
        for( int i = 0; i < len*cn; i++ )
            dst[i] = lut[src[i]];
    }
    else
    {
        for( int i = 0; i < len*cn; i += cn )
            for( int k = 0; k < cn; k++ )
                dst[i+k] = lut[src[i+k]*cn+k];
    }
}

static void LUT8u_8u( const uchar* src, const uchar* lut, uchar* dst, int len, int cn, int lutcn )
{
    LUT8u_( src, lut, dst, len, cn, lutcn );
}

static void LUT8u_8s( const uchar* src, const schar* lut, schar* dst, int len, int cn, int lutcn )
{
    LUT8u_( src, lut, dst, len, cn, lutcn );
}

static void LUT8u_16u( const uchar* src, const ushort* lut, ushort* dst, int len, int cn, int lutcn )
{
    LUT8u_( src, lut, dst, len, cn, lutcn );
}

static void LUT8u_16s( const uchar* src, const short* lut, short* dst, int len, int cn, int lutcn )
{
    LUT8u_( src, lut, dst, len, cn, lutcn );
}

static void LUT8u_32s( const uchar* src, const int* lut, int* dst, int len, int cn, int lutcn )
{
    LUT8u_( src, lut, dst, len, cn, lutcn );
}

static void LUT8u_32f( const uchar* src, const float* lut, float* dst, int len, int cn, int lutcn )
{
    LUT8u_( src, lut, dst, len, cn, lutcn );
}

static void LUT8u_64f( const uchar* src, const double* lut, double* dst, int len, int cn, int lutcn )
{
    LUT8u_( src, lut, dst, len, cn, lutcn );
}

typedef void (*LUTFunc)( const uchar* src, const uchar* lut, uchar* dst, int len, int cn, int lutcn );

static LUTFunc lutTab[] =
{
    (LUTFunc)LUT8u_8u, (LUTFunc)LUT8u_8s, (LUTFunc)LUT8u_16u, (LUTFunc)LUT8u_16s,
    (LUTFunc)LUT8u_32s, (LUTFunc)LUT8u_32f, (LUTFunc)LUT8u_64f, 0
};
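// lutTab is indexed by the depth of the lookup table (CV_8U ... CV_64F); the source is
// always 8-bit. With a single-channel table the same 256 entries are applied to every
// channel (dst[i] = lut[src[i]]). With a multi-channel table the entries are interleaved
// per channel (dst[i+k] = lut[src[i+k]*cn + k]), e.g. for cn == 3 a source value v in
// channel 0 reads lut[3*v + 0], channel 1 reads lut[3*v + 1], channel 2 reads lut[3*v + 2].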
#ifdef HAVE_OPENCL

static bool ocl_LUT(InputArray _src, InputArray _lut, OutputArray _dst)
{
    int lcn = _lut.channels(), dcn = _src.channels(), ddepth = _lut.depth();

    UMat src = _src.getUMat(), lut = _lut.getUMat();
    _dst.create(src.size(), CV_MAKETYPE(ddepth, dcn));
    UMat dst = _dst.getUMat();
    int kercn = lcn == 1 ? std::min(4, ocl::predictOptimalVectorWidth(_src, _dst)) : dcn;

    ocl::Kernel k("LUT", ocl::core::lut_oclsrc,
                  format("-D dcn=%d -D lcn=%d -D srcT=%s -D dstT=%s", kercn, lcn,
                         ocl::typeToStr(src.depth()), ocl::memopTypeToStr(ddepth)));
    if (k.empty())
        return false;

    k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::ReadOnlyNoSize(lut),
           ocl::KernelArg::WriteOnly(dst, dcn, kercn));

    size_t globalSize[2] = { dst.cols * dcn / kercn, (dst.rows + 3) / 4 };
    return k.run(2, globalSize, NULL, false);
}

#endif

#if defined(HAVE_IPP)
namespace ipp {

#if 0 // there are no performance benefits (PR #2653)
class IppLUTParallelBody_LUTC1 : public ParallelLoopBody
{
public:
    bool* ok;
    const Mat& src_;
    const Mat& lut_;
    Mat& dst_;

    typedef IppStatus (*IppFn)(const Ipp8u* pSrc, int srcStep, void* pDst, int dstStep,
                               IppiSize roiSize, const void* pTable, int nBitSize);
    IppFn fn;

    int width;

    IppLUTParallelBody_LUTC1(const Mat& src, const Mat& lut, Mat& dst, bool* _ok)
        : ok(_ok), src_(src), lut_(lut), dst_(dst)
    {
        width = dst.cols * dst.channels();

        size_t elemSize1 = CV_ELEM_SIZE1(dst.depth());

        fn =
            elemSize1 == 1 ? (IppFn)ippiLUTPalette_8u_C1R :
            elemSize1 == 4 ? (IppFn)ippiLUTPalette_8u32u_C1R :
            NULL;

        *ok = (fn != NULL);
    }

    void operator()( const cv::Range& range ) const
    {
        if (!*ok)
            return;

        const int row0 = range.start;
        const int row1 = range.end;

        Mat src = src_.rowRange(row0, row1);
        Mat dst = dst_.rowRange(row0, row1);

        IppiSize sz = { width, dst.rows };

        CV_DbgAssert(fn != NULL);
        if (fn(src.data, (int)src.step[0], dst.data, (int)dst.step[0], sz, lut_.data, 8) < 0)
        {
            setIppErrorStatus();
            *ok = false;
        }
        CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
    }
private:
    IppLUTParallelBody_LUTC1(const IppLUTParallelBody_LUTC1&);
    IppLUTParallelBody_LUTC1& operator=(const IppLUTParallelBody_LUTC1&);
};
#endif

class IppLUTParallelBody_LUTCN : public ParallelLoopBody
{
public:
    bool *ok;
    const Mat& src_;
    const Mat& lut_;
    Mat& dst_;

    int lutcn;

    uchar* lutBuffer;
    uchar* lutTable[4];

    IppLUTParallelBody_LUTCN(const Mat& src, const Mat& lut, Mat& dst, bool* _ok)
        : ok(_ok), src_(src), lut_(lut), dst_(dst), lutBuffer(NULL)
    {
        lutcn = lut.channels();
        IppiSize sz256 = {256, 1};

        size_t elemSize1 = dst.elemSize1();
        CV_DbgAssert(elemSize1 == 1);
        lutBuffer = (uchar*)ippMalloc(256 * (int)elemSize1 * 4);
        lutTable[0] = lutBuffer + 0;
        lutTable[1] = lutBuffer + 1 * 256 * elemSize1;
        lutTable[2] = lutBuffer + 2 * 256 * elemSize1;
        lutTable[3] = lutBuffer + 3 * 256 * elemSize1;

        CV_DbgAssert(lutcn == 3 || lutcn == 4);
        if (lutcn == 3)
        {
            IppStatus status = ippiCopy_8u_C3P3R(lut.ptr(), (int)lut.step[0], lutTable, (int)lut.step[0], sz256);
            if (status < 0)
            {
                setIppErrorStatus();
                return;
            }
            CV_IMPL_ADD(CV_IMPL_IPP);
        }
        else if (lutcn == 4)
        {
            IppStatus status = ippiCopy_8u_C4P4R(lut.ptr(), (int)lut.step[0], lutTable, (int)lut.step[0], sz256);
            if (status < 0)
            {
                setIppErrorStatus();
                return;
            }
            CV_IMPL_ADD(CV_IMPL_IPP);
        }

        *ok = true;
    }

    ~IppLUTParallelBody_LUTCN()
    {
        if (lutBuffer != NULL)
            ippFree(lutBuffer);
        lutBuffer = NULL;
        lutTable[0] = NULL;
    }

    void operator()( const cv::Range& range ) const
    {
        if (!*ok)
            return;

        const int row0 = range.start;
        const int row1 = range.end;

        Mat src = src_.rowRange(row0, row1);
        Mat dst = dst_.rowRange(row0, row1);

        if (lutcn == 3)
        {
            if (ippiLUTPalette_8u_C3R(
                    src.ptr(), (int)src.step[0], dst.ptr(), (int)dst.step[0],
                    ippiSize(dst.size()), lutTable, 8) >= 0)
            {
                CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
                return;
            }
        }
        else if (lutcn == 4)
        {
            if (ippiLUTPalette_8u_C4R(
                    src.ptr(), (int)src.step[0], dst.ptr(), (int)dst.step[0],
                    ippiSize(dst.size()), lutTable, 8) >= 0)
            {
                CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
                return;
            }
        }
        setIppErrorStatus();
        *ok = false;
    }
private:
    IppLUTParallelBody_LUTCN(const IppLUTParallelBody_LUTCN&);
    IppLUTParallelBody_LUTCN& operator=(const IppLUTParallelBody_LUTCN&);
};
} // namespace ipp
#endif // IPP

class LUTParallelBody : public ParallelLoopBody
{
public:
    bool* ok;
    const Mat& src_;
    const Mat& lut_;
    Mat& dst_;

    LUTFunc func;

    LUTParallelBody(const Mat& src, const Mat& lut, Mat& dst, bool* _ok)
        : ok(_ok), src_(src), lut_(lut), dst_(dst)
    {
        func = lutTab[lut.depth()];
        *ok = (func != NULL);
    }

    void operator()( const cv::Range& range ) const
    {
        CV_DbgAssert(*ok);

        const int row0 = range.start;
        const int row1 = range.end;

        Mat src = src_.rowRange(row0, row1);
        Mat dst = dst_.rowRange(row0, row1);

        int cn = src.channels();
        int lutcn = lut_.channels();

        const Mat* arrays[] = {&src, &dst, 0};
        uchar* ptrs[2];
        NAryMatIterator it(arrays, ptrs);
        int len = (int)it.size;

        for( size_t i = 0; i < it.nplanes; i++, ++it )
            func(ptrs[0], lut_.ptr(), ptrs[1], len, cn, lutcn);
    }
private:
    LUTParallelBody(const LUTParallelBody&);
    LUTParallelBody& operator=(const LUTParallelBody&);
};

}
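// cv::LUT below tries the available implementations in this order: the OpenCL kernel
// when the destination is a UMat with dims() <= 2, then (when IPP is available) the
// IppLUTParallelBody_LUTCN path for 8-bit 3- or 4-channel tables, then the generic
// LUTParallelBody, and finally a single-threaded NAryMatIterator loop as the fallback.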
void cv::LUT( InputArray _src, InputArray _lut, OutputArray _dst )
{
    int cn = _src.channels(), depth = _src.depth();
    int lutcn = _lut.channels();

    CV_Assert( (lutcn == cn || lutcn == 1) &&
        _lut.total() == 256 && _lut.isContinuous() &&
        (depth == CV_8U || depth == CV_8S) );

    CV_OCL_RUN(_dst.isUMat() && _src.dims() <= 2,
               ocl_LUT(_src, _lut, _dst))

    Mat src = _src.getMat(), lut = _lut.getMat();
    _dst.create(src.dims, src.size, CV_MAKETYPE(_lut.depth(), cn));
    Mat dst = _dst.getMat();

    if (_src.dims() <= 2)
    {
        bool ok = false;
        Ptr<ParallelLoopBody> body;
#if defined(HAVE_IPP)
        CV_IPP_CHECK()
        {
            size_t elemSize1 = CV_ELEM_SIZE1(dst.depth());
#if 0 // there are no performance benefits (PR #2653)
            if (lutcn == 1)
            {
                ParallelLoopBody* p = new ipp::IppLUTParallelBody_LUTC1(src, lut, dst, &ok);
                body.reset(p);
            }
            else
#endif
            if ((lutcn == 3 || lutcn == 4) && elemSize1 == 1)
            {
                ParallelLoopBody* p = new ipp::IppLUTParallelBody_LUTCN(src, lut, dst, &ok);
                body.reset(p);
            }
        }
#endif
        if (body == NULL || ok == false)
        {
            ok = false;
            ParallelLoopBody* p = new LUTParallelBody(src, lut, dst, &ok);
            body.reset(p);
        }
        if (body != NULL && ok)
        {
            Range all(0, dst.rows);
            if (dst.total()>>18)
                parallel_for_(all, *body, (double)std::max((size_t)1, dst.total()>>16));
            else
                (*body)(all);
            if (ok)
                return;
        }
    }

    LUTFunc func = lutTab[lut.depth()];
    CV_Assert( func != 0 );

    const Mat* arrays[] = {&src, &dst, 0};
    uchar* ptrs[2];
    NAryMatIterator it(arrays, ptrs);
    int len = (int)it.size;

    for( size_t i = 0; i < it.nplanes; i++, ++it )
        func(ptrs[0], lut.ptr(), ptrs[1], len, cn, lutcn);
}
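// Illustrative usage sketch for cv::LUT, kept out of the build with #if 0 (the gamma
// value and the matrix names are hypothetical): a classic 8-bit gamma correction
// expressed as a 256-entry table. A single-channel table is applied to every channel
// of the source, per the lutcn == cn || lutcn == 1 assertion above.
#if 0
static void lutExample()
{
    cv::Mat lut(1, 256, CV_8U);
    for (int i = 0; i < 256; i++)
        lut.at<uchar>(i) = cv::saturate_cast<uchar>(std::pow(i / 255.0, 0.5) * 255.0);

    cv::Mat src(480, 640, CV_8UC3), dst;
    cv::LUT(src, lut, dst);   // dst is CV_8UC3; the same table is used for all channels
}
#endif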
" -D HAVE_DELTA" : "", 6028 ocl::typeToStr(sdepth), ocl::typeToStr(ddepth)); 6029 6030 ocl::Kernel k("normalizek", ocl::core::normalize_oclsrc, opts); 6031 if (k.empty()) 6032 return false; 6033 6034 UMat mask = _mask.getUMat(), dst = _dst.getUMat(); 6035 6036 ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src), 6037 maskarg = ocl::KernelArg::ReadOnlyNoSize(mask), 6038 dstarg = ocl::KernelArg::ReadWrite(dst); 6039 6040 if (haveScale) 6041 { 6042 if (haveDelta) 6043 k.args(srcarg, maskarg, dstarg, fscale, fdelta); 6044 else 6045 k.args(srcarg, maskarg, dstarg, fscale); 6046 } 6047 else 6048 { 6049 if (haveDelta) 6050 k.args(srcarg, maskarg, dstarg, fdelta); 6051 else 6052 k.args(srcarg, maskarg, dstarg); 6053 } 6054 6055 size_t globalsize[2] = { src.cols, (src.rows + rowsPerWI - 1) / rowsPerWI }; 6056 return k.run(2, globalsize, NULL, false); 6057 } 6058 else 6059 { 6060 UMat temp; 6061 src.convertTo( temp, dtype, scale, delta ); 6062 temp.copyTo( _dst, _mask ); 6063 } 6064 6065 return true; 6066 } 6067 6068 #endif 6069 6070 } 6071 6072 void cv::normalize( InputArray _src, InputOutputArray _dst, double a, double b, 6073 int norm_type, int rtype, InputArray _mask ) 6074 { 6075 double scale = 1, shift = 0; 6076 if( norm_type == CV_MINMAX ) 6077 { 6078 double smin = 0, smax = 0; 6079 double dmin = MIN( a, b ), dmax = MAX( a, b ); 6080 minMaxLoc( _src, &smin, &smax, 0, 0, _mask ); 6081 scale = (dmax - dmin)*(smax - smin > DBL_EPSILON ? 1./(smax - smin) : 0); 6082 shift = dmin - smin*scale; 6083 } 6084 else if( norm_type == CV_L2 || norm_type == CV_L1 || norm_type == CV_C ) 6085 { 6086 scale = norm( _src, norm_type, _mask ); 6087 scale = scale > DBL_EPSILON ? a/scale : 0.; 6088 shift = 0; 6089 } 6090 else 6091 CV_Error( CV_StsBadArg, "Unknown/unsupported norm type" ); 6092 6093 int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); 6094 if( rtype < 0 ) 6095 rtype = _dst.fixedType() ? 
void cv::normalize( InputArray _src, InputOutputArray _dst, double a, double b,
                    int norm_type, int rtype, InputArray _mask )
{
    double scale = 1, shift = 0;
    if( norm_type == CV_MINMAX )
    {
        double smin = 0, smax = 0;
        double dmin = MIN( a, b ), dmax = MAX( a, b );
        minMaxLoc( _src, &smin, &smax, 0, 0, _mask );
        scale = (dmax - dmin)*(smax - smin > DBL_EPSILON ? 1./(smax - smin) : 0);
        shift = dmin - smin*scale;
    }
    else if( norm_type == CV_L2 || norm_type == CV_L1 || norm_type == CV_C )
    {
        scale = norm( _src, norm_type, _mask );
        scale = scale > DBL_EPSILON ? a/scale : 0.;
        shift = 0;
    }
    else
        CV_Error( CV_StsBadArg, "Unknown/unsupported norm type" );

    int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
    if( rtype < 0 )
        rtype = _dst.fixedType() ? _dst.depth() : depth;
    _dst.createSameSize(_src, CV_MAKETYPE(rtype, cn));

    CV_OCL_RUN(_dst.isUMat(),
               ocl_normalize(_src, _dst, _mask, rtype, scale, shift))

    Mat src = _src.getMat(), dst = _dst.getMat();
    if( _mask.empty() )
        src.convertTo( dst, rtype, scale, shift );
    else
    {
        Mat temp;
        src.convertTo( temp, rtype, scale, shift );
        temp.copyTo( dst, _mask );
    }
}
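// Illustrative usage sketch for cv::normalize, kept out of the build with #if 0
// (the matrix names are hypothetical): stretch a floating-point image to the full
// 8-bit range for visualization.
#if 0
static void normalizeExample()
{
    cv::Mat gray(480, 640, CV_32F), vis;
    cv::normalize(gray, vis, 0, 255, cv::NORM_MINMAX, CV_8U);   // vis is CV_8UC1 in [0, 255]
}
#endif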
CV_IMPL void
cvSplit( const void* srcarr, void* dstarr0, void* dstarr1, void* dstarr2, void* dstarr3 )
{
    void* dptrs[] = { dstarr0, dstarr1, dstarr2, dstarr3 };
    cv::Mat src = cv::cvarrToMat(srcarr);
    int i, j, nz = 0;
    for( i = 0; i < 4; i++ )
        nz += dptrs[i] != 0;
    CV_Assert( nz > 0 );
    std::vector<cv::Mat> dvec(nz);
    std::vector<int> pairs(nz*2);

    for( i = j = 0; i < 4; i++ )
    {
        if( dptrs[i] != 0 )
        {
            dvec[j] = cv::cvarrToMat(dptrs[i]);
            CV_Assert( dvec[j].size() == src.size() );
            CV_Assert( dvec[j].depth() == src.depth() );
            CV_Assert( dvec[j].channels() == 1 );
            CV_Assert( i < src.channels() );
            pairs[j*2] = i;
            pairs[j*2+1] = j;
            j++;
        }
    }
    if( nz == src.channels() )
        cv::split( src, dvec );
    else
    {
        cv::mixChannels( &src, 1, &dvec[0], nz, &pairs[0], nz );
    }
}


CV_IMPL void
cvMerge( const void* srcarr0, const void* srcarr1, const void* srcarr2,
         const void* srcarr3, void* dstarr )
{
    const void* sptrs[] = { srcarr0, srcarr1, srcarr2, srcarr3 };
    cv::Mat dst = cv::cvarrToMat(dstarr);
    int i, j, nz = 0;
    for( i = 0; i < 4; i++ )
        nz += sptrs[i] != 0;
    CV_Assert( nz > 0 );
    std::vector<cv::Mat> svec(nz);
    std::vector<int> pairs(nz*2);

    for( i = j = 0; i < 4; i++ )
    {
        if( sptrs[i] != 0 )
        {
            svec[j] = cv::cvarrToMat(sptrs[i]);
            CV_Assert( svec[j].size == dst.size &&
                       svec[j].depth() == dst.depth() &&
                       svec[j].channels() == 1 && i < dst.channels() );
            pairs[j*2] = j;
            pairs[j*2+1] = i;
            j++;
        }
    }

    if( nz == dst.channels() )
        cv::merge( svec, dst );
    else
    {
        cv::mixChannels( &svec[0], nz, &dst, 1, &pairs[0], nz );
    }
}


CV_IMPL void
cvMixChannels( const CvArr** src, int src_count,
               CvArr** dst, int dst_count,
               const int* from_to, int pair_count )
{
    cv::AutoBuffer<cv::Mat> buf(src_count + dst_count);

    int i;
    for( i = 0; i < src_count; i++ )
        buf[i] = cv::cvarrToMat(src[i]);
    for( i = 0; i < dst_count; i++ )
        buf[i+src_count] = cv::cvarrToMat(dst[i]);
    cv::mixChannels(&buf[0], src_count, &buf[src_count], dst_count, from_to, pair_count);
}

CV_IMPL void
cvConvertScaleAbs( const void* srcarr, void* dstarr,
                   double scale, double shift )
{
    cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr);
    CV_Assert( src.size == dst.size && dst.type() == CV_8UC(src.channels()));
    cv::convertScaleAbs( src, dst, scale, shift );
}

CV_IMPL void
cvConvertScale( const void* srcarr, void* dstarr,
                double scale, double shift )
{
    cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr);

    CV_Assert( src.size == dst.size && src.channels() == dst.channels() );
    src.convertTo(dst, dst.type(), scale, shift);
}

CV_IMPL void cvLUT( const void* srcarr, void* dstarr, const void* lutarr )
{
    cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr), lut = cv::cvarrToMat(lutarr);

    CV_Assert( dst.size() == src.size() && dst.type() == CV_MAKETYPE(lut.depth(), src.channels()) );
    cv::LUT( src, lut, dst );
}

CV_IMPL void cvNormalize( const CvArr* srcarr, CvArr* dstarr,
                          double a, double b, int norm_type, const CvArr* maskarr )
{
    cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr), mask;
    if( maskarr )
        mask = cv::cvarrToMat(maskarr);
    CV_Assert( dst.size() == src.size() && src.channels() == dst.channels() );
    cv::normalize( src, dst, a, b, norm_type, dst.type(), mask );
}

/* End of file. */