1 /*M/////////////////////////////////////////////////////////////////////////////////////// 2 // 3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. 4 // 5 // By downloading, copying, installing or using the software you agree to this license. 6 // If you do not agree to this license, do not download, install, 7 // copy or use the software. 8 // 9 // 10 // License Agreement 11 // For Open Source Computer Vision Library 12 // 13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. 14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved. 15 // Copyright (C) 2013, OpenCV Foundation, all rights reserved. 16 // Copyright (C) 2015, Itseez Inc., all rights reserved. 17 // Third party copyrights are property of their respective owners. 18 // 19 // Redistribution and use in source and binary forms, with or without modification, 20 // are permitted provided that the following conditions are met: 21 // 22 // * Redistribution's of source code must retain the above copyright notice, 23 // this list of conditions and the following disclaimer. 24 // 25 // * Redistribution's in binary form must reproduce the above copyright notice, 26 // this list of conditions and the following disclaimer in the documentation 27 // and/or other materials provided with the distribution. 28 // 29 // * The name of the copyright holders may not be used to endorse or promote products 30 // derived from this software without specific prior written permission. 31 // 32 // This software is provided by the copyright holders and contributors "as is" and 33 // any express or implied warranties, including, but not limited to, the implied 34 // warranties of merchantability and fitness for a particular purpose are disclaimed. 35 // In no event shall the Intel Corporation or contributors be liable for any direct, 36 // indirect, incidental, special, exemplary, or consequential damages 37 // (including, but not limited to, procurement of substitute goods or services; 38 // loss of use, data, or profits; or business interruption) however caused 39 // and on any theory of liability, whether in contract, strict liability, 40 // or tort (including negligence or otherwise) arising in any way out of 41 // the use of this software, even if advised of the possibility of such damage. 42 // 43 //M*/ 44 45 #ifndef __OPENCV_HAL_INTRIN_CPP_HPP__ 46 #define __OPENCV_HAL_INTRIN_CPP_HPP__ 47 48 namespace cv 49 { 50 51 template<typename _Tp, int n> struct v_reg 52 { 53 typedef _Tp lane_type; 54 typedef v_reg<typename V_TypeTraits<_Tp>::int_type, n> int_vec; 55 typedef v_reg<typename V_TypeTraits<_Tp>::abs_type, n> abs_vec; 56 enum { nlanes = n }; 57 58 explicit v_reg(const _Tp* ptr) { for( int i = 0; i < n; i++ ) s[i] = ptr[i]; } 59 v_reg(_Tp s0, _Tp s1) { s[0] = s0; s[1] = s1; } 60 v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3) { s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3; } 61 v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3, 62 _Tp s4, _Tp s5, _Tp s6, _Tp s7) 63 { 64 s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3; 65 s[4] = s4; s[5] = s5; s[6] = s6; s[7] = s7; 66 } 67 v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3, 68 _Tp s4, _Tp s5, _Tp s6, _Tp s7, 69 _Tp s8, _Tp s9, _Tp s10, _Tp s11, 70 _Tp s12, _Tp s13, _Tp s14, _Tp s15) 71 { 72 s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3; 73 s[4] = s4; s[5] = s5; s[6] = s6; s[7] = s7; 74 s[8] = s8; s[9] = s9; s[10] = s10; s[11] = s11; 75 s[12] = s12; s[13] = s13; s[14] = s14; s[15] = s15; 76 } 77 78 v_reg() {} 79 v_reg(const v_reg<_Tp, n> & r) 80 { 81 for( int i = 0; i < n; i++ ) 82 s[i] = r.s[i]; 83 } 84 85 _Tp get(const int i) const { return s[i]; } 86 _Tp get0() const { return s[0]; } 87 v_reg<_Tp, n> high() const 88 { 89 v_reg<_Tp, n> c; 90 int i; 91 for( i = 0; i < n/2; i++ ) 92 { 93 c.s[i] = s[i+(n/2)]; 94 c.s[i+(n/2)] = 0; 95 } 96 return c; 97 } 98 99 static v_reg<_Tp, n> zero() 100 { 101 v_reg<_Tp, n> c; 102 for( int i = 0; i < n; i++ ) 103 c.s[i] = (_Tp)0; 104 return c; 105 } 106 107 static v_reg<_Tp, n> all(_Tp s) 108 { 109 v_reg<_Tp, n> c; 110 for( int i = 0; i < n; i++ ) 111 c.s[i] = s; 112 return c; 113 } 114 115 template<typename _Tp2, int n2> v_reg<_Tp2, n2> reinterpret_as() const 116 { 117 size_t bytes = std::min(sizeof(_Tp2)*n2, sizeof(_Tp)*n); 118 v_reg<_Tp2, n2> c; 119 memcpy(&c.s[0], &s[0], bytes); 120 return c; 121 } 122 123 _Tp s[n]; 124 }; 125 126 #define OPENCV_HAL_IMPL_BIN_OP(bin_op) \ 127 template<typename _Tp, int n> inline v_reg<_Tp, n> \ 128 operator bin_op (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ 129 { \ 130 v_reg<_Tp, n> c; \ 131 for( int i = 0; i < n; i++ ) \ 132 c.s[i] = saturate_cast<_Tp>(a.s[i] bin_op b.s[i]); \ 133 return c; \ 134 } \ 135 template<typename _Tp, int n> inline v_reg<_Tp, n>& \ 136 operator bin_op##= (v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ 137 { \ 138 for( int i = 0; i < n; i++ ) \ 139 a.s[i] = saturate_cast<_Tp>(a.s[i] bin_op b.s[i]); \ 140 return a; \ 141 } 142 143 OPENCV_HAL_IMPL_BIN_OP(+) 144 OPENCV_HAL_IMPL_BIN_OP(-) 145 OPENCV_HAL_IMPL_BIN_OP(*) 146 OPENCV_HAL_IMPL_BIN_OP(/) 147 148 #define OPENCV_HAL_IMPL_BIT_OP(bit_op) \ 149 template<typename _Tp, int n> inline v_reg<_Tp, n> operator bit_op \ 150 (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ 151 { \ 152 v_reg<_Tp, n> c; \ 153 typedef typename V_TypeTraits<_Tp>::int_type itype; \ 154 for( int i = 0; i < n; i++ ) \ 155 c.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int((itype)(V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) bit_op \ 156 V_TypeTraits<_Tp>::reinterpret_int(b.s[i]))); \ 157 return c; \ 158 } \ 159 template<typename _Tp, int n> inline v_reg<_Tp, n>& operator \ 160 bit_op##= (v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ 161 { \ 162 typedef typename V_TypeTraits<_Tp>::int_type itype; \ 163 for( int i = 0; i < n; i++ ) \ 164 a.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int((itype)(V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) bit_op \ 165 V_TypeTraits<_Tp>::reinterpret_int(b.s[i]))); \ 166 return a; \ 167 } 168 169 OPENCV_HAL_IMPL_BIT_OP(&) 170 OPENCV_HAL_IMPL_BIT_OP(|) 171 OPENCV_HAL_IMPL_BIT_OP(^) 172 173 template<typename _Tp, int n> inline v_reg<_Tp, n> operator ~ (const v_reg<_Tp, n>& a) 174 { 175 v_reg<_Tp, n> c; 176 for( int i = 0; i < n; i++ ) 177 c.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int(~V_TypeTraits<_Tp>::reinterpret_int(a.s[i])); 178 return c; 179 } 180 181 #define OPENCV_HAL_IMPL_MATH_FUNC(func, cfunc, _Tp2) \ 182 template<typename _Tp, int n> inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a) \ 183 { \ 184 v_reg<_Tp2, n> c; \ 185 for( int i = 0; i < n; i++ ) \ 186 c.s[i] = cfunc(a.s[i]); \ 187 return c; \ 188 } 189 190 OPENCV_HAL_IMPL_MATH_FUNC(v_sqrt, std::sqrt, _Tp) 191 OPENCV_HAL_IMPL_MATH_FUNC(v_sin, std::sin, _Tp) 192 OPENCV_HAL_IMPL_MATH_FUNC(v_cos, std::cos, _Tp) 193 OPENCV_HAL_IMPL_MATH_FUNC(v_exp, std::exp, _Tp) 194 OPENCV_HAL_IMPL_MATH_FUNC(v_log, std::log, _Tp) 195 OPENCV_HAL_IMPL_MATH_FUNC(v_abs, (typename V_TypeTraits<_Tp>::abs_type)std::abs, 196 typename V_TypeTraits<_Tp>::abs_type) 197 OPENCV_HAL_IMPL_MATH_FUNC(v_round, cvRound, int) 198 OPENCV_HAL_IMPL_MATH_FUNC(v_floor, cvFloor, int) 199 OPENCV_HAL_IMPL_MATH_FUNC(v_ceil, cvCeil, int) 200 OPENCV_HAL_IMPL_MATH_FUNC(v_trunc, int, int) 201 202 #define OPENCV_HAL_IMPL_MINMAX_FUNC(func, hfunc, cfunc) \ 203 template<typename _Tp, int n> inline v_reg<_Tp, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ 204 { \ 205 v_reg<_Tp, n> c; \ 206 for( int i = 0; i < n; i++ ) \ 207 c.s[i] = cfunc(a.s[i], b.s[i]); \ 208 return c; \ 209 } \ 210 template<typename _Tp, int n> inline _Tp hfunc(const v_reg<_Tp, n>& a) \ 211 { \ 212 _Tp c = a.s[0]; \ 213 for( int i = 1; i < n; i++ ) \ 214 c = cfunc(c, a.s[i]); \ 215 return c; \ 216 } 217 218 OPENCV_HAL_IMPL_MINMAX_FUNC(v_min, v_reduce_min, std::min) 219 OPENCV_HAL_IMPL_MINMAX_FUNC(v_max, v_reduce_max, std::max) 220 221 template<typename _Tp, int n> 222 inline void v_minmax( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, 223 v_reg<_Tp, n>& minval, v_reg<_Tp, n>& maxval ) 224 { 225 for( int i = 0; i < n; i++ ) 226 { 227 minval.s[i] = std::min(a.s[i], b.s[i]); 228 maxval.s[i] = std::max(a.s[i], b.s[i]); 229 } 230 } 231 232 233 #define OPENCV_HAL_IMPL_CMP_OP(cmp_op) \ 234 template<typename _Tp, int n> \ 235 inline v_reg<_Tp, n> operator cmp_op(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ 236 { \ 237 typedef typename V_TypeTraits<_Tp>::int_type itype; \ 238 v_reg<_Tp, n> c; \ 239 for( int i = 0; i < n; i++ ) \ 240 c.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int((itype)-(int)(a.s[i] cmp_op b.s[i])); \ 241 return c; \ 242 } 243 244 OPENCV_HAL_IMPL_CMP_OP(<) 245 OPENCV_HAL_IMPL_CMP_OP(>) 246 OPENCV_HAL_IMPL_CMP_OP(<=) 247 OPENCV_HAL_IMPL_CMP_OP(>=) 248 OPENCV_HAL_IMPL_CMP_OP(==) 249 OPENCV_HAL_IMPL_CMP_OP(!=) 250 251 #define OPENCV_HAL_IMPL_ADD_SUB_OP(func, bin_op, cast_op, _Tp2) \ 252 template<typename _Tp, int n> \ 253 inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ 254 { \ 255 typedef _Tp2 rtype; \ 256 v_reg<rtype, n> c; \ 257 for( int i = 0; i < n; i++ ) \ 258 c.s[i] = cast_op(a.s[i] bin_op b.s[i]); \ 259 return c; \ 260 } 261 262 OPENCV_HAL_IMPL_ADD_SUB_OP(v_add_wrap, +, (_Tp), _Tp) 263 OPENCV_HAL_IMPL_ADD_SUB_OP(v_sub_wrap, -, (_Tp), _Tp) 264 OPENCV_HAL_IMPL_ADD_SUB_OP(v_absdiff, -, (rtype)std::abs, typename V_TypeTraits<_Tp>::abs_type) 265 266 template<typename _Tp, int n> 267 inline v_reg<_Tp, n> v_invsqrt(const v_reg<_Tp, n>& a) 268 { 269 v_reg<_Tp, n> c; 270 for( int i = 0; i < n; i++ ) 271 c.s[i] = 1.f/std::sqrt(a.s[i]); 272 return c; 273 } 274 275 template<typename _Tp, int n> 276 inline v_reg<_Tp, n> v_magnitude(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) 277 { 278 v_reg<_Tp, n> c; 279 for( int i = 0; i < n; i++ ) 280 c.s[i] = std::sqrt(a.s[i]*a.s[i] + b.s[i]*b.s[i]); 281 return c; 282 } 283 284 285 template<typename _Tp, int n> 286 inline v_reg<_Tp, n> v_sqr_magnitude(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) 287 { 288 v_reg<_Tp, n> c; 289 for( int i = 0; i < n; i++ ) 290 c.s[i] = a.s[i]*a.s[i] + b.s[i]*b.s[i]; 291 return c; 292 } 293 294 template<typename _Tp, int n> 295 inline v_reg<_Tp, n> v_muladd(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, 296 const v_reg<_Tp, n>& c) 297 { 298 v_reg<_Tp, n> d; 299 for( int i = 0; i < n; i++ ) 300 d.s[i] = a.s[i]*b.s[i] + c.s[i]; 301 return d; 302 } 303 304 template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2> 305 v_dotprod(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) 306 { 307 typedef typename V_TypeTraits<_Tp>::w_type w_type; 308 v_reg<w_type, n/2> c; 309 for( int i = 0; i < (n/2); i++ ) 310 c.s[i] = (w_type)a.s[i*2]*b.s[i*2] + (w_type)a.s[i*2+1]*b.s[i*2+1]; 311 return c; 312 } 313 314 template<typename _Tp, int n> inline void v_mul_expand(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, 315 v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& c, 316 v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& d) 317 { 318 typedef typename V_TypeTraits<_Tp>::w_type w_type; 319 for( int i = 0; i < (n/2); i++ ) 320 { 321 c.s[i] = (w_type)a.s[i]*b.s[i]*2; 322 d.s[i] = (w_type)a.s[i+(n/2)]*b.s[i+(n/2)]; 323 } 324 } 325 326 template<typename _Tp, int n> inline void v_hsum(const v_reg<_Tp, n>& a, 327 v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& c) 328 { 329 typedef typename V_TypeTraits<_Tp>::w_type w_type; 330 for( int i = 0; i < (n/2); i++ ) 331 { 332 c.s[i] = (w_type)a.s[i*2] + a.s[i*2+1]; 333 } 334 } 335 336 #define OPENCV_HAL_IMPL_SHIFT_OP(shift_op) \ 337 template<typename _Tp, int n> inline v_reg<_Tp, n> operator shift_op(const v_reg<_Tp, n>& a, int imm) \ 338 { \ 339 v_reg<_Tp, n> c; \ 340 for( int i = 0; i < n; i++ ) \ 341 c.s[i] = (_Tp)(a.s[i] shift_op imm); \ 342 return c; \ 343 } 344 345 OPENCV_HAL_IMPL_SHIFT_OP(<<) 346 OPENCV_HAL_IMPL_SHIFT_OP(>>) 347 348 template<typename _Tp, int n> inline typename V_TypeTraits<_Tp>::sum_type v_reduce_sum(const v_reg<_Tp, n>& a) 349 { 350 typename V_TypeTraits<_Tp>::sum_type c = a.s[0]; 351 for( int i = 1; i < n; i++ ) 352 c += a.s[i]; 353 return c; 354 } 355 356 template<typename _Tp, int n> inline int v_signmask(const v_reg<_Tp, n>& a) 357 { 358 int mask = 0; 359 for( int i = 0; i < n; i++ ) 360 mask |= (V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) < 0) << i; 361 return mask; 362 } 363 364 template<typename _Tp, int n> inline bool v_check_all(const v_reg<_Tp, n>& a) 365 { 366 for( int i = 0; i < n; i++ ) 367 if( V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) >= 0 ) 368 return false; 369 return true; 370 } 371 372 template<typename _Tp, int n> inline bool v_check_any(const v_reg<_Tp, n>& a) 373 { 374 for( int i = 0; i < n; i++ ) 375 if( V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) < 0 ) 376 return true; 377 return false; 378 } 379 380 template<typename _Tp, int n> inline v_reg<_Tp, n> v_select(const v_reg<_Tp, n>& mask, 381 const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) 382 { 383 v_reg<_Tp, n> c; 384 for( int i = 0; i < n; i++ ) 385 c.s[i] = V_TypeTraits<_Tp>::reinterpret_int(mask.s[i]) < 0 ? b.s[i] : a.s[i]; 386 return c; 387 } 388 389 template<typename _Tp, int n> inline void v_expand(const v_reg<_Tp, n>& a, 390 v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& b0, 391 v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& b1) 392 { 393 for( int i = 0; i < (n/2); i++ ) 394 { 395 b0.s[i] = a.s[i]; 396 b1.s[i] = a.s[i+(n/2)]; 397 } 398 } 399 400 template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::int_type, n> 401 v_reinterpret_as_int(const v_reg<_Tp, n>& a) 402 { 403 v_reg<typename V_TypeTraits<_Tp>::int_type, n> c; 404 for( int i = 0; i < n; i++ ) 405 c.s[i] = V_TypeTraits<_Tp>::reinterpret_int(a.s[i]); 406 return c; 407 } 408 409 template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::uint_type, n> 410 v_reinterpret_as_uint(const v_reg<_Tp, n>& a) 411 { 412 v_reg<typename V_TypeTraits<_Tp>::uint_type, n> c; 413 for( int i = 0; i < n; i++ ) 414 c.s[i] = V_TypeTraits<_Tp>::reinterpret_uint(a.s[i]); 415 return c; 416 } 417 418 template<typename _Tp, int n> inline void v_zip( const v_reg<_Tp, n>& a0, const v_reg<_Tp, n>& a1, 419 v_reg<_Tp, n>& b0, v_reg<_Tp, n>& b1 ) 420 { 421 int i; 422 for( i = 0; i < n/2; i++ ) 423 { 424 b0.s[i*2] = a0.s[i]; 425 b0.s[i*2+1] = a1.s[i]; 426 } 427 for( ; i < n; i++ ) 428 { 429 b1.s[i*2-n] = a0.s[i]; 430 b1.s[i*2-n+1] = a1.s[i]; 431 } 432 } 433 434 template<typename _Tp, int n> inline v_reg<_Tp, n> v_load(const _Tp* ptr) 435 { 436 return v_reg<_Tp, n>(ptr); 437 } 438 439 template<typename _Tp, int n> inline v_reg<_Tp, n> v_load_aligned(const _Tp* ptr) 440 { 441 return v_reg<_Tp, n>(ptr); 442 } 443 444 template<typename _Tp, int n> inline void v_load_halves(const _Tp* loptr, const _Tp* hiptr) 445 { 446 v_reg<_Tp, n> c; 447 for( int i = 0; i < n/2; i++ ) 448 { 449 c.s[i] = loptr[i]; 450 c.s[i+n/2] = hiptr[i]; 451 } 452 return c; 453 } 454 455 template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n> v_load_expand(const _Tp* ptr) 456 { 457 typedef typename V_TypeTraits<_Tp>::w_type w_type; 458 v_reg<w_type, n> c; 459 for( int i = 0; i < n; i++ ) 460 { 461 c.s[i] = ptr[i]; 462 } 463 return c; 464 } 465 466 template<typename _Tp, int n> inline v_reg<typename 467 V_TypeTraits<typename V_TypeTraits<_Tp>::w_type>::w_type, n> v_load_expand_q(const _Tp* ptr) 468 { 469 typedef typename V_TypeTraits<typename V_TypeTraits<_Tp>::w_type>::w_type w_type; 470 v_reg<w_type, n> c; 471 for( int i = 0; i < n; i++ ) 472 { 473 c.s[i] = ptr[i]; 474 } 475 return c; 476 } 477 478 template<typename _Tp, int n> inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a, 479 v_reg<_Tp, n>& b, v_reg<_Tp, n>& c) 480 { 481 int i, i3; 482 for( i = i3 = 0; i < n; i++, i3 += 3 ) 483 { 484 a.s[i] = ptr[i3]; 485 b.s[i] = ptr[i3+1]; 486 c.s[i] = ptr[i3+2]; 487 } 488 } 489 490 template<typename _Tp, int n> 491 inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a, 492 v_reg<_Tp, n>& b, v_reg<_Tp, n>& c, 493 v_reg<_Tp, n>& d) 494 { 495 int i, i4; 496 for( i = i4 = 0; i < n; i++, i4 += 4 ) 497 { 498 a.s[i] = ptr[i4]; 499 b.s[i] = ptr[i4+1]; 500 c.s[i] = ptr[i4+2]; 501 d.s[i] = ptr[i4+3]; 502 } 503 } 504 505 template<typename _Tp, int n> 506 inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a, 507 const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c) 508 { 509 int i, i3; 510 for( i = i3 = 0; i < n; i++, i3 += 3 ) 511 { 512 ptr[i3] = a.s[i]; 513 ptr[i3+1] = b.s[i]; 514 ptr[i3+2] = c.s[i]; 515 } 516 } 517 518 template<typename _Tp, int n> inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a, 519 const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c, 520 const v_reg<_Tp, n>& d) 521 { 522 int i, i4; 523 for( i = i4 = 0; i < n; i++, i4 += 4 ) 524 { 525 ptr[i4] = a.s[i]; 526 ptr[i4+1] = b.s[i]; 527 ptr[i4+2] = c.s[i]; 528 ptr[i4+3] = d.s[i]; 529 } 530 } 531 532 template<typename _Tp, int n> 533 inline void v_store(_Tp* ptr, const v_reg<_Tp, n>& a) 534 { 535 for( int i = 0; i < n; i++ ) 536 ptr[i] = a.s[i]; 537 } 538 539 template<typename _Tp, int n> 540 inline void v_store_low(_Tp* ptr, const v_reg<_Tp, n>& a) 541 { 542 for( int i = 0; i < (n/2); i++ ) 543 ptr[i] = a.s[i]; 544 } 545 546 template<typename _Tp, int n> 547 inline void v_store_high(_Tp* ptr, const v_reg<_Tp, n>& a) 548 { 549 for( int i = 0; i < (n/2); i++ ) 550 ptr[i] = a.s[i+(n/2)]; 551 } 552 553 template<typename _Tp, int n> 554 inline void v_store_aligned(_Tp* ptr, const v_reg<_Tp, n>& a) 555 { 556 for( int i = 0; i < n; i++ ) 557 ptr[i] = a.s[i]; 558 } 559 560 template<typename _Tp, int n> 561 inline v_reg<_Tp, n> v_combine_low(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) 562 { 563 v_reg<_Tp, n> c; 564 for( int i = 0; i < (n/2); i++ ) 565 { 566 c.s[i] = a.s[i]; 567 c.s[i+(n/2)] = b.s[i]; 568 } 569 } 570 571 template<typename _Tp, int n> 572 inline v_reg<_Tp, n> v_combine_high(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) 573 { 574 v_reg<_Tp, n> c; 575 for( int i = 0; i < (n/2); i++ ) 576 { 577 c.s[i] = a.s[i+(n/2)]; 578 c.s[i+(n/2)] = b.s[i+(n/2)]; 579 } 580 } 581 582 template<typename _Tp, int n> 583 inline void v_recombine(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, 584 v_reg<_Tp, n>& low, v_reg<_Tp, n>& high) 585 { 586 for( int i = 0; i < (n/2); i++ ) 587 { 588 low.s[i] = a.s[i]; 589 low.s[i+(n/2)] = b.s[i]; 590 high.s[i] = a.s[i+(n/2)]; 591 high.s[i+(n/2)] = b.s[i+(n/2)]; 592 } 593 } 594 595 template<int n> inline v_reg<int, n> v_round(const v_reg<float, n>& a) 596 { 597 v_reg<int, n> c; 598 for( int i = 0; i < n; i++ ) 599 c.s[i] = cvRound(a.s[i]); 600 return c; 601 } 602 603 template<int n> inline v_reg<int, n> v_floor(const v_reg<float, n>& a) 604 { 605 v_reg<int, n> c; 606 for( int i = 0; i < n; i++ ) 607 c.s[i] = cvFloor(a.s[i]); 608 return c; 609 } 610 611 template<int n> inline v_reg<int, n> v_ceil(const v_reg<float, n>& a) 612 { 613 v_reg<int, n> c; 614 for( int i = 0; i < n; i++ ) 615 c.s[i] = cvCeil(a.s[i]); 616 return c; 617 } 618 619 template<int n> inline v_reg<int, n> v_trunc(const v_reg<float, n>& a) 620 { 621 v_reg<int, n> c; 622 for( int i = 0; i < n; i++ ) 623 c.s[i] = (int)(a.s[i]); 624 return c; 625 } 626 627 template<int n> inline v_reg<int, n*2> v_round(const v_reg<double, n>& a) 628 { 629 v_reg<int, n*2> c; 630 for( int i = 0; i < n; i++ ) 631 { 632 c.s[i] = cvRound(a.s[i]); 633 c.s[i+n] = 0; 634 } 635 return c; 636 } 637 638 template<int n> inline v_reg<int, n*2> v_floor(const v_reg<double, n>& a) 639 { 640 v_reg<int, n> c; 641 for( int i = 0; i < n; i++ ) 642 { 643 c.s[i] = cvFloor(a.s[i]); 644 c.s[i+n] = 0; 645 } 646 return c; 647 } 648 649 template<int n> inline v_reg<int, n*2> v_ceil(const v_reg<double, n>& a) 650 { 651 v_reg<int, n> c; 652 for( int i = 0; i < n; i++ ) 653 { 654 c.s[i] = cvCeil(a.s[i]); 655 c.s[i+n] = 0; 656 } 657 return c; 658 } 659 660 template<int n> inline v_reg<int, n*2> v_trunc(const v_reg<double, n>& a) 661 { 662 v_reg<int, n> c; 663 for( int i = 0; i < n; i++ ) 664 { 665 c.s[i] = cvCeil(a.s[i]); 666 c.s[i+n] = 0; 667 } 668 return c; 669 } 670 671 template<int n> inline v_reg<float, n> v_cvt_f32(const v_reg<int, n>& a) 672 { 673 v_reg<float, n> c; 674 for( int i = 0; i < n; i++ ) 675 c.s[i] = (float)a.s[i]; 676 return c; 677 } 678 679 template<int n> inline v_reg<double, n> v_cvt_f64(const v_reg<int, n*2>& a) 680 { 681 v_reg<double, n> c; 682 for( int i = 0; i < n; i++ ) 683 c.s[i] = (double)a.s[i]; 684 return c; 685 } 686 687 template<int n> inline v_reg<double, n> v_cvt_f64(const v_reg<float, n*2>& a) 688 { 689 v_reg<double, n> c; 690 for( int i = 0; i < n; i++ ) 691 c.s[i] = (double)a.s[i]; 692 return c; 693 } 694 695 template<typename _Tp> 696 inline void v_transpose4x4( v_reg<_Tp, 4>& a0, const v_reg<_Tp, 4>& a1, 697 const v_reg<_Tp, 4>& a2, const v_reg<_Tp, 4>& a3, 698 v_reg<_Tp, 4>& b0, v_reg<_Tp, 4>& b1, 699 v_reg<_Tp, 4>& b2, v_reg<_Tp, 4>& b3 ) 700 { 701 b0 = v_reg<_Tp, 4>(a0.s[0], a1.s[0], a2.s[0], a3.s[0]); 702 b1 = v_reg<_Tp, 4>(a0.s[1], a1.s[1], a2.s[1], a3.s[1]); 703 b2 = v_reg<_Tp, 4>(a0.s[2], a1.s[2], a2.s[2], a3.s[2]); 704 b3 = v_reg<_Tp, 4>(a0.s[3], a1.s[3], a2.s[3], a3.s[3]); 705 } 706 707 typedef v_reg<uchar, 16> v_uint8x16; 708 typedef v_reg<schar, 16> v_int8x16; 709 typedef v_reg<ushort, 8> v_uint16x8; 710 typedef v_reg<short, 8> v_int16x8; 711 typedef v_reg<unsigned, 4> v_uint32x4; 712 typedef v_reg<int, 4> v_int32x4; 713 typedef v_reg<float, 4> v_float32x4; 714 typedef v_reg<float, 8> v_float32x8; 715 typedef v_reg<double, 2> v_float64x2; 716 typedef v_reg<uint64, 2> v_uint64x2; 717 typedef v_reg<int64, 2> v_int64x2; 718 719 #define OPENCV_HAL_IMPL_C_INIT(_Tpvec, _Tp, suffix) \ 720 inline _Tpvec v_setzero_##suffix() { return _Tpvec::zero(); } \ 721 inline _Tpvec v_setall_##suffix(_Tp val) { return _Tpvec::all(val); } \ 722 template<typename _Tp0, int n0> inline _Tpvec \ 723 v_reinterpret_as_##suffix(const v_reg<_Tp0, n0>& a) \ 724 { return a.template reinterpret_as<_Tp, _Tpvec::nlanes>(a); } 725 726 OPENCV_HAL_IMPL_C_INIT(v_uint8x16, uchar, u8) 727 OPENCV_HAL_IMPL_C_INIT(v_int8x16, schar, s8) 728 OPENCV_HAL_IMPL_C_INIT(v_uint16x8, ushort, u16) 729 OPENCV_HAL_IMPL_C_INIT(v_int16x8, short, s16) 730 OPENCV_HAL_IMPL_C_INIT(v_uint32x4, unsigned, u32) 731 OPENCV_HAL_IMPL_C_INIT(v_int32x4, int, s32) 732 OPENCV_HAL_IMPL_C_INIT(v_float32x4, float, f32) 733 OPENCV_HAL_IMPL_C_INIT(v_float64x2, double, f64) 734 OPENCV_HAL_IMPL_C_INIT(v_uint64x2, uint64, u64) 735 OPENCV_HAL_IMPL_C_INIT(v_uint64x2, int64, s64) 736 737 #define OPENCV_HAL_IMPL_C_SHIFT(_Tpvec, _Tp) \ 738 template<int n> inline _Tpvec v_shl(const _Tpvec& a) \ 739 { return a << n; } \ 740 template<int n> inline _Tpvec v_shr(const _Tpvec& a) \ 741 { return a >> n; } \ 742 template<int n> inline _Tpvec v_rshr(const _Tpvec& a) \ 743 { \ 744 _Tpvec c; \ 745 for( int i = 0; i < _Tpvec::nlanes; i++ ) \ 746 c.s[i] = (_Tp)((a.s[i] + ((_Tp)1 << (n - 1))) >> n); \ 747 return c; \ 748 } 749 750 OPENCV_HAL_IMPL_C_SHIFT(v_uint16x8, ushort) 751 OPENCV_HAL_IMPL_C_SHIFT(v_int16x8, short) 752 OPENCV_HAL_IMPL_C_SHIFT(v_uint32x4, unsigned) 753 OPENCV_HAL_IMPL_C_SHIFT(v_int32x4, int) 754 OPENCV_HAL_IMPL_C_SHIFT(v_uint64x2, uint64) 755 OPENCV_HAL_IMPL_C_SHIFT(v_int64x2, int64) 756 757 758 #define OPENCV_HAL_IMPL_C_PACK(_Tpvec, _Tp, _Tpnvec, _Tpn, pack_suffix) \ 759 inline _Tpnvec v_##pack_suffix(const _Tpvec& a, const _Tpvec& b) \ 760 { \ 761 _Tpnvec c; \ 762 for( int i = 0; i < _Tpvec::nlanes; i++ ) \ 763 { \ 764 c.s[i] = saturate_cast<_Tpn>(a.s[i]); \ 765 c.s[i+_Tpvec::nlanes] = saturate_cast<_Tpn>(b.s[i]); \ 766 } \ 767 return c; \ 768 } \ 769 template<int n> inline _Tpnvec v_rshr_##pack_suffix(const _Tpvec& a, const _Tpvec& b) \ 770 { \ 771 _Tpnvec c; \ 772 for( int i = 0; i < _Tpvec::nlanes; i++ ) \ 773 { \ 774 c.s[i] = saturate_cast<_Tpn>((a.s[i] + ((_Tp)1 << (n - 1))) >> n); \ 775 c.s[i+_Tpvec::nlanes] = saturate_cast<_Tpn>((b.s[i] + ((_Tp)1 << (n - 1))) >> n); \ 776 } \ 777 return c; \ 778 } \ 779 inline void v_##pack_suffix##_store(_Tpn* ptr, const _Tpvec& a) \ 780 { \ 781 for( int i = 0; i < _Tpvec::nlanes; i++ ) \ 782 ptr[i] = saturate_cast<_Tpn>(a.s[i]); \ 783 } \ 784 template<int n> inline void v_rshr_##pack_suffix##_store(_Tpn* ptr, const _Tpvec& a) \ 785 { \ 786 for( int i = 0; i < _Tpvec::nlanes; i++ ) \ 787 ptr[i] = saturate_cast<_Tpn>((a.s[i] + ((_Tp)1 << (n - 1))) >> n); \ 788 } 789 790 OPENCV_HAL_IMPL_C_PACK(v_uint16x8, ushort, v_uint8x16, uchar, pack) 791 OPENCV_HAL_IMPL_C_PACK(v_int16x8, short, v_int8x16, schar, pack) 792 OPENCV_HAL_IMPL_C_PACK(v_int16x8, short, v_uint8x16, uchar, pack_u) 793 OPENCV_HAL_IMPL_C_PACK(v_uint32x4, unsigned, v_uint16x8, ushort, pack) 794 OPENCV_HAL_IMPL_C_PACK(v_int32x4, int, v_int16x8, short, pack) 795 OPENCV_HAL_IMPL_C_PACK(v_int32x4, int, v_uint16x8, ushort, pack_u) 796 OPENCV_HAL_IMPL_C_PACK(v_uint64x2, uint64, v_uint32x4, unsigned, pack) 797 OPENCV_HAL_IMPL_C_PACK(v_int64x2, int64, v_int32x4, int, pack) 798 799 inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0, 800 const v_float32x4& m1, const v_float32x4& m2, 801 const v_float32x4& m3) 802 { 803 return v_float32x4(v.s[0]*m0.s[0] + v.s[1]*m1.s[0] + v.s[2]*m2.s[0] + v.s[3]*m3.s[0], 804 v.s[0]*m0.s[1] + v.s[1]*m1.s[1] + v.s[2]*m2.s[1] + v.s[3]*m3.s[1], 805 v.s[0]*m0.s[2] + v.s[1]*m1.s[2] + v.s[2]*m2.s[2] + v.s[3]*m3.s[2], 806 v.s[0]*m0.s[3] + v.s[1]*m1.s[3] + v.s[2]*m2.s[3] + v.s[3]*m3.s[3]); 807 } 808 809 } 810 811 #endif 812